# In [1]:
import os
import sys
from pathlib import Path
from clang.cindex import Config
import clang.cindex

def find_libclang():
    """Return the path of the first existing ``libclang.dll`` candidate.

    The candidates are checked in a fixed order: locations under
    ``sys.prefix`` first, then the default LLVM install directory.

    Returns:
        The path as a string, or ``None`` when no candidate exists.
    """
    prefix = Path(sys.prefix)
    candidates = (
        # Anaconda layouts
        prefix / 'Library' / 'bin' / 'libclang.dll',
        prefix / 'clang' / 'native' / 'libclang.dll',
        # pip-installed package
        prefix / 'Lib' / 'site-packages' / 'clang' / 'native' / 'libclang.dll',
        # System-wide LLVM install
        Path('C:/Program Files/LLVM/bin/libclang.dll'),
    )
    # First hit wins; fall back to None when nothing exists on disk.
    return next((str(candidate) for candidate in candidates if candidate.exists()), None)

# Locate libclang.dll and hand it to clang.cindex; without it nothing below
# can work, so the script exits with status 1 when the lookup fails.
libclang_path = find_libclang()
if not libclang_path:
    print("Warning: Could not find libclang.dll in any of the expected locations")
    print("Please install LLVM and make sure it's in your PATH")
    sys.exit(1)
Config.set_library_file(libclang_path)

# Import required libraries
import re
import json
import csv
import logging
from typing import Dict, List, Set, Tuple, Optional
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import concurrent.futures
from functools import partial
import numpy as np

# For AST parsing
from clang.cindex import Index, CursorKind, TokenKind, TranslationUnit
import clang.cindex

# For similarity metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from datasketch import MinHash, MinHashLSH
from zss import simple_distance, Node

# Configure the root logger: INFO and above, timestamped, written both to
# 'plagiarism_detector.log' and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('plagiarism_detector.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class TokenProcessor:
    """Helpers that filter, weight, normalise and scan lexer tokens.

    Tokens enter as ``(kind, text)`` pairs and leave ``filter_tokens`` as
    ``(kind, text, significance)`` triples, which is the shape the other two
    helpers consume.
    """

    # Weight assigned to each token kind (kinds not listed here get 0.5)
    TOKEN_SIGNIFICANCE = {
        TokenKind.KEYWORD: 1.0,      # High significance
        TokenKind.IDENTIFIER: 0.8,
        TokenKind.LITERAL: 0.7,
        TokenKind.PUNCTUATION: 0.2,   # Low significance
        TokenKind.COMMENT: 0.1       # Very low significance
    }

    # Token spellings that are dropped entirely
    IGNORED_TOKENS = {';', '{', '}', '(', ')', ','}

    @classmethod
    def filter_tokens(cls, tokens: List[Tuple[TokenKind, str]]) -> List[Tuple[TokenKind, str, float]]:
        """Drop ignored spellings and attach a significance weight to the rest."""
        return [
            (kind, text, cls.TOKEN_SIGNIFICANCE.get(kind, 0.5))
            for kind, text in tokens
            if text not in cls.IGNORED_TOKENS
        ]

    @staticmethod
    def normalize_identifiers(tokens: List[Tuple[TokenKind, str, float]]) -> List[Tuple[TokenKind, str, float]]:
        """Rename identifiers to ``ID_0``, ``ID_1``, ... in order of first appearance.

        Non-identifier tokens pass through unchanged; repeated identifiers
        map to the same placeholder.
        """
        placeholders = {}
        result = []

        for kind, text, weight in tokens:
            if kind == TokenKind.IDENTIFIER:
                # setdefault only stores the new name the first time a
                # spelling is seen, so numbering follows first appearance.
                text = placeholders.setdefault(text, f'ID_{len(placeholders)}')
            result.append((kind, text, weight))

        return result

    @staticmethod
    def extract_cpp_patterns(tokens):
        """Collect ``(index, text)`` hits for a few C++ usage categories.

        Each token lands in at most one category; categories are tested in
        the order memory management, template usage (keywords only), STL
        container names, pointer-related spellings.
        """
        memory_words = ['new', 'delete', 'malloc', 'free', 'alloc']
        template_words = ['template', 'typename', 'class']
        stl_words = ['vector', 'map', 'set', 'list', 'queue', 'stack', 'array', 'deque']
        pointer_words = ['*', '&', '->', 'nullptr', 'NULL']

        patterns = {
            'memory_management': [],
            'template_usage': [],
            'stl_usage': [],
            'pointer_usage': []
        }

        for position, (kind, text, _weight) in enumerate(tokens):
            if text in memory_words:
                bucket = 'memory_management'
            elif text in template_words and kind == TokenKind.KEYWORD:
                bucket = 'template_usage'
            elif text in stl_words:
                bucket = 'stl_usage'
            elif text in pointer_words:
                bucket = 'pointer_usage'
            else:
                continue
            patterns[bucket].append((position, text))

        return patterns

class ASTAnalyzer:
    """Parse one C/C++ file with libclang and derive structural features.

    ``analyze`` fills two graphs on the instance:

    * ``ast_graph`` - one node per AST cursor, with ``kind``, ``spelling``,
      ``type``, ``depth`` and subtree ``hash`` attributes;
    * ``cfg_graph`` - a coarse control-flow graph built from the compound
      statements of every function / method cursor found in the
      translation unit.
    """

    def __init__(self, file_path: str):
        """Parse ``file_path`` as C++14.

        Any exception raised by ``Index.parse`` is logged and re-raised.
        """
        self.file_path = file_path
        self.index = Index.create()
        self.ast_graph = nx.DiGraph()
        self.subtree_hashes = {}
        self.cfg_graph = nx.DiGraph()  # Control Flow Graph

        try:
            self.tu = self.index.parse(
                file_path,
                args=['-x', 'c++', '-std=c++14'],
                options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD
            )
        except Exception as e:
            logging.error(f"Failed to parse {file_path}: {str(e)}")
            raise

    @staticmethod
    def _is_loop(node) -> bool:
        """Return True for while / for / do statement cursors."""
        return node.kind in (CursorKind.WHILE_STMT, CursorKind.FOR_STMT, CursorKind.DO_STMT)

    @staticmethod
    def _combine_hashes(node, child_hashes) -> int:
        """Hash a node's kind and spelling together with its children's hashes.

        Child hashes are sorted first, so the result does not depend on the
        order of the children.
        """
        node_str = f"{node.kind}_{node.spelling}"
        return hash(node_str + '_'.join(map(str, sorted(child_hashes))))

    def _hash_subtree(self, node, memo=None) -> int:
        """Compute a hash for the subtree rooted at ``node``.

        Args:
            node: cursor at the root of the subtree.
            memo: optional cache shared across calls.

        NOTE(review): the cache is keyed by ``node.hash`` rather than by the
        cursor object because some clang.cindex versions appear to define
        ``__eq__`` without ``__hash__`` on Cursor - confirm against the
        installed bindings.
        """
        if memo is None:
            memo = {}

        key = node.hash
        if key in memo:
            return memo[key]

        child_hashes = [self._hash_subtree(child, memo) for child in node.get_children()]
        result = self._combine_hashes(node, child_hashes)

        memo[key] = result
        return result

    def _to_zss_tree(self, node) -> Node:
        """Convert AST node to format suitable for Zhang-Shasha algorithm."""
        children = [self._to_zss_tree(child) for child in node.get_children()]
        return Node(f"{node.kind}_{node.spelling}", children)

    def _add_subtree(self, node, parent_id, depth):
        """Add ``node`` and its descendants to ``ast_graph`` in one pass.

        Returns ``(node_id, subtree_hash)``. The hash is assembled bottom-up
        from the children's hashes during the same traversal, so no subtree
        is hashed more than once.
        """
        node_id = f"{node.kind}_{depth}_{len(self.ast_graph)}"

        self.ast_graph.add_node(
            node_id,
            kind=node.kind.name,
            spelling=node.spelling,
            type=str(node.type.spelling if node.type else ""),
            depth=depth
        )

        if parent_id:
            self.ast_graph.add_edge(parent_id, node_id)

        child_hashes = [
            self._add_subtree(child, node_id, depth + 1)[1]
            for child in node.get_children()
        ]
        subtree_hash = self._combine_hashes(node, child_hashes)
        self.ast_graph.nodes[node_id]['hash'] = subtree_hash

        return node_id, subtree_hash

    def _process_node(self, node, parent_id=None, depth=0) -> str:
        """Process an AST node and build the graph structure.

        Returns the graph id assigned to ``node``.
        """
        node_id, _ = self._add_subtree(node, parent_id, depth)
        return node_id

    def _build_control_flow_graph(self, node, parent_block=None):
        """Build a control flow graph from AST.

        Function and method cursors get an entry block and have their
        compound-statement bodies expanded. Every other cursor is searched
        recursively, so calling this on the translation-unit cursor reaches
        the functions declared inside it.

        NOTE(review): entry blocks are named after ``node.spelling`` only, so
        overloads and redeclarations share one entry node - confirm this is
        acceptable.
        """
        if node.kind in [CursorKind.FUNCTION_DECL, CursorKind.CXX_METHOD]:
            # Create entry block for function
            entry_block = f"entry_{node.spelling}"
            self.cfg_graph.add_node(entry_block, type="entry", ast_node=node)

            # Process function body
            for child in node.get_children():
                if child.kind == CursorKind.COMPOUND_STMT:
                    self._process_compound_stmt(child, entry_block)
            return

        # Not a function: keep descending (namespaces, classes, the TU itself)
        for child in node.get_children():
            self._build_control_flow_graph(child, parent_block)

    def _process_compound_stmt(self, node, parent_block):
        """Process a compound statement for CFG.

        Statements are chained sequentially starting from ``parent_block``.
        ``if`` statements expand into condition / then / else / merge blocks
        and ``while`` / ``for`` statements into header / body / exit blocks.
        Only branches and loop bodies that are compound statements are
        expanded; every other statement becomes a single "statement" block.
        """
        current_block = parent_block

        for child in node.get_children():
            if child.kind == CursorKind.IF_STMT:
                # Create condition block
                cond_block = f"if_cond_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(cond_block, type="condition", ast_node=child)
                self.cfg_graph.add_edge(current_block, cond_block)

                # The first compound child is taken as the then-branch, any
                # later one as the else-branch.
                then_block = None
                else_block = None

                for branch in child.get_children():
                    if branch.kind == CursorKind.COMPOUND_STMT:
                        if not then_block:
                            then_block = f"then_{len(self.cfg_graph)}"
                            self.cfg_graph.add_node(then_block, type="then", ast_node=branch)
                            self.cfg_graph.add_edge(cond_block, then_block)
                            self._process_compound_stmt(branch, then_block)
                        else:
                            else_block = f"else_{len(self.cfg_graph)}"
                            self.cfg_graph.add_node(else_block, type="else", ast_node=branch)
                            self.cfg_graph.add_edge(cond_block, else_block)
                            self._process_compound_stmt(branch, else_block)

                # Create merge block
                merge_block = f"merge_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(merge_block, type="merge")

                if then_block:
                    self.cfg_graph.add_edge(then_block, merge_block)
                if else_block:
                    self.cfg_graph.add_edge(else_block, merge_block)
                else:
                    # No else branch: the condition can fall straight through
                    self.cfg_graph.add_edge(cond_block, merge_block)

                current_block = merge_block

            elif child.kind == CursorKind.WHILE_STMT or child.kind == CursorKind.FOR_STMT:
                # Create loop header block
                loop_header = f"loop_header_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(loop_header, type="loop_header", ast_node=child)
                self.cfg_graph.add_edge(current_block, loop_header)

                # Create loop body block
                loop_body = f"loop_body_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(loop_body, type="loop_body")
                self.cfg_graph.add_edge(loop_header, loop_body)

                # Process loop body
                for branch in child.get_children():
                    if branch.kind == CursorKind.COMPOUND_STMT:
                        self._process_compound_stmt(branch, loop_body)

                # Loop back to header
                self.cfg_graph.add_edge(loop_body, loop_header)

                # Create exit block
                loop_exit = f"loop_exit_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(loop_exit, type="loop_exit")
                self.cfg_graph.add_edge(loop_header, loop_exit)

                current_block = loop_exit
            else:
                # Regular statement
                stmt_block = f"stmt_{len(self.cfg_graph)}"
                self.cfg_graph.add_node(stmt_block, type="statement", ast_node=child)
                self.cfg_graph.add_edge(current_block, stmt_block)
                current_block = stmt_block

    def _identify_loops(self, node, loops=None):
        """Return every loop cursor in the subtree of ``node``, in pre-order."""
        if loops is None:
            loops = []

        if self._is_loop(node):
            loops.append(node)

        for child in node.get_children():
            self._identify_loops(child, loops)

        return loops

    def _calculate_cyclomatic_complexity(self):
        """Calculate cyclomatic complexity from the CFG."""
        if not self.cfg_graph:
            return 1  # Default for empty graph

        # M = E - N + 2P where E is edges, N is nodes, P is connected components
        edges = len(self.cfg_graph.edges())
        nodes = len(self.cfg_graph.nodes())
        components = nx.number_connected_components(self.cfg_graph.to_undirected())

        return edges - nodes + 2 * components

    def _loop_nesting_depth(self, node) -> int:
        """Return the deepest chain of nested loops within ``node``'s subtree.

        ``node`` itself counts when it is a loop.
        """
        deepest_below = max(
            (self._loop_nesting_depth(child) for child in node.get_children()),
            default=0
        )
        return deepest_below + (1 if self._is_loop(node) else 0)

    def _max_loop_nesting(self, loops) -> int:
        """Return the maximum loop nesting depth over ``loops`` (0 if empty)."""
        return max((self._loop_nesting_depth(loop) for loop in loops), default=0)

    @staticmethod
    def _complexity_label(max_nesting: int) -> str:
        """Map a loop nesting depth to a big-O style label."""
        if max_nesting <= 0:
            return "O(1)"
        if max_nesting == 1:
            return "O(n)"
        if max_nesting == 2:
            return "O(n²)"
        if max_nesting == 3:
            return "O(n³)"
        return f"O(n^{max_nesting})"

    def _estimate_time_complexity(self, loops):
        """Estimate time complexity based on loop nesting.

        Nesting is measured by walking down from each loop through its
        children. Walking up via ``semantic_parent`` does not work here: for
        statement cursors it leads to the enclosing declaration, not to the
        enclosing loop statement.
        """
        return self._complexity_label(self._max_loop_nesting(loops))

    def analyze(self) -> Dict:
        """Perform complete AST analysis with subtree matching.

        Returns a dict with AST metrics, the zss tree, graph structure
        figures, control-flow figures (including the CFG itself under
        ``control_flow['cfg']``) and loop-based complexity figures. Any
        exception is logged and re-raised.
        """
        try:
            self._process_node(self.tu.cursor)
            zss_tree = self._to_zss_tree(self.tu.cursor)

            # Build control flow graph
            self._build_control_flow_graph(self.tu.cursor)

            # Identify loops for complexity analysis
            loops = self._identify_loops(self.tu.cursor)

            # Calculate structural metrics
            max_depth = max(d['depth'] for _, d in self.ast_graph.nodes(data=True))
            node_types = {d['kind'] for _, d in self.ast_graph.nodes(data=True)}
            subtree_hashes = {n: d['hash'] for n, d in self.ast_graph.nodes(data=True)}

            # Calculate complexity metrics
            cyclomatic_complexity = self._calculate_cyclomatic_complexity()
            max_nesting = self._max_loop_nesting(loops)  # 0 when there are no loops
            time_complexity = self._complexity_label(max_nesting)

            edge_count = len(self.ast_graph.edges())

            return {
                'ast_depth': max_depth,
                'node_count': len(self.ast_graph),
                'node_types': list(node_types),
                'subtree_hashes': subtree_hashes,
                'zss_tree': zss_tree,
                'graph_structure': {
                    'edges': edge_count,
                    'avg_branching': edge_count / max(1, len(self.ast_graph.nodes())),
                    'leaf_nodes': sum(1 for n in self.ast_graph.nodes()
                                    if self.ast_graph.out_degree(n) == 0)
                },
                'control_flow': {
                    'cfg_nodes': len(self.cfg_graph.nodes()),
                    'cfg_edges': len(self.cfg_graph.edges()),
                    'cyclomatic_complexity': cyclomatic_complexity,
                    # The graph itself, for graph-level comparison by consumers
                    'cfg': self.cfg_graph
                },
                'complexity': {
                    'time_complexity': time_complexity,
                    'loop_count': len(loops),
                    'max_nesting': max_nesting
                }
            }

        except Exception as e:
            logging.error(f"Error in AST analysis for {self.file_path}: {str(e)}")
            raise
     
        
class ScalableSimilarityAnalyzer:
    """MinHash/LSH candidate lookup plus detailed pairwise similarity metrics.

    Submissions are indexed by ``add_submission``; ``find_similar`` returns
    LSH candidates; the static ``compute_*`` helpers score one pair.
    """

    def __init__(self, num_perm: int = 128, threshold: float = 0.8):
        """Create an empty LSH index.

        Args:
            num_perm: number of permutations passed to MinHash / MinHashLSH.
            threshold: threshold passed to MinHashLSH.
        """
        self.num_perm = num_perm
        self.threshold = threshold
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.minhashes = {}

    def _create_minhash(self, tokens: List[str]) -> "MinHash":
        """Create MinHash signature for a sequence of tokens."""
        m = MinHash(num_perm=self.num_perm)
        for token in tokens:
            m.update(token.encode('utf-8'))
        return m

    def add_submission(self, file_id: str, tokens: List[str]):
        """Add a submission to the LSH index."""
        minhash = self._create_minhash(tokens)
        self.minhashes[file_id] = minhash
        self.lsh.insert(file_id, minhash)

    def find_similar(self, file_id: str) -> Set[str]:
        """Find similar submissions using LSH.

        Raises ``KeyError`` when ``file_id`` was never added.
        """
        return self.lsh.query(self.minhashes[file_id])

    @staticmethod
    def _ratio_similarity(a: float, b: float) -> float:
        """Return ``1 - |a - b| / max(a, b)``, or 1.0 when the larger value is not positive."""
        largest = max(a, b)
        if largest <= 0:
            return 1.0
        return 1 - abs(a - b) / largest

    @staticmethod
    def compute_tree_distance(tree1: "Node", tree2: "Node") -> float:
        """Compute a normalized tree-edit similarity in ``[0, 1]``.

        Returns 0.0 without running the edit-distance algorithm when the two
        tree sizes differ by more than half of the larger one.
        """
        def tree_size(node):
            # Cache the size on the node so repeated comparisons of the same
            # tree do not walk it again.
            if hasattr(node, '_size'):
                return node._size
            size = 1 + sum(tree_size(child) for child in node.children)
            node._size = size
            return size

        size1 = tree_size(tree1)
        size2 = tree_size(tree2)
        max_size = max(size1, size2)

        # If sizes are very different, we can short-circuit
        if abs(size1 - size2) / max_size > 0.5:
            return 0.0

        distance = simple_distance(tree1, tree2)

        # The edit distance can exceed the larger tree's size (nodes that
        # cannot be mapped cost a delete plus an insert), so clamp at 0.
        return max(0.0, 1 - (distance / max_size))

    @staticmethod
    def compute_cfg_similarity(cfg1: "nx.DiGraph", cfg2: "nx.DiGraph") -> float:
        """Compute similarity between control flow graphs.

        Combines node-count ratio (0.3), edge-count ratio (0.3) and the
        overlap of the per-type node counts (0.4). Returns 0.0 when either
        graph has no nodes.
        """
        if not cfg1.nodes() or not cfg2.nodes():
            return 0.0

        # Compare basic graph properties
        nodes_sim = min(len(cfg1), len(cfg2)) / max(len(cfg1), len(cfg2))

        edge_counts = (len(cfg1.edges()), len(cfg2.edges()))
        edges_sim = min(edge_counts) / max(edge_counts) if max(edge_counts) > 0 else 1.0

        # Compare node types distribution
        def type_counts(cfg):
            counts = {}
            for _, data in cfg.nodes(data=True):
                node_type = data.get('type', 'unknown')
                counts[node_type] = counts.get(node_type, 0) + 1
            return counts

        types1 = type_counts(cfg1)
        types2 = type_counts(cfg2)

        # Weighted Jaccard over the type counts
        all_types = set(types1) | set(types2)
        if all_types:
            shared = sum(min(types1.get(t, 0), types2.get(t, 0)) for t in all_types)
            total = sum(max(types1.get(t, 0), types2.get(t, 0)) for t in all_types)
            type_sim = shared / total
        else:
            type_sim = 0

        # Weighted combination
        return 0.3 * nodes_sim + 0.3 * edges_sim + 0.4 * type_sim

    @staticmethod
    def compute_complexity_similarity(comp1: Dict, comp2: Dict) -> float:
        """Compare algorithmic complexity signatures.

        Both dicts must provide ``time_complexity``, ``loop_count`` and
        ``max_nesting``.
        """
        time_match = comp1['time_complexity'] == comp2['time_complexity']

        loop_sim = ScalableSimilarityAnalyzer._ratio_similarity(
            comp1['loop_count'], comp2['loop_count'])
        nesting_sim = ScalableSimilarityAnalyzer._ratio_similarity(
            comp1['max_nesting'], comp2['max_nesting'])

        # Weighted combination
        return 0.5 * (1 if time_match else 0) + 0.25 * loop_sim + 0.25 * nesting_sim

    @staticmethod
    def compute_weighted_similarity(submission1: Dict, submission2: Dict) -> Dict:
        """Compute comprehensive similarity with weighted metrics.

        Each submission must carry ``ast_features`` with ``zss_tree``,
        ``subtree_hashes`` and ``graph_structure``; ``control_flow['cfg']``
        and ``complexity`` are optional. Degenerate inputs (no subtree
        hashes, zero branching) are scored instead of raising
        ``ZeroDivisionError``.
        """
        analyzer = ScalableSimilarityAnalyzer
        features1 = submission1['ast_features']
        features2 = submission2['ast_features']

        # Tree edit distance similarity
        tree_sim = analyzer.compute_tree_distance(features1['zss_tree'], features2['zss_tree'])

        # Subtree hash similarity (Jaccard); 0 when neither side has hashes
        hashes1 = set(features1['subtree_hashes'].values())
        hashes2 = set(features2['subtree_hashes'].values())
        all_hashes = hashes1 | hashes2
        hash_sim = len(hashes1 & hashes2) / len(all_hashes) if all_hashes else 0.0

        # Structure similarity
        struct_sim = analyzer._ratio_similarity(
            features1['graph_structure']['avg_branching'],
            features2['graph_structure']['avg_branching']
        )

        # Control flow similarity; a missing graph scores like an empty one
        cfg1 = features1.get('control_flow', {}).get('cfg')
        cfg2 = features2.get('control_flow', {}).get('cfg')
        if cfg1 is None or cfg2 is None:
            cfg_sim = 0.0
        else:
            cfg_sim = analyzer.compute_cfg_similarity(cfg1, cfg2)

        # Complexity similarity
        no_complexity = {'time_complexity': 'O(1)', 'loop_count': 0, 'max_nesting': 0}
        complexity_sim = analyzer.compute_complexity_similarity(
            features1.get('complexity', no_complexity),
            features2.get('complexity', no_complexity)
        )

        # Weighted combination with enhanced weights
        weights = {
            'tree': 0.25,
            'hash': 0.25,
            'structure': 0.1,
            'cfg': 0.25,
            'complexity': 0.15
        }

        overall_sim = (
            weights['tree'] * tree_sim +
            weights['hash'] * hash_sim +
            weights['structure'] * struct_sim +
            weights['cfg'] * cfg_sim +
            weights['complexity'] * complexity_sim
        )

        return {
            'tree_edit_similarity': tree_sim,
            'subtree_hash_similarity': hash_sim,
            'structure_similarity': struct_sim,
            'cfg_similarity': cfg_sim,
            'complexity_similarity': complexity_sim,
            'overall_similarity': overall_sim
        }

class EnhancedPlagiarismDetector:
    """Analyse a set of source files and report pairs that look alike.

    Typical use: ``analyze_files(paths)`` followed by ``find_similarities()``;
    matches end up in ``comparison_results``.
    """

    def __init__(self, output_dir: str = 'plagiarism_results',
                 similarity_threshold: float = 0.8,
                 max_workers: Optional[int] = None):
        """Create the detector and its output directory.

        Args:
            output_dir: directory for result files; created if missing,
                including any missing parent directories.
            similarity_threshold: minimum overall similarity for a pair to be
                reported; also used as the LSH threshold.
            max_workers: worker thread count; defaults to ``os.cpu_count()``.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.similarity_threshold = similarity_threshold
        self.max_workers = max_workers or os.cpu_count()

        self.submissions = {}
        self.comparison_results = []
        self.lsh_analyzer = ScalableSimilarityAnalyzer(threshold=similarity_threshold)

        # Set up logging for this instance
        self.logger = logging.getLogger(f"PlagiarismDetector_{id(self)}")

    def _analyze_single_file(self, file_path: str) -> Optional[Dict]:
        """Analyze a single file with comprehensive error handling.

        Returns the feature dict for the file, or ``None`` when anything
        fails (the error is logged with its traceback). On success the file
        is also inserted into the LSH index.

        NOTE(review): this runs on worker threads and mutates the shared LSH
        index without synchronisation - confirm that is safe for the
        installed datasketch version.
        """
        try:
            self.logger.info(f"Analyzing {file_path}")

            # Parse and analyze AST
            ast_analyzer = ASTAnalyzer(file_path)
            ast_features = ast_analyzer.analyze()

            # Process tokens
            tokens = [(t.kind, t.spelling) for t in ast_analyzer.tu.cursor.get_tokens()]
            filtered_tokens = TokenProcessor.filter_tokens(tokens)
            normalized_tokens = TokenProcessor.normalize_identifiers(filtered_tokens)

            # Extract C++ specific patterns from the tokens *before* identifier
            # normalisation: afterwards identifiers such as 'vector' or
            # 'malloc' have been renamed to ID_n and can no longer match.
            # Normalisation keeps positions, so the indices stay comparable.
            cpp_patterns = TokenProcessor.extract_cpp_patterns(filtered_tokens)

            results = {
                'file_path': file_path,
                'ast_features': ast_features,
                'token_features': {
                    'original': tokens,
                    'filtered': filtered_tokens,
                    'normalized': normalized_tokens,
                    'cpp_patterns': cpp_patterns
                },
                'analysis_timestamp': datetime.now().isoformat()
            }

            # Add to LSH index
            token_strings = [t[1] for t in normalized_tokens]
            self.lsh_analyzer.add_submission(file_path, token_strings)

            return results

        except Exception as e:
            self.logger.error(f"Error analyzing {file_path}: {str(e)}", exc_info=True)
            return None

    def analyze_files(self, file_paths: List[str]):
        """Analyze multiple files in parallel.

        Successful results are stored in ``self.submissions`` keyed by file
        path; failures are logged and skipped.
        """
        self.logger.info(f"Starting analysis of {len(file_paths)} files")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_file = {executor.submit(self._analyze_single_file, fp): fp
                             for fp in file_paths}

            for future in concurrent.futures.as_completed(future_to_file):
                file_path = future_to_file[future]
                try:
                    result = future.result()
                    if result:
                        self.submissions[file_path] = result
                except Exception as e:
                    self.logger.error(f"Analysis failed for {file_path}: {str(e)}")

    def find_similarities(self):
        """Find similar submissions using LSH and detailed comparison.

        Rebuilds ``self.comparison_results`` with one entry per pair whose
        overall similarity reaches ``self.similarity_threshold``.
        """
        self.logger.info("Starting similarity analysis")
        self.comparison_results = []

        for file_path in self.submissions:
            # Find candidate matches using LSH
            candidates = self.lsh_analyzer.find_similar(file_path)

            for candidate in candidates:
                # Compare each unordered pair once (and never a file with
                # itself); skip index entries without a stored submission.
                if candidate <= file_path or candidate not in self.submissions:
                    continue

                # Detailed similarity analysis
                similarity = self.lsh_analyzer.compute_weighted_similarity(
                    self.submissions[file_path],
                    self.submissions[candidate]
                )

                if similarity['overall_similarity'] >= self.similarity_threshold:
                    self.comparison_results.append({
                        'file1': file_path,
                        'file2': candidate,
                        'similarity_metrics': similarity,
                        'timestamp': datetime.now().isoformat()
                    })



if __name__ == "__main__":
    # Script entry point: analyse every C/C++ file under the submissions
    # directory, compare candidate pairs, write a CSV summary and print the
    # ten most similar pairs.

    # Initialize detector with adjusted threshold
    detector = EnhancedPlagiarismDetector(
        similarity_threshold=0.5,  # Lower threshold to catch more matches
        max_workers=os.cpu_count()
    )

    # Path to submissions
    submissions_dir = r"C:\Users\hp\Desktop\Pixel\Plagiarism_Detection_System\src\B2017"

    # Get all C/C++ files
    cpp_files = []
    for root, _, files in os.walk(submissions_dir):
        for file in files:
            if file.endswith(('.cpp', '.c')):
                cpp_files.append(os.path.join(root, file))

    print(f"Found {len(cpp_files)} C/C++ files")

    # Analyze files
    detector.analyze_files(cpp_files)

    print(f"Successfully analyzed {len(detector.submissions)} files")

    # Override the similarity calculation method with a more accurate version
    def improved_similarity(submission1, submission2):
        """Compute improved similarity with better weights for identical files.

        Combines token-set overlap, position-wise sequence agreement, tree
        edit similarity, subtree-hash overlap and branching similarity, and
        adds a confidence score derived from how much those metrics agree.
        On any error an all-zero result is returned and the error is printed.
        """
        try:
            # Direct token comparison (most reliable for exact matches)
            tokens1 = [t[1] for t in submission1['token_features']['normalized']]
            tokens2 = [t[1] for t in submission2['token_features']['normalized']]

            # Calculate token-based similarity
            common_tokens = set(tokens1) & set(tokens2)
            all_tokens = set(tokens1) | set(tokens2)
            token_sim = len(common_tokens) / len(all_tokens) if all_tokens else 0

            # Calculate sequence similarity (for exact matches)
            min_len = min(len(tokens1), len(tokens2))
            max_len = max(len(tokens1), len(tokens2))

            # Check for sequence matches
            matches = 0
            for i in range(min_len):
                if tokens1[i] == tokens2[i]:
                    matches += 1

            sequence_sim = matches / max_len if max_len > 0 else 0

            # Tree edit distance similarity
            tree_sim = ScalableSimilarityAnalyzer.compute_tree_distance(
                submission1['ast_features']['zss_tree'],
                submission2['ast_features']['zss_tree']
            )

            # Subtree hash similarity
            hashes1 = set(submission1['ast_features']['subtree_hashes'].values())
            hashes2 = set(submission2['ast_features']['subtree_hashes'].values())
            hash_sim = len(hashes1.intersection(hashes2)) / len(hashes1.union(hashes2)) if hashes1 or hashes2 else 0

            # Structure similarity
            struct1 = submission1['ast_features']['graph_structure']
            struct2 = submission2['ast_features']['graph_structure']
            struct_sim = 1 - abs(struct1['avg_branching'] - struct2['avg_branching']) / \
                         max(struct1['avg_branching'], struct2['avg_branching']) if max(struct1['avg_branching'], struct2['avg_branching']) > 0 else 1.0

            # Heavily weighted combination favoring token and sequence similarity
            overall_sim = 0.4 * token_sim + 0.3 * sequence_sim + 0.15 * tree_sim + 0.1 * hash_sim + 0.05 * struct_sim

            # Confidence score based on multiple metrics agreement
            metrics = [token_sim, sequence_sim, tree_sim, hash_sim, struct_sim]
            avg = sum(metrics) / len(metrics)
            variance = sum((m - avg) ** 2 for m in metrics) / len(metrics)
            confidence = 1 - (variance * 2)  # Lower variance means higher confidence

            return {
                'token_similarity': token_sim,
                'sequence_similarity': sequence_sim,
                'tree_edit_similarity': tree_sim,
                'subtree_hash_similarity': hash_sim,
                'structure_similarity': struct_sim,
                'overall_similarity': overall_sim,
                'confidence': max(0, min(1, confidence))  # Clamp between 0 and 1
            }
        except Exception as e:
            print(f"Error in similarity calculation: {e}")
            # Fallback to basic similarity
            return {
                'overall_similarity': 0.0,
                'tree_edit_similarity': 0.0,
                'subtree_hash_similarity': 0.0,
                'structure_similarity': 0.0,
                'confidence': 0.0
            }

    # Replace the similarity calculation method. The plain function is
    # assigned: attributes set on an instance are not bound as methods, and
    # wrapping in staticmethod() here would hand callers a staticmethod
    # object, which is only directly callable from Python 3.10 onwards.
    detector.lsh_analyzer.compute_weighted_similarity = improved_similarity

    # Find similarities
    detector.find_similarities()

    print(f"Found {len(detector.comparison_results)} potential similarity matches")

    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save CSV summary
    csv_file = detector.output_dir / f'summary_{timestamp}.csv'
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            'File 1', 'File 2', 'Overall Similarity',
            'Token Similarity', 'Sequence Similarity',
            'Tree Edit Similarity', 'Subtree Hash Similarity',
            'Structure Similarity', 'Confidence'
        ])

        for result in detector.comparison_results:
            metrics = result['similarity_metrics']
            # .get() covers the fallback result, which lacks some metrics
            writer.writerow([
                os.path.basename(result['file1']),
                os.path.basename(result['file2']),
                metrics['overall_similarity'],
                metrics.get('token_similarity', 'N/A'),
                metrics.get('sequence_similarity', 'N/A'),
                metrics['tree_edit_similarity'],
                metrics['subtree_hash_similarity'],
                metrics['structure_similarity'],
                metrics.get('confidence', 'N/A')
            ])

    # Print summary
    print(f"\nAnalysis complete! Results saved to: {csv_file}")

    if detector.comparison_results:
        print(f"\nFound {len(detector.comparison_results)} potential cases of plagiarism:")

        # Sort by similarity
        sorted_results = sorted(
            detector.comparison_results,
            key=lambda x: x['similarity_metrics']['overall_similarity'],
            reverse=True
        )

        for i, result in enumerate(sorted_results[:10]):  # Show top 10
            metrics = result['similarity_metrics']
            print(f"\n{i+1}. Files:")
            print(f"   - {os.path.basename(result['file1'])}")
            print(f"   - {os.path.basename(result['file2'])}")
            print(f"   Overall Similarity: {metrics['overall_similarity']:.2%}")
            if 'token_similarity' in metrics:
                print(f"   Token Similarity: {metrics['token_similarity']:.2%}")
            if 'sequence_similarity' in metrics:
                print(f"   Sequence Similarity: {metrics['sequence_similarity']:.2%}")
            print(f"   Tree Edit Similarity: {metrics['tree_edit_similarity']:.2%}")
            print(f"   Subtree Hash Similarity: {metrics['subtree_hash_similarity']:.2%}")
            print(f"   Structure Similarity: {metrics['structure_similarity']:.2%}")
            if 'confidence' in metrics:
                print(f"   Confidence: {metrics['confidence']:.2%}")

            # Check for exact duplicates
            if metrics['overall_similarity'] > 0.95:
                print("   ⚠️ EXACT DUPLICATE DETECTED ⚠️")

        if len(sorted_results) > 10:
            print(f"\n... and {len(sorted_results) - 10} more cases (see CSV file for details)")
    else:
        print("\nNo suspicious similarities found.")

# In [3]:
import os
import sys
import tkinter as tk
from tkinter import ttk, filedialog, scrolledtext, messagebox
from datetime import datetime
import threading
import csv
from pathlib import Path

class PlagiarismDetectorGUI:
    """Tkinter window wrapping the notebook's plagiarism detector.

    The window holds a two-tab notebook ("File Comparison" and
    "Bulk Analysis"), a status bar and an indeterminate progress bar.
    On construction it also builds a detector and replaces its pairwise
    similarity function (see ``initialize_detector``).

    Args:
        root: Tk root (or toplevel) window that hosts the widgets.
        notebook_module: Object exposing ``EnhancedPlagiarismDetector`` and
            ``ScalableSimilarityAnalyzer``; both are looked up on it by name.
    """

    def __init__(self, root, notebook_module):
        self.root = root
        self.root.title("Plagiarism Detection System")
        self.root.geometry("800x600")
        self.root.minsize(800, 600)

        # Import notebook module with all the classes
        self.nb_module = notebook_module

        # Set up the main frame
        self.main_frame = ttk.Frame(root, padding="10")
        self.main_frame.pack(fill=tk.BOTH, expand=True)

        # Create tabs
        self.tab_control = ttk.Notebook(self.main_frame)

        # File comparison tab
        self.file_tab = ttk.Frame(self.tab_control)
        self.tab_control.add(self.file_tab, text="File Comparison")

        # Bulk analysis tab
        self.bulk_tab = ttk.Frame(self.tab_control)
        self.tab_control.add(self.bulk_tab, text="Bulk Analysis")

        self.tab_control.pack(fill=tk.BOTH, expand=True)

        # NOTE(review): setup_file_comparison_tab and setup_bulk_analysis_tab
        # are not defined in this class body; confirm they are provided
        # elsewhere (subclass or later patch) before instantiating, otherwise
        # the two calls below raise AttributeError.
        self.setup_file_comparison_tab()
        self.setup_bulk_analysis_tab()

        # Status bar
        self.status_var = tk.StringVar()
        self.status_var.set("Ready")
        self.status_bar = ttk.Label(root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W)
        self.status_bar.pack(side=tk.BOTTOM, fill=tk.X)

        # Progress bar
        self.progress = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=100, mode='indeterminate')
        self.progress.pack(side=tk.BOTTOM, fill=tk.X, padx=10, pady=5)

        # Initialize the detector (needs status_var, so it must come last)
        self.detector = None
        self.initialize_detector()

    def initialize_detector(self):
        """Create the detector and install the GUI's similarity function.

        Builds ``nb_module.EnhancedPlagiarismDetector`` with a 0.7 threshold
        and overrides ``detector.lsh_analyzer.compute_weighted_similarity``
        with a function that first checks for whitespace-insensitive file
        equality and otherwise combines token, sequence, tree-edit,
        subtree-hash and structure metrics.

        Any failure is reported through the status bar and stdout rather than
        raised; ``self.detector`` may then be ``None`` or only partly set up.
        """
        try:
            # Create an optimized detector using classes from the notebook
            self.detector = self.nb_module.EnhancedPlagiarismDetector(
                similarity_threshold=0.7,
                max_workers=os.cpu_count()
            )

            def optimized_similarity(submission1, submission2):
                """Return a dict of similarity metrics for two submissions.

                Each submission is a dict providing ``file_path``,
                ``token_features['normalized']`` (items indexed with ``[1]``)
                and ``ast_features`` with ``zss_tree``, ``subtree_hashes``
                and ``graph_structure['avg_branching']``.

                On any unexpected error a zeroed fallback dict is returned.
                """
                try:
                    file1 = submission1['file_path']
                    file2 = submission2['file_path']

                    # Direct content comparison for exact matches. This is
                    # best-effort: an unreadable file just means we fall
                    # through to the feature-based metrics below.
                    try:
                        with open(file1, 'r', encoding='utf-8', errors='ignore') as f:
                            content1 = ''.join(f.read().split())
                        with open(file2, 'r', encoding='utf-8', errors='ignore') as f:
                            content2 = ''.join(f.read().split())
                    except (OSError, ValueError, TypeError):
                        identical = False
                    else:
                        identical = content1 == content2

                    if identical:
                        # Same key set as the normal path (plus direct_match)
                        # so consumers never fall back to a default
                        # confidence for a certain match.
                        return {
                            'direct_match': 1.0,
                            'token_similarity': 1.0,
                            'sequence_similarity': 1.0,
                            'tree_edit_similarity': 1.0,
                            'subtree_hash_similarity': 1.0,
                            'structure_similarity': 1.0,
                            'overall_similarity': 1.0,
                            'confidence': 1.0
                        }

                    # Token comparison
                    tokens1 = [t[1] for t in submission1['token_features']['normalized']]
                    tokens2 = [t[1] for t in submission2['token_features']['normalized']]

                    # Jaccard overlap of the token vocabularies
                    vocab1, vocab2 = set(tokens1), set(tokens2)
                    all_tokens = vocab1 | vocab2
                    token_sim = len(vocab1 & vocab2) / len(all_tokens) if all_tokens else 0

                    # Position-wise agreement, normalised by the longer sequence
                    max_len = max(len(tokens1), len(tokens2))
                    matches = sum(a == b for a, b in zip(tokens1, tokens2))
                    sequence_sim = matches / max_len if max_len > 0 else 0

                    # Tree edit distance
                    tree_sim = self.nb_module.ScalableSimilarityAnalyzer.compute_tree_distance(
                        submission1['ast_features']['zss_tree'],
                        submission2['ast_features']['zss_tree']
                    )

                    # Jaccard overlap of the subtree hash values
                    hashes1 = set(submission1['ast_features']['subtree_hashes'].values())
                    hashes2 = set(submission2['ast_features']['subtree_hashes'].values())
                    all_hashes = hashes1 | hashes2
                    hash_sim = len(hashes1 & hashes2) / len(all_hashes) if all_hashes else 0

                    # Structure similarity: relative difference in average branching
                    branching1 = submission1['ast_features']['graph_structure']['avg_branching']
                    branching2 = submission2['ast_features']['graph_structure']['avg_branching']
                    max_branching = max(branching1, branching2)
                    if max_branching > 0:
                        struct_sim = 1 - abs(branching1 - branching2) / max_branching
                    else:
                        struct_sim = 1.0

                    # Weighted combination
                    overall_sim = (0.3 * token_sim + 0.1 * sequence_sim + 0.3 * tree_sim
                                   + 0.2 * hash_sim + 0.1 * struct_sim)

                    # Confidence: lower variance across metrics means they agree more
                    metrics = [token_sim, tree_sim, hash_sim, struct_sim]
                    avg = sum(metrics) / len(metrics)
                    variance = sum((m - avg) ** 2 for m in metrics) / len(metrics)
                    confidence = max(0, min(1, 1 - (variance * 2)))  # Clamp to [0,1]

                    return {
                        'token_similarity': token_sim,
                        'sequence_similarity': sequence_sim,
                        'tree_edit_similarity': tree_sim,
                        'subtree_hash_similarity': hash_sim,
                        'structure_similarity': struct_sim,
                        'overall_similarity': overall_sim,
                        'confidence': confidence
                    }
                except Exception as e:
                    print(f"Error in similarity calculation: {e}")
                    return {
                        'overall_similarity': 0.0,
                        'tree_edit_similarity': 0.0,
                        'subtree_hash_similarity': 0.0,
                        'structure_similarity': 0.0,
                        'confidence': 0.0
                    }

            # Override the similarity calculation
            self.detector.lsh_analyzer.compute_weighted_similarity = staticmethod(optimized_similarity)
            self.status_var.set("Detector initialized successfully")

        except Exception as e:
            self.status_var.set(f"Initialization error: {str(e)}")
            print(f"Initialization error: {e}")


In [4]:
# GUI for Plagiarism Detection System
def create_gui():
    """Build the plagiarism-detection window and run its event loop.

    The window has two tabs:

    * "File Comparison" compares two chosen files and shows an overall
      similarity meter plus per-metric details.
    * "Directory Analysis" walks a directory for ``.c``/``.cpp`` files,
      runs the detector over them, lists the resulting pairs and can export
      them to CSV.

    Detector work runs on daemon threads; results are handed back to the Tk
    thread with ``root.after``. The function blocks in ``root.mainloop()``
    until the window is closed.

    Relies on ``EnhancedPlagiarismDetector`` and
    ``ScalableSimilarityAnalyzer`` being defined at module level.
    """
    import tkinter as tk
    from tkinter import ttk, filedialog, scrolledtext, messagebox
    import threading
    import csv
    import os

    # Create the main window
    root = tk.Tk()
    root.title("Plagiarism Detection System")
    root.geometry("900x700")
    root.minsize(800, 600)

    # Create a detector instance
    detector = EnhancedPlagiarismDetector(
        similarity_threshold=0.5,
        max_workers=os.cpu_count()
    )

    # Add improved similarity method
    def improved_similarity(submission1, submission2):
        """Compute improved similarity with better weights for identical files.

        Returns a dict of metrics in [0, 1]; on any error a zeroed fallback
        dict (without the token/sequence keys) is returned instead.
        """
        try:
            # Direct token comparison (most reliable for exact matches)
            tokens1 = [t[1] for t in submission1['token_features']['normalized']]
            tokens2 = [t[1] for t in submission2['token_features']['normalized']]

            # Jaccard overlap of the token vocabularies
            vocab1, vocab2 = set(tokens1), set(tokens2)
            all_tokens = vocab1 | vocab2
            token_sim = len(vocab1 & vocab2) / len(all_tokens) if all_tokens else 0

            # Position-wise agreement, normalised by the longer sequence
            max_len = max(len(tokens1), len(tokens2))
            matches = sum(a == b for a, b in zip(tokens1, tokens2))
            sequence_sim = matches / max_len if max_len > 0 else 0

            # Tree edit distance similarity
            tree_sim = ScalableSimilarityAnalyzer.compute_tree_distance(
                submission1['ast_features']['zss_tree'],
                submission2['ast_features']['zss_tree']
            )

            # Jaccard overlap of the subtree hash values
            hashes1 = set(submission1['ast_features']['subtree_hashes'].values())
            hashes2 = set(submission2['ast_features']['subtree_hashes'].values())
            all_hashes = hashes1 | hashes2
            hash_sim = len(hashes1 & hashes2) / len(all_hashes) if all_hashes else 0

            # Structure similarity: relative difference in average branching
            branching1 = submission1['ast_features']['graph_structure']['avg_branching']
            branching2 = submission2['ast_features']['graph_structure']['avg_branching']
            max_branching = max(branching1, branching2)
            if max_branching > 0:
                struct_sim = 1 - abs(branching1 - branching2) / max_branching
            else:
                struct_sim = 1.0

            # Heavily weighted combination favoring token and sequence similarity
            overall_sim = (0.4 * token_sim + 0.3 * sequence_sim + 0.15 * tree_sim
                           + 0.1 * hash_sim + 0.05 * struct_sim)

            # Confidence: lower variance across metrics means they agree more
            metrics = [token_sim, sequence_sim, tree_sim, hash_sim, struct_sim]
            avg = sum(metrics) / len(metrics)
            variance = sum((m - avg) ** 2 for m in metrics) / len(metrics)
            confidence = 1 - (variance * 2)

            return {
                'token_similarity': token_sim,
                'sequence_similarity': sequence_sim,
                'tree_edit_similarity': tree_sim,
                'subtree_hash_similarity': hash_sim,
                'structure_similarity': struct_sim,
                'overall_similarity': overall_sim,
                'confidence': max(0, min(1, confidence))  # Clamp between 0 and 1
            }
        except Exception as e:
            print(f"Error in similarity calculation: {e}")
            # Fallback to basic similarity
            return {
                'overall_similarity': 0.0,
                'tree_edit_similarity': 0.0,
                'subtree_hash_similarity': 0.0,
                'structure_similarity': 0.0,
                'confidence': 0.0
            }

    # Replace the similarity calculation method
    detector.lsh_analyzer.compute_weighted_similarity = staticmethod(improved_similarity)

    # Main frame with notebook (tabs)
    main_frame = ttk.Frame(root, padding=10)
    main_frame.pack(fill=tk.BOTH, expand=True)

    notebook = ttk.Notebook(main_frame)
    notebook.pack(fill=tk.BOTH, expand=True)

    # File comparison tab
    file_tab = ttk.Frame(notebook)
    notebook.add(file_tab, text="File Comparison")

    # Directory analysis tab
    dir_tab = ttk.Frame(notebook)
    notebook.add(dir_tab, text="Directory Analysis")

    # Status bar and progress bar
    status_var = tk.StringVar(value="Ready")
    progress = ttk.Progressbar(root, orient="horizontal", mode="indeterminate")
    progress.pack(side=tk.BOTTOM, fill=tk.X, padx=10, pady=5)
    status_bar = ttk.Label(root, textvariable=status_var, relief=tk.SUNKEN, anchor=tk.W)
    status_bar.pack(side=tk.BOTTOM, fill=tk.X)

    def update_status(message):
        """Show *message* in the status bar; stop the spinner on completion or error."""
        status_var.set(message)
        if "complete" in message.lower() or "error" in message.lower():
            progress.stop()

    # ===== File Comparison Tab =====
    file_frame = ttk.LabelFrame(file_tab, text="Select Files", padding=10)
    file_frame.pack(fill=tk.X, padx=10, pady=10)

    # Tk expects the patterns of one file type as a whitespace-separated
    # list; a single "*.c;*.cpp" glob only works by accident on Windows.
    source_filetypes = [("C/C++ Files", "*.c *.cpp"), ("All Files", "*.*")]

    def browse_source_file(target_var):
        """Ask for a source file and store the chosen path in *target_var*."""
        filename = filedialog.askopenfilename(filetypes=source_filetypes)
        if filename:
            target_var.set(filename)

    # File 1
    file1_var = tk.StringVar()
    ttk.Label(file_frame, text="File 1:").grid(row=0, column=0, sticky=tk.W, pady=5)
    ttk.Entry(file_frame, textvariable=file1_var, width=60).grid(row=0, column=1, padx=5, pady=5)
    ttk.Button(
        file_frame, text="Browse", command=lambda: browse_source_file(file1_var)
    ).grid(row=0, column=2, padx=5, pady=5)

    # File 2
    file2_var = tk.StringVar()
    ttk.Label(file_frame, text="File 2:").grid(row=1, column=0, sticky=tk.W, pady=5)
    ttk.Entry(file_frame, textvariable=file2_var, width=60).grid(row=1, column=1, padx=5, pady=5)
    ttk.Button(
        file_frame, text="Browse", command=lambda: browse_source_file(file2_var)
    ).grid(row=1, column=2, padx=5, pady=5)

    # Results area
    results_frame = ttk.LabelFrame(file_tab, text="Comparison Results", padding=10)
    results_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

    # Similarity meter
    similarity_frame = ttk.Frame(results_frame)
    similarity_frame.pack(fill=tk.X, padx=5, pady=10)

    similarity_label_var = tk.StringVar(value="Similarity: 0.00%")
    similarity_label = ttk.Label(similarity_frame, textvariable=similarity_label_var, font=("Arial", 12, "bold"))
    similarity_label.pack(side=tk.TOP, pady=5)

    similarity_meter = ttk.Progressbar(similarity_frame, orient="horizontal", length=300, mode="determinate")
    similarity_meter.pack(side=tk.TOP, pady=5)

    # Detailed results (kept read-only except while we write into it)
    results_text = scrolledtext.ScrolledText(results_frame, wrap=tk.WORD, height=20)
    results_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
    results_text.tag_configure("high", foreground="red", font=("Arial", 11, "bold"))
    results_text.tag_configure("medium", foreground="orange", font=("Arial", 11, "bold"))
    results_text.tag_configure("low", foreground="green", font=("Arial", 11, "bold"))
    results_text.tag_configure("warning", foreground="red", font=("Arial", 12, "bold"))
    results_text.config(state=tk.DISABLED)

    def compare_files():
        """Validate the two selected paths and compare them on a worker thread."""
        file1 = file1_var.get()
        file2 = file2_var.get()

        if not file1 or not file2:
            messagebox.showerror("Error", "Please select two files to compare")
            return

        if not os.path.exists(file1) or not os.path.exists(file2):
            messagebox.showerror("Error", "One or both selected files do not exist")
            return

        # Clear previous results
        results_text.config(state=tk.NORMAL)
        results_text.delete(1.0, tk.END)
        results_text.config(state=tk.DISABLED)

        # Update UI
        status_var.set("Comparing files...")
        progress.start()

        def run_comparison():
            try:
                # Reset detector for clean comparison
                detector.submissions = {}

                # Analyze files
                detector.analyze_files([file1, file2])

                # Check if files were analyzed successfully
                if file1 not in detector.submissions or file2 not in detector.submissions:
                    root.after(0, update_status, "Error: Failed to analyze one or both files")
                    return

                # Calculate similarity
                similarity = detector.lsh_analyzer.compute_weighted_similarity(
                    detector.submissions[file1],
                    detector.submissions[file2]
                )

                # Update UI with results on the Tk thread
                root.after(0, display_results, file1, file2, similarity)

            except Exception as e:
                # Build the message now: ``e`` is unbound as soon as this
                # handler exits, so a callback reading it later would raise
                # NameError and the error would never reach the user.
                message = f"Error: {str(e)}"
                root.after(0, update_status, message)

        # Run comparison in a separate thread
        threading.Thread(target=run_comparison, daemon=True).start()

    def display_results(file1, file2, similarity):
        """Render one comparison result into the meter and the details box."""
        # Stop progress and update status
        progress.stop()
        status_var.set("Comparison complete")

        # Update similarity meter
        overall_sim = similarity['overall_similarity'] * 100
        similarity_label_var.set(f"Similarity: {overall_sim:.2f}%")
        similarity_meter['value'] = overall_sim

        # Update detailed results
        results_text.config(state=tk.NORMAL)

        # File info
        results_text.insert(tk.END, f"File 1: {os.path.basename(file1)}\n")
        results_text.insert(tk.END, f"File 2: {os.path.basename(file2)}\n\n")

        # Overall similarity with color coding
        results_text.insert(tk.END, "Overall Similarity: ")
        if overall_sim >= 90:
            results_text.insert(tk.END, f"{overall_sim:.2f}%\n", "high")
            results_text.insert(tk.END, "⚠️ HIGH PROBABILITY OF PLAGIARISM ⚠️\n\n", "warning")
        elif overall_sim >= 70:
            results_text.insert(tk.END, f"{overall_sim:.2f}%\n", "medium")
            results_text.insert(tk.END, "⚠️ SUSPICIOUS SIMILARITY DETECTED ⚠️\n\n", "warning")
        else:
            results_text.insert(tk.END, f"{overall_sim:.2f}%\n", "low")

        # Add confidence if available
        if 'confidence' in similarity:
            confidence = similarity['confidence'] * 100
            results_text.insert(tk.END, f"Confidence: {confidence:.2f}%\n\n")

        # Detailed metrics; each one is optional in the similarity dict
        results_text.insert(tk.END, "Detailed Metrics:\n")
        for key, label in (
            ('token_similarity', "Token Similarity"),
            ('sequence_similarity', "Sequence Similarity"),
            ('tree_edit_similarity', "Tree Edit Similarity"),
            ('subtree_hash_similarity', "Subtree Hash Similarity"),
            ('structure_similarity', "Structure Similarity"),
        ):
            if key in similarity:
                results_text.insert(tk.END, f"{label}: {similarity[key] * 100:.2f}%\n")

        results_text.config(state=tk.DISABLED)

    # Compare button
    ttk.Button(file_frame, text="Compare Files", command=compare_files).grid(row=2, column=1, pady=10)

    # ===== Directory Analysis Tab =====
    dir_frame = ttk.LabelFrame(dir_tab, text="Directory Analysis", padding=10)
    dir_frame.pack(fill=tk.X, padx=10, pady=10)

    # Directory path
    dir_path_var = tk.StringVar()
    ttk.Label(dir_frame, text="Directory:").grid(row=0, column=0, sticky=tk.W, pady=5)
    ttk.Entry(dir_frame, textvariable=dir_path_var, width=60).grid(row=0, column=1, padx=5, pady=5)

    def browse_dir():
        dirname = filedialog.askdirectory()
        if dirname:
            dir_path_var.set(dirname)

    ttk.Button(dir_frame, text="Browse", command=browse_dir).grid(row=0, column=2, padx=5, pady=5)

    # Options frame
    options_frame = ttk.LabelFrame(dir_frame, text="Options")
    options_frame.grid(row=1, column=0, columnspan=3, sticky=tk.EW, padx=5, pady=5)

    # Threshold
    ttk.Label(options_frame, text="Similarity Threshold:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=5)
    threshold_var = tk.DoubleVar(value=0.5)
    threshold_spinbox = ttk.Spinbox(options_frame, from_=0.1, to=1.0, increment=0.05, textvariable=threshold_var, width=5)
    threshold_spinbox.grid(row=0, column=1, padx=5, pady=5)

    # Max files
    ttk.Label(options_frame, text="Max Files (0 for all):").grid(row=0, column=2, sticky=tk.W, padx=15, pady=5)
    max_files_var = tk.IntVar(value=0)
    max_files_spinbox = ttk.Spinbox(options_frame, from_=0, to=1000, increment=10, textvariable=max_files_var, width=5)
    max_files_spinbox.grid(row=0, column=3, padx=5, pady=5)

    # Results treeview
    results_tree_frame = ttk.LabelFrame(dir_tab, text="Results", padding=10)
    results_tree_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

    # Create treeview with scrollbars
    tree_columns = ("file1", "file2", "similarity", "confidence", "match_type")
    results_tree = ttk.Treeview(results_tree_frame, columns=tree_columns, show="headings")

    # Set column headings
    results_tree.heading("file1", text="File 1")
    results_tree.heading("file2", text="File 2")
    results_tree.heading("similarity", text="Similarity")
    results_tree.heading("confidence", text="Confidence")
    results_tree.heading("match_type", text="Match Type")

    # Set column widths
    results_tree.column("file1", width=180)
    results_tree.column("file2", width=180)
    results_tree.column("similarity", width=80, anchor=tk.CENTER)
    results_tree.column("confidence", width=80, anchor=tk.CENTER)
    results_tree.column("match_type", width=150, anchor=tk.CENTER)

    # Add scrollbars
    tree_vsb = ttk.Scrollbar(results_tree_frame, orient="vertical", command=results_tree.yview)
    tree_hsb = ttk.Scrollbar(results_tree_frame, orient="horizontal", command=results_tree.xview)
    results_tree.configure(yscrollcommand=tree_vsb.set, xscrollcommand=tree_hsb.set)

    # Grid layout for treeview and scrollbars
    results_tree.grid(row=0, column=0, sticky=tk.NSEW)
    tree_vsb.grid(row=0, column=1, sticky=tk.NS)
    tree_hsb.grid(row=1, column=0, sticky=tk.EW)

    # Configure grid weights
    results_tree_frame.columnconfigure(0, weight=1)
    results_tree_frame.rowconfigure(0, weight=1)

    # Configure tags for colors
    results_tree.tag_configure("high", background="#ffcccc")
    results_tree.tag_configure("medium", background="#ffffcc")
    results_tree.tag_configure("low", background="#ccffcc")

    def analyze_directory():
        """Validate the inputs and analyse the chosen directory on a worker thread."""
        directory = dir_path_var.get()

        if not directory or not os.path.isdir(directory):
            messagebox.showerror("Error", "Please select a valid directory")
            return

        # Spinboxes accept free text; a non-numeric entry makes .get() raise
        # TclError, which would otherwise only show up on stderr.
        try:
            threshold = threshold_var.get()
            max_files = max_files_var.get()
        except tk.TclError:
            messagebox.showerror("Error", "Threshold and max files must be numeric")
            return

        # Clear previous results
        for item in results_tree.get_children():
            results_tree.delete(item)

        # Update UI
        status_var.set("Starting analysis...")
        progress.start()

        def run_analysis():
            try:
                # Reset detector
                detector.submissions = {}
                detector.comparison_results = []
                detector.similarity_threshold = threshold

                # Get all C/C++ files
                cpp_files = []
                for root_dir, _, files in os.walk(directory):
                    for file in files:
                        if file.endswith(('.cpp', '.c')):
                            cpp_files.append(os.path.join(root_dir, file))

                # Limit files if needed
                if max_files > 0 and len(cpp_files) > max_files:
                    cpp_files = cpp_files[:max_files]

                # Status updates go through root.after so they run on the Tk thread
                root.after(0, status_var.set, f"Analyzing {len(cpp_files)} files...")

                # Analyze files
                detector.analyze_files(cpp_files)

                root.after(0, status_var.set, "Finding similarities...")
                detector.find_similarities()

                # Sort results by similarity
                sorted_results = sorted(
                    detector.comparison_results,
                    key=lambda x: x['similarity_metrics']['overall_similarity'],
                    reverse=True
                )

                # Send results to main thread using root.after
                root.after(0, display_directory_results, sorted_results)

            except Exception as e:
                # Build the message now: ``e`` is unbound as soon as this
                # handler exits, so a callback reading it later would raise
                # NameError and leave the progress bar spinning.
                message = f"Error: {str(e)}"
                root.after(0, update_status, message)

        # Run analysis in a separate thread
        threading.Thread(target=run_analysis, daemon=True).start()

    def display_directory_results(sorted_results=None):
        """Fill the results tree; defaults to the detector's own results, sorted."""
        # Stop progress
        progress.stop()

        # If no results were provided, use the detector's results
        if sorted_results is None:
            sorted_results = sorted(
                detector.comparison_results,
                key=lambda x: x['similarity_metrics']['overall_similarity'],
                reverse=True
            )

        # Add to treeview
        for result in sorted_results:
            metrics = result['similarity_metrics']
            overall_sim = metrics['overall_similarity'] * 100
            confidence = metrics.get('confidence', 0.5) * 100

            # Match type and row colour share the same thresholds
            if overall_sim >= 90:
                match_type, tag = "HIGH SIMILARITY", "high"
            elif overall_sim >= 70:
                match_type, tag = "SUSPICIOUS", "medium"
            else:
                match_type, tag = "LOW SIMILARITY", "low"

            results_tree.insert(
                "",
                tk.END,
                values=(
                    os.path.basename(result['file1']),
                    os.path.basename(result['file2']),
                    f"{overall_sim:.2f}%",
                    f"{confidence:.2f}%",
                    match_type
                ),
                tags=(tag,)
            )

        # Update status
        update_status(f"Analysis complete. Found {len(sorted_results)} potential matches.")

    def format_metric(metrics, key):
        """Format an optional metric for CSV: four decimals, or 'N/A' if absent."""
        return f"{metrics[key]:.4f}" if key in metrics else 'N/A'

    def export_results():
        """Write the detector's current comparison results to a user-chosen CSV file."""
        if not hasattr(detector, 'comparison_results') or not detector.comparison_results:
            messagebox.showinfo("Info", "No results to export")
            return

        # Ask for save location
        filename = filedialog.asksaveasfilename(
            defaultextension=".csv",
            filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
        )

        if not filename:
            return

        try:
            # Sort results by similarity
            sorted_results = sorted(
                detector.comparison_results,
                key=lambda x: x['similarity_metrics']['overall_similarity'],
                reverse=True
            )

            # Save to CSV
            with open(filename, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([
                    'File 1', 'File 2', 'Overall Similarity',
                    'Token Similarity', 'Sequence Similarity',
                    'Tree Edit Similarity', 'Subtree Hash Similarity',
                    'Structure Similarity', 'Confidence'
                ])

                for result in sorted_results:
                    metrics = result['similarity_metrics']
                    writer.writerow([
                        os.path.basename(result['file1']),
                        os.path.basename(result['file2']),
                        f"{metrics['overall_similarity']:.4f}",
                        format_metric(metrics, 'token_similarity'),
                        format_metric(metrics, 'sequence_similarity'),
                        f"{metrics['tree_edit_similarity']:.4f}",
                        f"{metrics['subtree_hash_similarity']:.4f}",
                        f"{metrics['structure_similarity']:.4f}",
                        format_metric(metrics, 'confidence')
                    ])

            messagebox.showinfo("Success", f"Results exported to {filename}")

        except Exception as e:
            messagebox.showerror("Error", f"Failed to export results: {str(e)}")

    # Button frame for directory analysis
    dir_button_frame = ttk.Frame(dir_frame)
    dir_button_frame.grid(row=2, column=0, columnspan=3, pady=10)

    ttk.Button(dir_button_frame, text="Analyze Directory", command=analyze_directory).grid(row=0, column=0, padx=10)
    ttk.Button(dir_button_frame, text="Export Results", command=export_results).grid(row=0, column=1, padx=10)

    # Start the GUI
    root.mainloop()

# Launch the GUI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    create_gui()

2025-05-25 12:46:50,587 - INFO - Starting analysis of 5 files
2025-05-25 12:46:50,594 - INFO - Analyzing E:/DAA_Project/src/Samples\sample1.cpp
2025-05-25 12:46:50,598 - INFO - Analyzing E:/DAA_Project/src/Samples\sample2.cpp
2025-05-25 12:46:50,614 - INFO - Analyzing E:/DAA_Project/src/Samples\sample3.cpp
2025-05-25 12:46:50,617 - INFO - Analyzing E:/DAA_Project/src/Samples\sample4.cpp
2025-05-25 12:46:50,618 - INFO - Analyzing E:/DAA_Project/src/Samples\sample5.cpp
2025-05-25 12:46:51,405 - INFO - Starting similarity analysis
