In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import graphviz

In [2]:
# Very common tokens, keyed by language name. Only a 'python' entry is
# defined; getCosineSimilarity passes that list to TfidfVectorizer as its
# stop_words so these tokens do not contribute to the lexical score.
COMMON_KEYWORDS = {
    'python': [
        # short loop/counter names
        'i', 'j', 'k', 'n',
        # control flow and keywords
        'return', 'for', 'if', 'else', 'while', 'in',
        # ubiquitous builtins and literals
        'len', 'range', 'print', 'True', 'False',
        # definitions
        'def', 'self',
    ],
}

In [3]:
def getNodeLabel(node):
    # Determina il nome del nodo da visualizzare nel grafo
    label = type(node).__name__
    
    # Gestisci i casi speciali per visualizzare informazioni aggiuntive
    if isinstance(node, ast.Module):
        label = "Module"
    elif isinstance(node, ast.FunctionDef):
        label += f" (function: {node.name})"
    elif isinstance(node, ast.ClassDef):
        label += f" (class: {node.name})"
    elif isinstance(node, ast.arguments):
        arg_names = [arg.arg for arg in node.args]
        label += f" (args: {', '.join(arg_names)})"
    elif isinstance(node, ast.arg):
        label += f" (arg: {node.arg})"
    elif isinstance(node, ast.Return):
        label += " (return)"
    elif isinstance(node, ast.Assign):
        targets = [target.id for target in node.targets if isinstance(target, ast.Name)]
        label += f" (assign: {', '.join(targets)})"
    elif isinstance(node, ast.Name):
        label += f" (id: {node.id})"
    elif isinstance(node, ast.Constant):
        label += f" (value: {node.value})"
    elif isinstance(node, ast.BinOp):
        label += f" ({type(node.op).__name__})"
    elif isinstance(node, ast.Call):
        func_name = node.func.id if isinstance(node.func, ast.Name) else type(node.func).__name__
        label += f" (call: {func_name})"
    elif isinstance(node, ast.Attribute):
        label += f" (attribute: {node.attr})"
    elif isinstance(node, ast.Expr):
        label = "Expr"
    elif isinstance(node, ast.If):
        label += " (if statement)"
    elif isinstance(node, ast.For):
        label += " (for loop)"
    elif isinstance(node, ast.While):
        label += " (while loop)"
    elif isinstance(node, ast.With):
        label += " (with statement)"
    elif isinstance(node, ast.Lambda):
        label += " (lambda)"
    elif isinstance(node, ast.BoolOp):
        label += f" (bool op: {type(node.op).__name__})"
    elif isinstance(node, ast.Compare):
        label += " (compare)"
    elif isinstance(node, ast.UnaryOp):
        label += f" (unary op: {type(node.op).__name__})"
    elif isinstance(node, ast.Subscript):
        label += " (subscript)"
    elif isinstance(node, ast.List):
        label += " (list)"
    elif isinstance(node, ast.Dict):
        label += " (dict)"

    return label

In [4]:
def ast_to_graph(code, graph=None, parent=None):
    """Render the AST of ``code`` into a graphviz directed graph.

    Args:
        code: Python source text, or an already parsed ``ast.AST`` node.
            Source text is parsed once; AST nodes are used as they are
            instead of being passed through ``ast.parse`` again.
        graph: graph object to draw into (it must offer ``node`` and
            ``edge``). A new ``graphviz.Digraph`` is created when omitted.
        parent: identifier of an existing graph node to attach the root to,
            or ``None`` for no incoming edge.

    Returns:
        The graph that was drawn into (the new one when ``graph`` was None).

    Raises:
        SyntaxError: if ``code`` is source text that cannot be parsed.
    """
    root = code if isinstance(code, ast.AST) else ast.parse(code)
    if graph is None:
        graph = graphviz.Digraph()

    # The root id is derived from the object identity so that several trees
    # drawn into the same graph do not clash.
    _add_ast_subtree(root, graph, parent, f"n{id(root)}")
    return graph


def _add_ast_subtree(node, graph, parent, node_id):
    """Add ``node`` and, recursively, its children to ``graph`` under ``node_id``."""
    graph.node(node_id, getNodeLabel(node))

    # Connect the node to its parent, if it has one.
    if parent:
        graph.edge(parent, node_id)

    # Child ids are built from the parent's id plus the child's position.
    # id(child) alone is not unique per occurrence: CPython shares the objects
    # used for context and operator nodes (Load, Store, Add, ...), which would
    # merge all of their occurrences into a single graph node.
    for index, child in enumerate(ast.iter_child_nodes(node)):
        _add_ast_subtree(child, graph, node_id, f"{node_id}_{index}")

In [5]:
def calculate_plagiarism_percentage(cosineSimilarity, astSimilarity, cosine_weight=0.5, ast_weight=0.5):
    """Combine the two similarity scores into one weighted score.

    Args:
        cosineSimilarity: the cosine similarity score.
        astSimilarity: the AST similarity score.
        cosine_weight: multiplier applied to ``cosineSimilarity``.
        ast_weight: multiplier applied to ``astSimilarity``.

    Returns:
        The sum of the two weighted scores. The weights are used as given;
        they are not normalised.
    """
    weighted_cosine = cosineSimilarity * cosine_weight
    weighted_ast = astSimilarity * ast_weight
    return weighted_cosine + weighted_ast

In [6]:
def read_code(file_path):
    """Read a file as UTF-8 text and return its whole content.

    Args:
        file_path: path of the file to read.

    Returns:
        str: the complete text of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [7]:
def getCosineSimilarity(codeA, codeB):
    """Return the TF-IDF cosine similarity of two source texts as a percentage.

    Both texts are vectorised together with a ``TfidfVectorizer`` that ignores
    the tokens in ``COMMON_KEYWORDS['python']`` and keeps at most 1000
    features. The cosine of the two resulting vectors is scaled to 0-100.

    Args:
        codeA: first source text.
        codeB: second source text.

    Returns:
        float: similarity in the range 0-100. ``0.0`` is returned when no
        token survives tokenisation and stop-word removal, because there is
        then no lexical evidence to compare.
    """
    # TfidfVectorizer lowercases the text before tokenising (lowercase=True is
    # its default), so the stop words are lowercased as well; otherwise
    # entries such as 'True' and 'False' would never be filtered out.
    # TODO: extend with further keyword lists and possibly COMMON_PATTERNS.
    stop_words = sorted({word.lower() for word in COMMON_KEYWORDS['python']})
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)

    try:
        tfidf_matrix = vectorizer.fit_transform([codeA, codeB])
    except ValueError as error:
        # scikit-learn signals "nothing left to vectorise" with a ValueError
        # mentioning an empty vocabulary; any other ValueError is unexpected.
        if "empty vocabulary" not in str(error):
            raise
        return 0.0

    # cosine_similarity accepts the sparse matrix directly; entry [0, 1] is
    # the similarity between the first and the second document.
    similarity = float(cosine_similarity(tfidf_matrix)[0, 1])

    # Guard against floating-point drift outside the valid cosine range.
    similarity = min(max(similarity, 0.0), 1.0)
    return similarity * 100

In [8]:
def getAST_similarity(codeA, codeB):
    """Return the structural similarity of two Python sources, scaled by 100.

    Both sources are parsed with ``ast.parse`` (``codeA`` first) and the two
    module trees are scored with ``compare_ast_nodes``.

    Args:
        codeA: first Python source text.
        codeB: second Python source text.

    Returns:
        The score returned by ``compare_ast_nodes`` multiplied by 100.
    """
    return compare_ast_nodes(ast.parse(codeA), ast.parse(codeB)) * 100

def compare_ast_lists(childListA, childListB):
    """Score two child lists pairwise.

    Args:
        childListA: children taken from the first tree.
        childListB: children taken from the second tree.

    Returns:
        ``0.3`` when the lists differ in length, ``1`` when both are empty,
        otherwise the mean of ``compare_ast_nodes`` over the items paired by
        position.
    """
    sizeA, sizeB = len(childListA), len(childListB)
    if sizeA != sizeB:
        return 0.3
    if sizeA == 0:
        # Equal lengths were established above, so both lists are empty.
        return 1

    total = 0
    for left, right in zip(childListA, childListB):
        total += compare_ast_nodes(left, right)
    return total / sizeA

def compare_ast_nodes(nodeA, nodeB):
    """Score how alike two AST nodes (or AST leaf values) are.

    Args:
        nodeA: node or leaf value from the first tree.
        nodeB: node or leaf value from the second tree.

    Returns:
        float: ``0.1`` when the two values have different types. For two
        non-AST values of the same type, ``1.0`` if they are equal and ``0.9``
        otherwise. For two AST nodes of the same type, the product over all
        fields of: the ``compare_ast_lists`` score for list fields, the
        recursive score for node fields, and a factor ``0.9`` for every other
        field whose values differ.
    """
    if type(nodeA) is not type(nodeB):
        return 0.1

    # Lists inside an AST may hold plain values rather than nodes, e.g. the
    # identifier strings of `global`/`nonlocal` or the None placeholders in
    # `kw_defaults` and `Dict.keys`. They have no `_fields`, so they are
    # scored like scalar fields instead of raising AttributeError.
    if not isinstance(nodeA, ast.AST):
        return 1.0 if nodeA == nodeB else 0.9

    score = 1.0
    for field in nodeA._fields:
        valueA = getattr(nodeA, field, None)
        valueB = getattr(nodeB, field, None)

        if isinstance(valueA, list) and isinstance(valueB, list):
            score *= compare_ast_lists(valueA, valueB)
        elif isinstance(valueA, ast.AST) and isinstance(valueB, ast.AST):
            score *= compare_ast_nodes(valueA, valueB)
        elif valueA != valueB:
            # Scalar fields (names, constants) and node-versus-None mismatches.
            score *= 0.9

    return score

In [9]:
# Cut-off for the cosine similarity score; presumably a percentage, matching
# the 0-100 value returned by getCosineSimilarity (TODO confirm).
# NOTE(review): no other cell visible here reads this name, and "treshold" is
# a misspelling of "threshold" -- kept as is in case other code depends on it.
tresholdCosineSimilarity = 50

In [10]:
# --- Inputs ---------------------------------------------------------------
printAST_trees = False  # set to True to render both ASTs as PNG images
dirPath = './Codes/'
fileA_path = 'A/dfs_A.txt'
fileB_path = 'B/dfs_B.txt'
codeA = read_code(os.path.join(dirPath, fileA_path))
codeB = read_code(os.path.join(dirPath, fileB_path))

# --- Report cut-offs (percent) ----------------------------------------------
HIGH_SIMILARITY = 75      # at or above: flagged as high
MODERATE_SIMILARITY = 50  # strictly above (and below HIGH): manual review

if printAST_trees:
    dotGrapthA = ast_to_graph(codeA)
    dotGrapthA.render('ast_codeA', format='png', view=True)
    dotGrapthB = ast_to_graph(codeB)
    dotGrapthB.render('ast_codeB', format='png', view=True)

# --- Scores -----------------------------------------------------------------
cosineSimilarity = getCosineSimilarity(codeA, codeB)
astSimilarity = getAST_similarity(codeA, codeB)
# The AST score is weighted 0.7 against 0.3 for the cosine score.
finalScore = calculate_plagiarism_percentage(cosineSimilarity, astSimilarity, 0.3, 0.7)

# --- Report -----------------------------------------------------------------
print(f"Results: \nSimilarity prob: {finalScore:.2f}%")
print(f"    Cosine similarity: {cosineSimilarity:.2f}%")
print(f"    AST similarity probability: {astSimilarity:.2f}% \n")
if finalScore >= HIGH_SIMILARITY:
    print("Comments: \nHigh probability of plagiarism.")
elif finalScore > MODERATE_SIMILARITY:
    print("Comments: \nSlight probability of plagiarism. Human control is recommended.")
else:
    print("Comments: \nSmall probability of plagiarism.")

# Each metric is checked on its own. A `match` with guards stops at the first
# case that matches, which would hide the AST remark whenever the cosine
# score is high as well.
if cosineSimilarity >= HIGH_SIMILARITY:
    print("    The semantic of the two codes are too similar.")
if astSimilarity >= HIGH_SIMILARITY:
    print("    The patters/simbologies of the two codes are too similar.")




Results: 
Similarity prob: 49.89%
    Cosine similarity: 54.59%
    AST similarity probability: 47.88% 

Comments: 
Small probability of plagiarism.


