## Using AST >> Syntax Comparison

In [7]:
import ast
from sentence_transformers import SentenceTransformer, util

In [8]:
def get_ast_structure(code_str):
    """Return the ``ast.dump`` text for ``code_str``.

    Args:
        code_str: Python source code to parse.

    Returns:
        The string produced by ``ast.dump`` for the parsed module, or
        ``None`` when ``ast.parse`` raises ``SyntaxError``.
    """
    try:
        parsed = ast.parse(code_str)
    except SyntaxError:
        # Unparsable input is reported as None rather than raising.
        return None
    return ast.dump(parsed)

In [9]:
# Two one-line functions whose source differs only in the identifiers used.
code1 = "def add(a, b): return a + b"
code2 = "def sum(x, y): return x + y"

# Print the AST dump of each snippet, one per line.
for snippet in (code1, code2):
    print(get_ast_structure(snippet))

Module(body=[FunctionDef(name='add', args=arguments(posonlyargs=[], args=[arg(arg='a', annotation=None, type_comment=None), arg(arg='b', annotation=None, type_comment=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=[Return(value=BinOp(left=Name(id='a', ctx=Load()), op=Add(), right=Name(id='b', ctx=Load())))], decorator_list=[], returns=None, type_comment=None)], type_ignores=[])
Module(body=[FunctionDef(name='sum', args=arguments(posonlyargs=[], args=[arg(arg='x', annotation=None, type_comment=None), arg(arg='y', annotation=None, type_comment=None)], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=[Return(value=BinOp(left=Name(id='x', ctx=Load()), op=Add(), right=Name(id='y', ctx=Load())))], decorator_list=[], returns=None, type_comment=None)], type_ignores=[])


In [10]:
# Load the sentence-embedding model used for the text-level comparison.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the raw source text of each snippet as a tensor.
vec1 = model.encode(code1, convert_to_tensor=True)
vec2 = model.encode(code2, convert_to_tensor=True)

# util.cos_sim is the primary API; util.pytorch_cos_sim is a legacy alias for it.
similarity = util.cos_sim(vec1, vec2)
print(f"Similarity Score: {similarity.item()*100:.2f}%")

Similarity Score: 76.04%


In [13]:
# Sample 1: a loop-based function that prints each value and has no return statement.
code1 = """def fibonacci(n):
    a, b = 0, 1
    for _ in range(n):
        print(a, end=" ")
        a, b = b, a + b
"""
# Sample 2: a recursive function that returns n for n <= 1, else F(n - 1) + F(n - 2).
code2 = """def F(n):
    if n <= 1:
        return n
    else:
        return F(n - 1) + F(n - 2)
"""

In [14]:
# Reuse the `model` loaded in the earlier embedding cell; constructing the same
# SentenceTransformer again only repeats the model load.
vec1 = model.encode(code1, convert_to_tensor=True)
vec2 = model.encode(code2, convert_to_tensor=True)

# util.cos_sim is the primary API; util.pytorch_cos_sim is a legacy alias for it.
similarity = util.cos_sim(vec1, vec2)
print(f"Similarity Score: {similarity.item()*100:.2f}%")

Similarity Score: 64.33%


| Similarity % | Meaning                                         | Action              |
| ------------ | ----------------------------------------------- | ------------------- |
| 0–30%        | Likely original                                 | No issue            |
| 30–70%       | Some overlap (could be accidental/common logic) | Review manually     |
| 70–90%       | Likely copied or reused code                    | Investigate closely |
| 90–100%      | Almost identical                                | Strong plagiarism   |


## AST Based >> Logic Comparison

In [15]:
# Re-import and re-run due to kernel reset

import ast
import difflib

# Define the two code samples: both functions are named `fibonacci`, but code1
# is a loop that prints values while code2 is a recursive function that returns one.
code1 = """def fibonacci(n):
    a, b = 0, 1
    for _ in range(n):
        print(a, end=" ")
        a, b = b, a + b
"""

code2 = """def fibonacci(n):
    if n <= 1:
        return n
    else:
        return fibonacci(n - 1) + fibonacci(n - 2)
"""

# Parse into AST and dump its structure as a compact string
def get_ast_structure(code):
    """Return a compact ``ast.dump`` string for ``code``.

    The dump is produced with ``annotate_fields=False`` and
    ``include_attributes=False``.

    Args:
        code: Python source code to parse.

    Returns:
        The dump string, or ``""`` when ``ast.parse`` raises ``SyntaxError``.
    """
    try:
        module = ast.parse(code)
    except SyntaxError:
        # An empty string keeps the result usable as a SequenceMatcher input.
        return ""
    return ast.dump(module, annotate_fields=False, include_attributes=False)

# Dump both samples, then compare the two dump strings character by character.
ast1, ast2 = get_ast_structure(code1), get_ast_structure(code2)

# ratio() is a fraction; scale it to a percentage.
matcher = difflib.SequenceMatcher(None, ast1, ast2)
similarity = matcher.ratio() * 100

similarity


26.877470355731226

In [16]:
# Two one-line functions that differ only in the function and parameter names.
code1 = "def add(a, b): return a + b"
code2 = "def sum(x, y): return x + y"

In [17]:
# Structure dumps for the current pair of samples.
ast1 = get_ast_structure(code1)
ast2 = get_ast_structure(code2)

# Use difflib to measure structure-level similarity, expressed as a percentage
dump_matcher = difflib.SequenceMatcher(None, ast1, ast2)
similarity = dump_matcher.ratio() * 100

similarity

96.41025641025641

In [18]:
# Sample 1: a loop-based function that prints each value.
code1 = """def fibonacci_iterative(n):
    a, b = 0, 1
    for _ in range(n):
        print(a, end=" ")
        a, b = b, a + b
"""
# Sample 2: a recursive function followed by a module-level loop that calls it
# for i in range(10) and prints each result.
code2 = """def fibonacci_recursive(n):
    if n <= 1:
        return n
    else:
        return fibonacci_recursive(n-1) + fibonacci_recursive(n-2)

for i in range(10):
    print(fibonacci_recursive(i), end=" ") # Prints the first 10 Fibonacci numbers
"""

In [19]:
# Dump each sample's AST to a string.
ast1 = get_ast_structure(code1)
ast2 = get_ast_structure(code2)

# Score the two dump strings with difflib and convert the ratio to a percentage.
similarity = 100 * difflib.SequenceMatcher(a=ast1, b=ast2).ratio()

similarity

30.59006211180124

## GitHub Code Search

In [None]:
import os

import requests

# Read the token from the environment rather than hardcoding it in the notebook
# (generate one at https://github.com/settings/tokens, no scopes needed).
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable before running this cell."
    )

# Search query (can be a code fragment or function name)
query = "return fibonacci(n-1) + fibonacci(n-2)"

# GitHub Search API endpoint. The query is passed through `params` so that
# requests URL-encodes it; in a hand-built URL the literal "+" inside the code
# fragment would be read as a space.
url = "https://api.github.com/search/code"
params = {"q": f"{query} in:file language:python"}

# Headers with auth token
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Send request; the timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, params=params, headers=headers, timeout=30)

# Parse results
if response.status_code == 200:
    results = response.json()
    total = results['total_count']
    print(f"🔍 Found {total} matching code file(s) on GitHub:")

    for item in results['items'][:5]:  # Show top 5
        print(f"- {item['name']} at {item['html_url']}")
else:
    print(f"❌ GitHub API error: {response.status_code}")


🔍 Found 1336 matching code file(s) on GitHub:
- recurrsion.py at https://github.com/calistus-igwilo/python/blob/796f65dd6ebf29434481f7ddfce793910c42d8ca/recurrsion.py
- template.py at https://github.com/ArmelRandy/Self-instruct/blob/c58db251b785136e9b54ee3c915426d68d788e78/template.py
- d12_fib.py at https://github.com/19ceng/ceng104pro/blob/ebbb930fec94fc4a8c58b30a228bee312979a409/src/d12_fib.py
- bench.py at https://github.com/moraes/webapp-improved/blob/0e6218dcd3ba2e0ba0c6a6c87ba4fbe1eab287c4/lib/appengine-ndb-experiment/bench.py
- FiBoNaCci.py at https://github.com/technojam/Hacktoberfest-2020-Baby/blob/d1d757d8e23a8ed0b212c2d3a757c23f97eb6c6e/FiBoNaCci.py
