In [1]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
def preprocess_code(code):
    code = re.sub(r'\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)
    code = re.sub(r'\"\"\".*?\"\"\"', '', code, flags=re.DOTALL) 
    code = re.sub(r'#.*', '', code) 
    
    code = re.sub(r'^\s*(import|from)\s+[^\n]+', '', code, flags=re.MULTILINE)
    
    return code

In [3]:
student_files = [doc for doc in os.listdir('code_data') if doc.endswith('.py')]

In [4]:
student_code = [preprocess_code(open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore').read())
                for _file in student_files]


In [5]:
def vectorize(Text): 
    return TfidfVectorizer(token_pattern=r'\b\w+\b').fit_transform(Text).toarray()


In [6]:
def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])


In [7]:
vectors = vectorize(student_code)
s_vectors = list(zip(student_files, vectors))
plagiarism_results = set()

In [8]:
def check_plagiarism():
    for student_a, text_vector_a in s_vectors:
        for student_b, text_vector_b in s_vectors:
            if student_a != student_b:
                sim_score = similarity(text_vector_a, text_vector_b)[0][1]
                if sim_score > 0.5:
                    student_pair = sorted((student_a, student_b))
                    score = (student_pair[0], student_pair[1], sim_score)
                    plagiarism_results.add(score)
    return plagiarism_results

In [9]:
for data in check_plagiarism():
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


a34.py vs a45.py: Similarity Score: 0.98
a13.py vs a41.py: Similarity Score: 0.79
a3.py vs a5.py: Similarity Score: 0.56
a5.py vs a7.py: Similarity Score: 0.73
a12.py vs a30.py: Similarity Score: 0.77
a2.py vs a25.py: Similarity Score: 0.86
a10.py vs a7.py: Similarity Score: 0.88
a10.py vs a5.py: Similarity Score: 0.71
a14.py vs a26.py: Similarity Score: 0.68
a43.py vs a47.py: Similarity Score: 0.73
a10.py vs a3.py: Similarity Score: 0.65
a4.py vs a5.py: Similarity Score: 0.53
a3.py vs a7.py: Similarity Score: 0.64
a36.py vs a44.py: Similarity Score: 0.57


In [3]:
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_code(code):
    code = re.sub(r'\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)
    code = re.sub(r'\"\"\".*?\"\"\"', '', code, flags=re.DOTALL) 
    code = re.sub(r'#.*', '', code) 
    code = re.sub(r'^\s*(import|from)\s+[^\n]+', '', code, flags=re.MULTILINE)
    return code

def vectorize(text): 
    return TfidfVectorizer(token_pattern=r'\b\w+\b').fit_transform(text).toarray()

def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])[0][1]

def check_similarity_with_input(input_file_path):
    input_code = preprocess_code(open(input_file_path, encoding='utf-8', errors='ignore').read())
    
    student_files = [doc for doc in os.listdir('code_data') if doc.endswith('.py') and doc != os.path.basename(input_file_path)]
    student_code = [preprocess_code(open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore').read())
                    for _file in student_files]
    
    all_texts = [input_code] + student_code
    vectors = vectorize(all_texts)
    
    input_vector = vectors[0]
    plagiarism_results = []
    for i, vector in enumerate(vectors[1:], 1):
        sim_score = similarity(input_vector, vector)
        if sim_score > 0.5:
            plagiarism_results.append((os.path.basename(input_file_path), student_files[i-1], sim_score))
    
    return plagiarism_results

input_file = 'code_data/a5.py'
results = check_similarity_with_input(input_file)
for data in results:
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


a5.py vs a10.py: Similarity Score: 0.71
a5.py vs a3.py: Similarity Score: 0.56
a5.py vs a4.py: Similarity Score: 0.53
a5.py vs a7.py: Similarity Score: 0.73


In [2]:
import os
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def preprocess_code(code):
    code = re.sub(r'\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)
    code = re.sub(r'\"\"\".*?\"\"\"', '', code, flags=re.DOTALL) 
    code = re.sub(r'#.*', '', code) 
    code = re.sub(r'^\s*(import|from)\s+[^\n]+', '', code, flags=re.MULTILINE)
    return code

def train_doc2vec_model(texts):
    tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]
    model = Doc2Vec(vector_size=100, window=5, min_count=1, epochs=100, dm=1)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def check_similarity_with_input(input_file_path):
    input_code = preprocess_code(open(input_file_path, encoding='utf-8', errors='ignore').read())
    
    student_files = [doc for doc in os.listdir('code_data') if doc.endswith('.py') and doc != os.path.basename(input_file_path)]
    student_code = [preprocess_code(open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore').read())
                    for _file in student_files]
    
    all_texts = [input_code] + student_code
    model = train_doc2vec_model(all_texts)
    
    input_vector = model.infer_vector(input_code.split())
    plagiarism_results = []
    
    for i, code in enumerate(student_code):
        sim_score = model.similarity_unseen_docs(input_code.split(), code.split())
        if sim_score > 0.5:
            plagiarism_results.append((os.path.basename(input_file_path), student_files[i], sim_score))
    
    return plagiarism_results

input_file = 'code_data/a5.py'
results = check_similarity_with_input(input_file)
for data in results:
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


a5.py vs a1.py: Similarity Score: 0.83
a5.py vs a10.py: Similarity Score: 0.87
a5.py vs a2.py: Similarity Score: 0.72
a5.py vs a3.py: Similarity Score: 0.79
a5.py vs a4.py: Similarity Score: 0.62
a5.py vs a47.py: Similarity Score: 0.51
a5.py vs a6.py: Similarity Score: 0.66
a5.py vs a7.py: Similarity Score: 0.88
a5.py vs a8.py: Similarity Score: 0.62


In [5]:
import os
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from difflib import SequenceMatcher

def preprocess_code(code):
    code = re.sub(r'\'\'\'.*?\'\'\'', '', code, flags=re.DOTALL)
    code = re.sub(r'\"\"\".*?\"\"\"', '', code, flags=re.DOTALL) 
    code = re.sub(r'#.*', '', code) 
    code = re.sub(r'^\s*(import|from)\s+[^\n]+', '', code, flags=re.MULTILINE)
    return code

def train_doc2vec_model(texts):
    tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]
    model = Doc2Vec(vector_size=100, window=5, min_count=1, epochs=100, dm=1)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def get_matching_blocks(text1, text2):
    s = SequenceMatcher(None, text1, text2)
    matching_blocks = s.get_matching_blocks()
    matches = []
    for match in matching_blocks:
        if match.size > 0:
            matches.append(text1[match.a: match.a + match.size])
    return matches

def check_similarity_with_input(input_file_path):
    input_code = preprocess_code(open(input_file_path, encoding='utf-8', errors='ignore').read())
    
    student_files = [doc for doc in os.listdir('code_data') if doc.endswith('.py') and doc != os.path.basename(input_file_path)]
    student_code = [preprocess_code(open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore').read())
                    for _file in student_files]
    
    all_texts = [input_code] + student_code
    model = train_doc2vec_model(all_texts)
    
    input_vector = model.infer_vector(input_code.split())
    plagiarism_results = []
    
    for i, code in enumerate(student_code):
        sim_score = model.similarity_unseen_docs(input_code.split(), code.split())
        if sim_score > 0.5:
            plagiarism_results.append((os.path.basename(input_file_path), student_files[i], sim_score, code))
    
    return plagiarism_results

def display_matching_sections(input_code, code):
    matches = get_matching_blocks(input_code, code)
    print("Matching Sections:")
    for match in matches:
        print(f"- {match}")

input_file = 'code_data/a5.py'
results = check_similarity_with_input(input_file)
print("Similarity Scores for files with > 50% similarity:")
for data in results:
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')

file_to_check = input("\nEnter the file name to view matched lines (e.g., 'file.py'): ")

for data in results:
    if data[1] == file_to_check:
        print(f'\nMatching lines between {data[0]} and {data[1]}:')
        display_matching_sections(preprocess_code(open(input_file, encoding='utf-8', errors='ignore').read()), data[3])
        break
else:
    print("File not found in similarity results.")


Similarity Scores for files with > 50% similarity:
a5.py vs a1.py: Similarity Score: 0.83
a5.py vs a10.py: Similarity Score: 0.87
a5.py vs a2.py: Similarity Score: 0.72
a5.py vs a3.py: Similarity Score: 0.79
a5.py vs a4.py: Similarity Score: 0.62
a5.py vs a47.py: Similarity Score: 0.51
a5.py vs a6.py: Similarity Score: 0.66
a5.py vs a7.py: Similarity Score: 0.88
a5.py vs a8.py: Similarity Score: 0.62



Enter the file name to view matched lines (e.g., 'file.py'):  a8.py



Matching lines between a5.py and a8.py:
Matching Sections:
- def 
- p
- h
- s
- : {
- }")

