# In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# Load all regular .txt files in the current directory.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# report reproducible, and skip directories that merely end in ".txt"
# (open() would fail on them).
student_files = sorted(
    doc for doc in os.listdir() if doc.endswith('.txt') and os.path.isfile(doc)
)
# Or if you only want specific files, list them manually:
# student_files = ["example1.txt", "example2.txt"]


def _read_text(path):
    """Return the UTF-8 decoded contents of *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the contents have been read instead of being left to the garbage collector.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Read contents of all files (same order as student_files)
student_notes = [_read_text(_file) for _file in student_files]

# Convert to TF-IDF vectors
def vectorize(texts):
    """Return a dense TF-IDF matrix with one row per entry in *texts*."""
    tfidf_model = TfidfVectorizer()
    sparse_weights = tfidf_model.fit_transform(texts)
    return sparse_weights.toarray()

# Only vectorize when a comparison is actually possible. With no documents
# TfidfVectorizer raises ValueError (empty vocabulary), which would abort the
# script before the "need at least 2 files" message further down can be shown.
if len(student_files) >= 2:
    vectors = vectorize(student_notes)
    # Pair each file name with its TF-IDF row so results can be labelled.
    s_vectors = list(zip(student_files, vectors))
else:
    vectors = []
    s_vectors = []

def check_plagiarism():
    """Score every unique pair of documents in the module-level ``s_vectors``.

    Returns:
        A list of ``(name_a, name_b, similarity)`` tuples, one per unordered
        pair, where ``similarity`` is the cosine similarity of the two vectors.
    """
    return [
        (left_name, right_name, cosine_similarity([left_vec], [right_vec])[0][0])
        for (left_name, left_vec), (right_name, right_vec) in combinations(s_vectors, 2)
    ]

# Print one line per compared pair; a comparison needs at least two documents.
if len(student_files) >= 2:
    for left_name, right_name, similarity in check_plagiarism():
        print(f"{left_name} <--> {right_name} : {similarity:.4f}")
else:
    print("⚠️ Need at least 2 text files to check for plagiarism.")


# Output: example1.txt <--> example2.txt : 1.0000
