In [None]:
# Import necessary modules!
import os  # Module for interacting with the operating system
from sklearn.feature_extraction.text import TfidfVectorizer  # Module for text vectorization using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity  # Module for calculating cosine similarity

In [None]:
# Get a list of all text files in the current directory
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# keeps the file order (and everything derived from it) reproducible across runs.
student_files = sorted(doc for doc in os.listdir() if doc.endswith('.txt'))

In [None]:
student_files

['Arthur.txt', 'Clark.txt', 'Ben.txt']

In [None]:
# Read the contents of each student's text file
# Read every file inside a `with` block so each handle is closed as soon as its
# contents are loaded, instead of being left open until garbage collection.
student_notes = []
for _file in student_files:
    with open(_file, encoding='utf-8') as note_file:
        student_notes.append(note_file.read())

In [None]:
student_notes

['Success can mean a variety of different things. Success is, quite simply, the accomplishment of a predetermined goal. To some people it could mean making money, cultivate and develop certain basic qualities, to others it could mean keeping everyone happy, but to me, it means achieving the goals and objective I have set for myself for my life. Besides working on your goals that would lead a person towards success it is very important to push your limit every day, take charge of your life, and keep learning. This experience enables us to think smartly to solve a critical problem and achieve success. It is very important to take care of your mind which could be done by eliminating negative thoughts and negative people from your life. I think in order to call something successful, both the result and the process should be great. Without success, you, the group, your company, your goals, dreams and even entire civilizations cease to survive.',
 'Success is, quite simply, the accomplishmen

In [None]:
# Function to vectorize the text using TF-IDF
def vectorize(Text):
    """Convert a collection of documents into dense TF-IDF feature rows.

    Args:
        Text: Iterable of raw document strings, handed unchanged to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        The result of ``fit_transform`` converted with ``.toarray()``.
    """
    tfidf = TfidfVectorizer()
    weighted_terms = tfidf.fit_transform(Text)
    return weighted_terms.toarray()

In [None]:
# Function to calculate cosine similarity between two documents
def similarity(doc1, doc2):
    """Return the pairwise cosine-similarity matrix for two document vectors.

    Args:
        doc1: Feature vector of the first document.
        doc2: Feature vector of the second document.

    Returns:
        Whatever ``cosine_similarity`` returns for the two-row input
        ``[doc1, doc2]``; callers in this notebook read entry ``[0][1]`` as the
        score between the two documents.
    """
    document_pair = [doc1, doc2]
    return cosine_similarity(document_pair)

In [None]:
# Vectorize the student notes using TF-IDF
# Accumulator that check_plagiarism() fills with (file_a, file_b, score) tuples.
plagiarism_results = set()
# TF-IDF representation of the notes, as returned by vectorize().
vectors = vectorize(student_notes)
# Label each vector with the file name found at the same position in student_files.
s_vectors = [
    (file_name, tfidf_row)
    for file_name, tfidf_row in zip(student_files, vectors)
]

In [None]:
s_vectors

[('Arthur.txt',
  array([0.        , 0.04119088, 0.05304076, 0.        , 0.        ,
         0.05304076, 0.        , 0.        , 0.        , 0.28833619,
         0.        , 0.05304076, 0.08238177, 0.        , 0.06974223,
         0.06974223, 0.        , 0.06974223, 0.05304076, 0.06974223,
         0.06974223, 0.        , 0.06974223, 0.        , 0.05304076,
         0.04119088, 0.06974223, 0.05304076, 0.        , 0.        ,
         0.05304076, 0.        , 0.        , 0.        , 0.        ,
         0.15912229, 0.05304076, 0.05304076, 0.06974223, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.05304076, 0.05304076, 0.        , 0.        , 0.06974223,
         0.05304076, 0.06974223, 0.        , 0.05304076, 0.        ,
         0.05304076, 0.05304076, 0.06974223, 0.06974223, 0.05304076,
         0.        , 0.        , 0.        , 0.10608153, 0.06974223,
         0.        , 0.04119088, 0.12357265, 0.06974223, 0.        ,
         0.0530407

In [None]:
# Function to check plagiarism among the student notes
def check_plagiarism():
    """Score every pair of students in ``s_vectors`` for textual similarity.

    Reads the module-level ``s_vectors`` list of ``(file_name, text_vector)``
    tuples, calls ``similarity`` once for each unordered pair of entries, and
    adds the outcome to the module-level ``plagiarism_results`` set. Entries
    already present in that set from earlier calls are kept.

    Returns:
        set: ``plagiarism_results``. Each member added here is a tuple
        ``(first_name, second_name, score)`` where the two file names are in
        sorted order and ``score`` is entry ``[0][1]`` of the matrix returned
        by ``similarity``.
    """
    for position, (student_a, text_vector_a) in enumerate(s_vectors):
        # Pair each entry only with the entries that follow it: every unordered
        # pair is scored exactly once, and no equality lookup on tuples holding
        # arrays (which can raise on ambiguous truth values) is needed.
        for student_b, text_vector_b in s_vectors[position + 1:]:
            # Entry [0][1] is the similarity between the first and second vector.
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            # Order the names so a pair has one canonical representation.
            first_name, second_name = sorted((student_a, student_b))
            plagiarism_results.add((first_name, second_name, sim_score))
    return plagiarism_results

In [None]:
# Print the plagiarism results
# Iterate in sorted order: a set has no stable iteration order, so sorting the
# (name, name, score) tuples makes the printed report identical on every run.
for data in sorted(check_plagiarism()):
    print("Similarity data:\n", data)

Similarity data:
 ('Ben.txt', 'Clark.txt', np.float64(0.40890488440034695))
Similarity data:
 ('Arthur.txt', 'Ben.txt', np.float64(0.4595329317649596))
Similarity data:
 ('Arthur.txt', 'Clark.txt', np.float64(0.5430431121089816))
