In [2]:
import numpy as np
import pandas as pd
import os
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
text_files = [doc for doc in os.listdir('text_data') if doc.endswith('.txt')]

In [5]:
text_files[:5]

['g0pA_taska.txt',
 'g0pA_taskb.txt',
 'g0pA_taskc.txt',
 'g0pA_taskd.txt',
 'g0pA_taske.txt']

In [6]:
data=text_files[20]

In [7]:
file_path = os.path.join('text_data', data)

with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

print(file_content)

In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula. The new classes, known as derived classes, take over (or inherit) attribute and behaviour of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification. Inheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities). Inheritance is also sometimes called generalization, because the is-a relationships 

In [8]:
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

In [9]:
text_data = [lemmatize_text(open(os.path.join('text_data', _file), encoding='utf-8', errors='ignore').read())
                 for _file in text_files]

In [10]:
lemma_text=text_data[1]
lemma_text

'PageRank link analysis algorithm Google Internet search engine assign numerical weighting element hyperlinked set document World Wide Web purpose measure relative importance set Google assign numeric weighting 0 10 webpage internet PageRank denote site importance eye Google \n\n PageRank derive theoretical probability value logarithmic scale like Richter Scale PageRank particular page roughly base quantity inbound link PageRank page provide link algorithm apply collection entity reciprocal quotation reference numerical weight assign give element e call PageRank E denote PR(E \n\n know factor e.g. relevance search word page actual visit page report Google toolbar influence PageRank link base rank algorithm web page include HITS algorithm invent Jon Kleinberg teoma ask.com IBM CLEVER project TrustRank algorithm \n'

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
def vectorize(Text): 
    return TfidfVectorizer().fit_transform(Text).toarray()

In [13]:
tfidf_matrix = vectorize(text_data[1:2])

In [14]:
vectorizer = TfidfVectorizer()
vectorizer.fit(text_data[1:2]) 
tokens = vectorizer.get_feature_names_out()  

tfidf_values = tfidf_matrix[0]
for token, value in zip(tokens, tfidf_values):
    print(f"{token}->{value:.4f}")

10->0.0643
actual->0.0643
algorithm->0.3214
analysis->0.0643
apply->0.0643
ask->0.0643
assign->0.1928
base->0.1286
call->0.0643
clever->0.0643
collection->0.0643
com->0.0643
denote->0.1286
derive->0.0643
document->0.0643
element->0.1286
engine->0.0643
entity->0.0643
eye->0.0643
factor->0.0643
give->0.0643
google->0.2571
hits->0.0643
hyperlinked->0.0643
ibm->0.0643
importance->0.1286
inbound->0.0643
include->0.0643
influence->0.0643
internet->0.1286
invent->0.0643
jon->0.0643
kleinberg->0.0643
know->0.0643
like->0.0643
link->0.2571
logarithmic->0.0643
measure->0.0643
numeric->0.0643
numerical->0.1286
page->0.3214
pagerank->0.4500
particular->0.0643
pr->0.0643
probability->0.0643
project->0.0643
provide->0.0643
purpose->0.0643
quantity->0.0643
quotation->0.0643
rank->0.0643
reciprocal->0.0643
reference->0.0643
relative->0.0643
relevance->0.0643
report->0.0643
richter->0.0643
roughly->0.0643
scale->0.1286
search->0.1286
set->0.1286
site->0.0643
teoma->0.0643
theoretical->0.0643
toolbar->0

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])

In [17]:
vectors = vectorize(text_data)
s_vectors = list(zip(text_files, vectors))
plagiarism_results = set()

In [18]:
def check_plagiarism():
    for student_a, text_vector_a in s_vectors:
        for student_b, text_vector_b in s_vectors:
            if student_a != student_b:
                sim_score = similarity(text_vector_a, text_vector_b)[0][1]
                if sim_score > 0.5:
                    student_pair = sorted((student_a, student_b))
                    score = (student_pair[0], student_pair[1], sim_score)
                    plagiarism_results.add(score)
    return plagiarism_results

In [19]:
for data in check_plagiarism():
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')

g0pD_taskc.txt vs g3pA_taskc.txt: Similarity Score: 0.72
g0pA_taskd.txt vs g2pB_taskd.txt: Similarity Score: 0.51
g0pC_taskd.txt vs g3pA_taskd.txt: Similarity Score: 0.90
g0pE_taska.txt vs orig_taska.txt: Similarity Score: 0.97
g0pA_taske.txt vs g4pB_taske.txt: Similarity Score: 0.50
g1pA_taskd.txt vs g4pC_taskd.txt: Similarity Score: 0.67
g4pC_taska.txt vs g4pE_taska.txt: Similarity Score: 0.50
g0pA_taske.txt vs g2pC_taske.txt: Similarity Score: 0.63
g2pA_taskb.txt vs g3pA_taskb.txt: Similarity Score: 0.73
g0pE_taskc.txt vs g1pD_taskc.txt: Similarity Score: 0.66
g0pA_taskd.txt vs g2pC_taskd.txt: Similarity Score: 0.63
g0pB_taske.txt vs g4pB_taske.txt: Similarity Score: 0.66
g1pD_taskc.txt vs g3pC_taskc.txt: Similarity Score: 0.52
g2pB_taske.txt vs g4pD_taske.txt: Similarity Score: 0.51
g0pC_taska.txt vs g4pC_taska.txt: Similarity Score: 0.53
g1pB_taskc.txt vs g2pA_taskc.txt: Similarity Score: 0.84
g2pA_taske.txt vs g2pC_taske.txt: Similarity Score: 0.53
g1pD_taska.txt vs g2pC_taska.tx

In [20]:
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def vectorize(text_data): 
    return TfidfVectorizer().fit_transform(text_data).toarray()

def similarity(doc1, doc2): 
    return cosine_similarity([doc1, doc2])[0][1]

def check_similarity_with_input(input_file_path):
    with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        input_text = lemmatize_text(file.read())
    
    folder_path = 'text_data'
    other_texts = []
    other_files = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.txt') and filename != os.path.basename(input_file_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                text = lemmatize_text(file.read())
            other_texts.append(text)
            other_files.append(filename)
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([input_text] + other_texts)
    
    input_vector = tfidf_matrix[0]
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix[1:])[0]
    
    results = []
    for i, score in enumerate(similarity_scores):
        if score > 0.5:
            results.append((os.path.basename(input_file_path), other_files[i], score))
    
    return results

input_file = 'text_data/g2pA_taskc.txt'
results = check_similarity_with_input(input_file)
for data in results:
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


g2pA_taskc.txt vs g0pA_taskc.txt: Similarity Score: 0.90
g2pA_taskc.txt vs g0pB_taskc.txt: Similarity Score: 0.92
g2pA_taskc.txt vs g0pD_taskc.txt: Similarity Score: 0.77
g2pA_taskc.txt vs g1pA_taskc.txt: Similarity Score: 0.65
g2pA_taskc.txt vs g1pB_taskc.txt: Similarity Score: 0.84
g2pA_taskc.txt vs g1pD_taskc.txt: Similarity Score: 0.52
g2pA_taskc.txt vs g2pB_taskc.txt: Similarity Score: 0.62
g2pA_taskc.txt vs g2pC_taskc.txt: Similarity Score: 0.53
g2pA_taskc.txt vs g3pA_taskc.txt: Similarity Score: 0.87
g2pA_taskc.txt vs g3pB_taskc.txt: Similarity Score: 0.70
g2pA_taskc.txt vs g4pB_taskc.txt: Similarity Score: 0.69
g2pA_taskc.txt vs g4pE_taskc.txt: Similarity Score: 0.61
g2pA_taskc.txt vs orig_taskc.txt: Similarity Score: 0.97


In [26]:
import os
import spacy
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def vectorize_with_doc2vec(text_data):
    documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(text_data)]
    model = Doc2Vec(vector_size=100, window=3, min_count=3, workers=4, epochs=40)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def similarity(vec1, vec2):
    return cosine_similarity([vec1], [vec2])[0][0]

def l2_normalize(vector):
    norm = np.linalg.norm(vector)
    return vector / norm if norm != 0 else vector

def check_similarity_with_input(input_file_path):
    with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        input_text = lemmatize_text(file.read())
    
    folder_path = 'text_data'
    other_texts = []
    other_files = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.txt') and filename != os.path.basename(input_file_path):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                text = lemmatize_text(file.read())
            other_texts.append(text)
            other_files.append(filename)

    model = vectorize_with_doc2vec([input_text] + other_texts)
    input_vector = model.infer_vector(input_text.split())
    input_vector = l2_normalize(input_vector) 

    
    similarity_scores = []
    for text in other_texts:
        doc_vector = model.infer_vector(text.split())
        doc_vector = l2_normalize(doc_vector) 
        similarity_scores.append(similarity(input_vector, doc_vector))

    results = []
    for i, score in enumerate(similarity_scores):
        if score > 0.02:
            results.append((os.path.basename(input_file_path), other_files[i], score))
    
    return results

input_file = 'text_data/new_file.txt'
results = check_similarity_with_input(input_file)
for data in results:
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


new_file.txt vs g0pA_taska.txt: Similarity Score: 0.28
new_file.txt vs g0pA_taskb.txt: Similarity Score: 0.73
new_file.txt vs g0pA_taskc.txt: Similarity Score: 0.06
new_file.txt vs g0pA_taskd.txt: Similarity Score: 0.06
new_file.txt vs g0pA_taske.txt: Similarity Score: 0.47
new_file.txt vs g0pB_taska.txt: Similarity Score: 0.30
new_file.txt vs g0pB_taskb.txt: Similarity Score: 0.57
new_file.txt vs g0pB_taskc.txt: Similarity Score: 0.05
new_file.txt vs g0pB_taske.txt: Similarity Score: 0.25
new_file.txt vs g0pC_taska.txt: Similarity Score: 0.51
new_file.txt vs g0pC_taskb.txt: Similarity Score: 0.44
new_file.txt vs g0pC_taskd.txt: Similarity Score: 0.09
new_file.txt vs g0pC_taske.txt: Similarity Score: 0.31
new_file.txt vs g0pD_taska.txt: Similarity Score: 0.47
new_file.txt vs g0pD_taskb.txt: Similarity Score: 0.80
new_file.txt vs g0pD_taskc.txt: Similarity Score: 0.05
new_file.txt vs g0pD_taske.txt: Similarity Score: 0.48
new_file.txt vs g0pE_taska.txt: Similarity Score: 0.52
new_file.t