In [355]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.tokenize import sent_tokenize
import spacy
# NOTE(review): metrics, plt, cycle, DBSCAN, train_test_split, StandardScaler and
# AffinityPropagation are not referenced in the cells visible here -- confirm
# whether other parts of the notebook still need them.
from sklearn import metrics
import matplotlib.pyplot as plt
from itertools import cycle

from nltk.corpus import stopwords
# spaCy pipeline used to turn cleaned text into vectors; the cells below read
# the `.vector` attribute of the objects it returns.
nlp = spacy.load('en_core_web_md')
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
# WordNet lemmatizer shared by the sentence-level and paper-level cleaning steps.
lemma = nltk.WordNetLemmatizer()


# How many of each paper's most frequent words are joined into the
# pseudo-sentence that seeds that paper's initial centroid.
n_most_common = 50

In [356]:
# Load the papers; every column is cast to str and lower-cased.
papers = pd.read_csv('papers.csv').apply(lambda x: x.astype(str).str.lower())
# Keep only the title and text columns of the first 50 papers (one initial
# centroid is built per paper kept here).  The explicit .copy() makes this an
# independent frame, so adding columns to it in later cells does not raise
# pandas' SettingWithCopyWarning.
papers_reduced = papers.filter(['title', 'paper_text'], axis=1)[0:50].copy()



In [357]:
# Split every paper's text into a list of sentences with NLTK's sentence tokenizer.
papers_reduced['paper_text_sentences'] = papers_reduced['paper_text'].apply(sent_tokenize)


In [358]:
# English stop words, looked up once.  Calling stopwords.words('english') inside
# the filtering comprehension repeats the lookup for every token, and membership
# tests against a list are linear; a frozenset is built a single time instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def process_sent(sent):
    """Clean a single sentence and run it through the spaCy pipeline.

    The sentence is tokenized with NLTK; only tokens that are not English
    stop words, are purely alphabetic and are longer than 5 characters are
    kept.  The surviving tokens are lemmatized with the WordNet lemmatizer,
    joined back into one space-separated string and passed to ``nlp``.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    The object produced by ``nlp`` for the cleaned sentence.  Callers read
    its ``.vector`` attribute to obtain the sentence vector.
    """
    tokens = nltk.word_tokenize(sent)
    kept = [
        word for word in tokens
        if word not in _ENGLISH_STOPWORDS and word.isalpha() and len(word) > 5
    ]
    cleaned_text = ' '.join(lemma.lemmatize(word) for word in kept)
    # Run the spaCy pipeline exactly once per sentence.
    return nlp(cleaned_text)
# Collect one record per sentence: the title of the paper it comes from, the
# sentence vector, and the processed sentence object itself.
sentence_records = []
for row in papers_reduced.itertuples():
    for sent in row.paper_text_sentences:
        vect = process_sent(sent)
        sentence_records.append({
            'paper_name': row.title,
            'sentence_vectors': vect.vector,
            'sentences': vect,
        })

# Build the frame in a single call.  Growing a DataFrame with .append() inside
# the loop copies the whole frame on every iteration, is no longer available in
# pandas 2.x, and leaves every row with the same index label; constructing it
# once from the record list yields a unique 0..n-1 index.
papers_dataframe = pd.DataFrame(
    sentence_records,
    columns=['paper_name', 'sentence_vectors', 'sentences'],
)
    


In [359]:
# For the initial centroids, build one pseudo-sentence per paper out of that
# paper's n_most_common most frequent words.
all_papers = papers_reduced["paper_text"]
initial_centroids_sent = []
# Look the stop-word list up once, instead of once per token inside the loop.
english_stopwords = set(stopwords.words('english'))
# Tokenize each paper, keep alphabetic non-stop-words longer than 7 characters,
# lemmatize them, and join the n_most_common most frequent lemmas into a sentence.
for p in all_papers:
    tokenized = nltk.word_tokenize(p)
    cleaned = [
        word for word in tokenized
        if word not in english_stopwords and word.isalpha() and len(word) > 7
    ]
    lemmatized = [lemma.lemmatize(word) for word in cleaned]
    fd = nltk.FreqDist(lemmatized)
    most_common_list = [word for word, _count in fd.most_common(n_most_common)]
    initial_centroids_sent.append(' '.join(most_common_list))
# Vectorize the pseudo-sentences; each resulting row is one initial centroid.
vectorized_centroids = np.asarray([nlp(sentence).vector for sentence in initial_centroids_sent])






In [360]:
# Cluster all sentence vectors, seeding k-means with the per-paper centroids.
# The number of clusters is taken from the centroid array itself, because an
# explicit `init` array must contain exactly n_clusters rows.
kmeans = KMeans(
    n_clusters=len(vectorized_centroids),
    max_iter=300,
    init=vectorized_centroids,
    n_init=1,
)
sentence_vectors = papers_dataframe['sentence_vectors'].tolist()
# Fit once and take the labels from that same fit (previously the model was
# fitted with fit() and then fitted a second time by fit_predict()).
x = kmeans.fit_predict(sentence_vectors)
# `km` is kept as a name for the fitted estimator for any code that refers to it.
km = kmeans
papers_dataframe["cluster"] = x




In [361]:
# Unique paper titles in order of first appearance.  dict.fromkeys() removes
# duplicates like set() does, but keeps a stable order, so the per-paper
# results computed from this list are the same on every kernel restart.
paper_names = list(dict.fromkeys(papers_dataframe['paper_name']))

In [362]:
import statistics

# Count, for every paper, how many distinct clusters its sentences were assigned to.
names_number_of_clusters = []
for n in paper_names:
    # Select rows and column in a single .loc call rather than chained indexing.
    names_clusters = papers_dataframe.loc[papers_dataframe['paper_name'] == n, 'cluster']
    # set() de-duplicates the cluster labels using built-ins only, so this cell
    # no longer relies on the `collections` module, which is never imported.
    names_number_of_clusters.append(len(set(names_clusters)))


# Number of distinct clusters per paper: mean and mode.
mean_names_number_of_clusters = statistics.mean(names_number_of_clusters)
print("mean: "+str(mean_names_number_of_clusters))
mode_names_number_of_clusters = statistics.mode(names_number_of_clusters)
print("mode: "+str(mode_names_number_of_clusters))

mean: 31.2
mode: 33


As we can see, the sentences of a single paper are spread over 31.2 different clusters on average. Ideally, we would like the sentences of one paper to appear in only 1-2 clusters, which leads to the conclusion that this method isn't viable for plagiarism detection in a set of papers. Ideas for improvement can be found in the README file.