In [275]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import collections
from itertools import cycle

import matplotlib.pyplot as plt
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

nlp = spacy.load('en_core_web_md')

%matplotlib inline

In [276]:
# Load the corpus; every column is cast to str and lower-cased.
papers = pd.read_csv('papers.csv').apply(lambda x: x.astype(str).str.lower())
# Keep only the title and text columns of the first 50 papers.
# The explicit .copy() makes this an independent frame: a later cell adds a
# column to it, which on a bare row slice triggers SettingWithCopyWarning.
papers_reduced = papers.filter(['title', 'paper_text'], axis=1).iloc[:50].copy()



In [277]:
# Split each paper's full text into a list of sentences with NLTK.
papers_reduced['paper_text_sentences'] = papers_reduced['paper_text'].apply(sent_tokenize)


In [278]:
lemma = nltk.WordNetLemmatizer()

# Built once so process_sent does not call stopwords.words('english') again
# for every single token; membership semantics are unchanged.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def process_sent(sent):
    """Clean one sentence and run it through the spaCy pipeline.

    The sentence is word-tokenized with NLTK; tokens are kept only if they are
    not English stop words, are purely alphabetic, and are longer than five
    characters. The surviving tokens are lemmatized with the module-level
    WordNet lemmatizer and joined back into a single space-separated string.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    The object returned by ``nlp(...)`` for the cleaned string (callers read
    its ``.vector`` attribute). If no token survives the filter, the pipeline
    is run on the empty string.
    """
    tokens = nltk.word_tokenize(sent)
    kept = [
        lemma.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS and token.isalpha() and len(token) > 5
    ]
    # Run the (expensive) pipeline exactly once; the original invoked it twice
    # and discarded the first result.
    return nlp(' '.join(kept))
# Build one row per sentence: (paper title, sentence vector, processed sentence).
# Rows are collected in a plain list and the frame is created once at the end.
# DataFrame.append inside the loop copied the whole frame on every iteration
# (quadratic cost) and no longer exists in pandas >= 2.0. Building the frame in
# one go also yields a unique 0..n-1 index instead of every row carrying the
# index label 0.
sentence_rows = []

#iterate through all the papers
for row in papers_reduced.itertuples():
    for sent in row.paper_text_sentences:
        vect = process_sent(sent)
        sentence_rows.append([row.title, vect.vector, vect])

papers_dataframe = pd.DataFrame(sentence_rows, columns=['paper_name', 'sentence_vectors', 'sentences'])
    


In [286]:
# For the initial centroids we build one pseudo-sentence per paper from its
# most frequent words (the code has always used the top 50, not 10).
# Only alphabetic, non-stop-word tokens longer than 7 characters are counted,
# after lemmatization.
TOP_WORDS_PER_PAPER = 50

# Built once instead of calling stopwords.words('english') for every token.
english_stopwords = frozenset(stopwords.words('english'))

all_papers = papers_reduced["paper_text"]
initial_centroids_sent = []
for p in all_papers:
    keywords = [
        lemma.lemmatize(word)
        for word in nltk.word_tokenize(p)
        if word not in english_stopwords and word.isalpha() and len(word) > 7
    ]
    top_words = [word for word, _count in nltk.FreqDist(keywords).most_common(TOP_WORDS_PER_PAPER)]
    initial_centroids_sent.append(' '.join(top_words))

# One row per paper: the .vector of the pipeline output for its pseudo-sentence.
vectorized_centroids = np.asarray([nlp(text).vector for text in initial_centroids_sent])






In [287]:
# Sanity check: scalar type of the initial centroid vectors.
# This inspects vectorized_centroids (defined in the cell above); the name
# `initial_centroids` is only created in a later cell, so referencing it here
# raises NameError when the notebook is run top to bottom on a fresh kernel.
print(type(vectorized_centroids[0][0]))

<class 'numpy.float32'>


In [306]:
# Paper titles in order of first appearance. A list is used instead of a set
# so the iteration order (and therefore the value `name` is left bound to
# after the loop) is the same on every kernel start.
paper_names = list(dict.fromkeys(papers_dataframe['paper_name']))

# Middle sentence vector of every paper.
# NOTE(review): this array is NOT what KMeans is seeded with below (that is
# vectorized_centroids); it is kept because other cells may still read it.
initial_centroids = []
for name in paper_names:
    sentences = papers_dataframe.loc[papers_dataframe['paper_name'] == name, 'sentence_vectors'].tolist()
    initial_centroids.append(sentences[len(sentences) // 2])
initial_centroids = np.asarray(initial_centroids)

# One row per sentence vector.
sentence_matrix = np.asarray(papers_dataframe['sentence_vectors'].tolist())

# n_clusters must equal the number of rows of the explicit `init` array, so it
# is derived from it rather than hard-coded; n_init=1 because the start point
# is fixed.
kmeans = KMeans(n_clusters=len(vectorized_centroids), max_iter=300, init=vectorized_centroids, n_init=1)
# fit_predict fits the model and returns the labels in one pass; the original
# fitted the identical model twice (fit, then fit_predict).
x = kmeans.fit_predict(sentence_matrix)
km = kmeans  # fit() returns the estimator itself, so `km` keeps its old meaning
papers_dataframe["Cluster"] = x




In [307]:
# Titles of every paper that has at least one sentence assigned to cluster 2.
print(set(papers_dataframe.loc[papers_dataframe['Cluster'] == 2, 'paper_name']))

{'a model of transparent motion and non-transparent motion aftereffects', 'the gamma mlp for speech phoneme recognition', 'a dynamical model of context dependencies for the vestibulo-ocular reflex', 'a computational model of prefrontal cortex function', 'iceg morphology classification using an analogue vlsi neural network', 'ocular dominance and patterned lateral connections in a self-organizing model of the primary visual cortex', 'storing covariance by the associative long-term potentiation and depression of synaptic strengths in the hippocampus', 'associative decorrelation dynamics: a theory of self-organization and optimization in feedback networks', 'pulsestream synapses with non-volatile analogue amorphous-silicon memories', 'a mean field theory of layer iv of visual cortex and its application to artificial neural networks', 'quadratic-type lyapunov functions for competitive neural networks with different time-scales', 'exponentially many local minima for single neurons', 'correl

In [308]:
# Cluster label of each sentence of the paper currently bound to `name`
# (the variable left over from the `for name in paper_names` loop).
print(papers_dataframe.loc[papers_dataframe['paper_name'] == name, 'Cluster'].tolist())

[15, 15, 46, 15, 46, 48, 15, 22, 1, 46, 12, 39, 39, 36, 48, 32, 22, 15, 15, 4, 23, 47, 36, 47, 11, 44, 23, 10, 22, 43, 43, 12, 44, 20, 19, 38, 20, 47, 32, 39, 22, 44, 15, 24, 4, 9, 36, 14, 8, 46, 25, 1, 39, 22, 39, 22, 39, 19, 22, 44, 33, 12, 44, 22, 46, 8, 14, 44, 8, 32, 19, 15, 24, 15, 46, 46, 34, 33, 9, 9, 44, 22, 12, 33, 44, 46, 35, 12, 41, 15, 24, 23, 44, 19, 32, 44, 46, 5, 15, 5, 22, 22, 44, 44, 46, 46, 22, 14, 47, 47, 19, 3, 12, 22, 12, 46, 12, 12, 22, 46, 17, 31, 19, 15, 15, 33, 11, 15, 36, 15, 10, 22, 15, 24, 35, 15, 19, 33, 35, 32, 22, 44, 46, 33, 12, 1, 17, 38, 12, 22, 31, 46, 46, 15, 31, 23, 3, 44, 46, 29, 26, 45, 22, 18, 1, 29, 45]


In [309]:
# Cluster labels of the sentences of paper `name`, read from the frame.
# This replaces a hand-pasted copy of a previous cell's printed output, which
# silently goes stale whenever the clustering is re-run.
ll = papers_dataframe.loc[papers_dataframe['paper_name'] == name, 'Cluster'].tolist()

In [310]:
# Tally how many entries of `ll` fall into each cluster label.
# Requires `import collections` in the notebook's import cell.
print(collections.Counter(ll))

Counter({15: 18, 22: 18, 46: 17, 44: 14, 12: 11, 19: 7, 39: 6, 33: 6, 32: 5, 47: 5, 1: 4, 36: 4, 23: 4, 24: 4, 9: 3, 14: 3, 8: 3, 35: 3, 31: 3, 48: 2, 4: 2, 11: 2, 10: 2, 43: 2, 20: 2, 38: 2, 5: 2, 3: 2, 17: 2, 29: 2, 45: 2, 25: 1, 34: 1, 41: 1, 26: 1, 18: 1})


In [311]:
# Number of distinct papers with at least one sentence assigned to cluster 15.
print(len(set(papers_dataframe.loc[papers_dataframe['Cluster'] == 15, 'paper_name'])))

45
