In [1]:
import zipfile
import pandas as pd
import os

# Read the CSV file into a pandas DataFrame
papers = pd.read_csv("train.csv")

# Print head
papers.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [2]:
# Remove the columns
papers = papers["ABSTRACT"]

# sample only 100 papers
papers = papers.sample(100)

# Print out the first rows of papers
papers.head()

12573      The rapid development of deep learning, a fa...
1500       We consider a theory of a two-component Dira...
20509      Adaptive optic (AO) systems delivering high ...
1660       Inverse problems in statistical physics are ...
10326      Observations of nine transits of WASP-107 du...
Name: ABSTRACT, dtype: object

In [4]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data = papers.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['the', 'rapid', 'development', 'of', 'deep', 'learning', 'family', 'of', 'machine', 'learning', 'techniques', 'has', 'spurred', 'much', 'interest', 'in', 'its', 'application', 'to', 'medical', 'imaging', 'problems', 'here', 'we', 'develop', 'deep', 'learning', 'algorithm', 'that', 'can']


In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [6]:
# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

[nltk_data] Downloading package stopwords to C:\Users\Oshadha
[nltk_data]     Abeyrathne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#!python -m spacy download en_core_web_sm
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0][:30])

['rapid', 'development', 'deep', 'learn', 'family', 'machine', 'learn', 'technique', 'spur', 'much', 'interest', 'application', 'medical', 'imaging', 'problem', 'develop', 'deep', 'learn', 'accurately', 'detect', 'breast', 'cancer', 'screening', 'mammogram', 'use', 'end', 'end', 'training', 'approach', 'efficiently']


In [8]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 4), (5, 1), (6, 3), (7, 2), (8, 4), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 2), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 2), (25, 3), (26, 1), (27, 1), (28, 1), (29, 1)]


In [9]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [10]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.009*"overrightarrow" + 0.009*"model" + 0.008*"show" + 0.008*"theory" + '
  '0.007*"field" + 0.007*"network" + 0.007*"factor" + 0.007*"formula" + '
  '0.006*"space" + 0.006*"paper"'),
 (1,
  '0.020*"error" + 0.016*"field" + 0.012*"compute" + 0.012*"relative" + '
  '0.009*"bound" + 0.008*"magnetic" + 0.007*"flow" + 0.007*"function" + '
  '0.007*"system" + 0.007*"convection"'),
 (2,
  '0.009*"transmission" + 0.008*"group" + 0.008*"image" + 0.008*"community" + '
  '0.008*"even" + 0.008*"propose" + 0.007*"model" + 0.007*"regression" + '
  '0.007*"order" + 0.007*"use"'),
 (3,
  '0.014*"mesh" + 0.008*"method" + 0.008*"graph" + 0.008*"brain" + '
  '0.007*"propose" + 0.007*"use" + 0.007*"base" + 0.007*"communication" + '
  '0.007*"knowledge" + 0.006*"demonstrate"'),
 (4,
  '0.009*"contact" + 0.009*"film" + 0.008*"function" + 0.007*"model" + '
  '0.007*"uplink" + 0.007*"ptse" + 0.006*"layer" + 0.006*"base" + '
  '0.006*"property" + 0.006*"field"'),
 (5,
  '0.010*"model" + 0.009*"proble

In [14]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.34401556544759415
