In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nips-papers/paper_authors.csv
/kaggle/input/nips-papers/papers.csv
/kaggle/input/nips-papers/authors.csv
/kaggle/input/nips-papers/database.sqlite


In [2]:
# load the dataset
df = pd.read_csv('/kaggle/input/nips-papers/papers.csv')
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
id            7241 non-null int64
year          7241 non-null int64
title         7241 non-null object
event_type    2422 non-null object
pdf_name      7241 non-null object
abstract      7241 non-null object
paper_text    7241 non-null object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [4]:
print("{} abstracts are missing".format(df[df['abstract']=='Abstract Missing']['abstract'].count()))

3317 abstracts are missing


In [5]:
import pprint
sample = 941
pprint.pprint("TITLE:{}".format(df['title'][sample]))
pprint.pprint("ABSTRACT:{}".format(df['abstract'][sample]))
pprint.pprint("FULL TEXT:{}".format(df['paper_text'][sample][:1000]))

'TITLE:Algorithms for Non-negative Matrix Factorization'
('ABSTRACT:Non-negative matrix factorization (NMF) has previously been shown '
 'to \r\n'
 'be a useful decomposition for multivariate data. Two different multi- \r\n'
 'plicative algorithms for NMF are analyzed. They differ only slightly in \r\n'
 'the multiplicative factor used in the update rules. One algorithm can be \r\n'
 'shown to minimize the conventional least squares error while the other \r\n'
 'minimizes the generalized Kullback-Leibler divergence. The monotonic \r\n'
 'convergence of both algorithms can be proven using an auxiliary func- \r\n'
 'tion analogous to that used for proving convergence of the Expectation- \r\n'
 'Maximization algorithm. The algorithms can also be interpreted as diag- \r\n'
 'onally rescaled gradient descent, where the rescaling factor is '
 'optimally \r\n'
 'chosen to ensure convergence. ')
('FULL TEXT:Algorithms for Non-negative Matrix\n'
 'Factorization\n'
 '\n'
 'Daniel D. Lee*\n'
 '*B

This dataset contains 7 columns: id, year, title, even_type, pdf_name, abstract and paper_text. We are mostly interested in the paper_text which include both title and abstract.

## Pre-processing

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))

def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()
    
    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    
    return ' '.join(text)

In [7]:
%%time
docs = df['paper_text'].apply(lambda x:pre_process(x))

CPU times: user 3min 31s, sys: 324 ms, total: 3min 32s
Wall time: 3min 32s


In [8]:
docs[1][0:103]

'mean field theory layer visual cortex application artificial neural network christopher scofield center'

## 1.TF-IDF and Scikit-learn

Based on the tutorial of [Kavita Ganesan](https://github.com/kavgan/nlp-in-practice/blob/master/tf-idf/Keyword%20Extraction%20with%20TF-IDF%20and%20SKlearn.ipynb)

TF-IDF stands for Text Frequency Inverse Document Frequency. The importance of each word increases proportionally to the number of times a word appears in the document (Text Frequency - TF) but is offset by the frequency of the word in the corpus (Inverse Document Frequency - IDF). Using the tf-idf weighting scheme, the keywords are the words with the higherst TF-IDF score.

### 1.1 CountVectorizer to create a vocabulary and generate word counts

In [9]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
#docs = docs.tolist()
#create a vocabulary of words, 
cv=CountVectorizer(max_df=0.95,         # ignore words that appear in 95% of documents
                   max_features=10000,  # the size of the vocabulary
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
word_count_vector=cv.fit_transform(docs)

CPU times: user 2min 45s, sys: 4.7 s, total: 2min 49s
Wall time: 2min 49s


### 1.2 TfidfTransformer to Compute Inverse Document Frequency (IDF)

In [10]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

CPU times: user 23.3 ms, sys: 21 ms, total: 44.4 ms
Wall time: 70.4 ms


TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

Once we have our IDF computed, we are now ready to compute TF-IDF and extract the top keywords.

In [11]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [12]:
# get feature names
feature_names=cv.get_feature_names()

def get_keywords(idx, docs):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [13]:
idx=941
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175
matrix fa

 For instance, non-negative matrix factorization meets us 5 time: non negative matrix, negative matrix, nmf, matrix factorization, matrix. Adding a 4-grams does not change the situation. Similar keywords appears due to the fact that TF-IDF does not take into account the context, the keywords importance comes only from their frequencies relationship. Thus, TF-IDF is a quick, intuitive, but not the best way to extract keywords from the text. Let's look at other ways.

## 2. Gensim implementation of TextRank summarization algorithm

Gensim is a free Python library designed to automatically extract semantic topics from documents. The gensim implementation is based on the popular TextRank algorithm. 

[Documentation](https://radimrehurek.com/gensim/summarization/keywords.html)

[Tutorial](https://rare-technologies.com/text-summarization-with-gensim/)

### 2.1 Small text

In [14]:
import gensim
text = "Non-negative matrix factorization (NMF) has previously been shown to " + \
"be a useful decomposition for multivariate data. Two different multiplicative " + \
"algorithms for NMF are analyzed. They differ only slightly in the " + \
"multiplicative factor used in the update rules. One algorithm can be shown to " + \
"minimize the conventional least squares error while the other minimizes the  " + \
"generalized Kullback-Leibler divergence. The monotonic convergence of both  " + \
"algorithms can be proven using an auxiliary function analogous to that used " + \
"for proving convergence of the Expectation-Maximization algorithm. The algorithms  " + \
"can also be interpreted as diagonally rescaled gradient descent, where the  " + \
"rescaling factor is optimally chosen to ensure convergence."
gensim.summarization.keywords(text, 
         ratio=0.5,               # use 50% of original text
         words=None,              # Number of returned words
         split=True,              # Whether split keywords
         scores=False,            # Whether score of keyword
         pos_filter=('NN', 'JJ'), # Part of speech (nouns, adjectives etc.) filters
         lemmatize=True,         # If True - lemmatize words
         deacc=True)              # If True - remove accentuation

['factor',
 'convergence',
 'rescaling',
 'multiplicative',
 'function',
 'kullback',
 'gradient',
 'algorithm',
 'matrix',
 'useful decomposition',
 'multivariate',
 'data',
 'squares']

In [15]:
print("SUMMARY: ", gensim.summarization.summarize(text,
                                                  ratio = 0.5,
                                                  split = True))

SUMMARY:  ['Non-negative matrix factorization (NMF) has previously been shown to be a useful decomposition for multivariate data.', 'Two different multiplicative algorithms for NMF are analyzed.', 'They differ only slightly in the multiplicative factor used in the update rules.']


### 2.2 Large text

In [16]:
def get_keywords_gensim(idx, docs):
    
    keywords=gensim.summarization.keywords(docs[idx], 
                                  ratio=None, 
                                  words=10,         
                                  split=True,             
                                  scores=False,           
                                  pos_filter=None, 
                                  lemmatize=True,         
                                  deacc=True)              
    
    return keywords

def print_results_gensim(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k)

In [17]:
idx=941
keywords=get_keywords_gensim(idx, docs)
print_results_gensim(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
factorized
algorithm
matrix
update rule
function
data
converge
theorem
gradient


The keywords highlight the main point , but still miss valuable information

## 3. Yet Another Keyword Extractor (Yake)

[Documentation](https://github.com/LIAAD/yake)

In [18]:
!pip install git+https://github.com/LIAAD/yake

Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-9y4p6o_k
  Running command git clone -q https://github.com/LIAAD/yake /tmp/pip-req-build-9y4p6o_k
Collecting segtok (from yake==0.4.8)
  Downloading https://files.pythonhosted.org/packages/41/08/582dab5f4b1d5ca23bc6927b4bb977c8ff7f3a87a3b98844ef833e2f5623/segtok-1.5.10.tar.gz
Collecting jellyfish (from yake==0.4.8)
[?25l  Downloading https://files.pythonhosted.org/packages/76/88/e6eba0ebd8a11eb0a03392d827f0a605ad45fbb24234f7db98ca1ecb41b2/jellyfish-0.8.8.tar.gz (134kB)
[K     |████████████████████████████████| 143kB 6.5MB/s eta 0:00:01
Building wheels for collected packages: yake, segtok, jellyfish
  Building wheel for yake (setup.py) ... [?25ldone
[?25h  Created wheel for yake: filename=yake-0.4.8-py2.py3-none-any.whl size=60162 sha256=5c8d25bc3b1e075fd08f7c9556f6c7393b0a1c5ab95739f46bc23e1daa262cef
  Stored in directory: /tmp/pip-ephem-wheel-cache-sxw5vvxj/wheels/be/35/27/e4

In [19]:
def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k)


In [20]:
import yake

def get_keywords_yake(idx, docs):
    y = yake.KeywordExtractor(lan='en',          # language
                             n = 3,              # n-gram size
                             dedupLim = 0.9,     # deduplicationthresold
                             dedupFunc = 'seqm', #  deduplication algorithm
                             windowsSize = 1,
                             top = 10,           # number of keys
                             features=None)           
    
    keywords = y.extract_keywords(text)
    return keywords

idx=941
keywords = get_keywords_yake(idx, docs[idx])
print_results(idx, keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
('Non-negative matrix factorization', 0.0041066275750552455)
('Non-negative matrix', 0.026529705128479162)
('matrix factorization', 0.02652970512

Key phrases are repeated, and the text needs pre-processing to remove stop words

## 4. Keyphrases extraction using pke

`pke` an open source python-based keyphrase extraction toolkit. It provides an end-to-end keyphrase extraction pipeline in which each component can be easily modified or extended to develop new models.

`pke` currently implements the following keyphrase extraction models:

* Unsupervised models
  * Statistical models
    * TfIdf [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#tfidf)]
    * KPMiner [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#kpminer), [article by (El-Beltagy and Rafea, 2010)](http://www.aclweb.org/anthology/S10-1041.pdf)]
    * YAKE [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#yake), [article by (Campos et al., 2020)](https://doi.org/10.1016/j.ins.2019.09.013)]
  * Graph-based models
    * TextRank [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#textrank), [article by (Mihalcea and Tarau, 2004)](http://www.aclweb.org/anthology/W04-3252.pdf)]
    * SingleRank  [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#singlerank), [article by (Wan and Xiao, 2008)](http://www.aclweb.org/anthology/C08-1122.pdf)]
    * TopicRank [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#topicrank), [article by (Bougouin et al., 2013)](http://aclweb.org/anthology/I13-1062.pdf)]
    * TopicalPageRank [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#topicalpagerank), [article by (Sterckx et al., 2015)](http://users.intec.ugent.be/cdvelder/papers/2015/sterckx2015wwwb.pdf)]
    * PositionRank [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#positionrank), [article by (Florescu and Caragea, 2017)](http://www.aclweb.org/anthology/P17-1102.pdf)]
    * MultipartiteRank [[documentation](https://boudinfl.github.io/pke/build/html/unsupervised.html#multipartiterank), [article by (Boudin, 2018)](https://arxiv.org/abs/1803.08721)]
* Supervised models
  * Feature-based models
    * Kea [[documentation](https://boudinfl.github.io/pke/build/html/supervised.html#kea), [article by (Witten et al., 2005)](https://www.cs.waikato.ac.nz/ml/publications/2005/chap_Witten-et-al_Windows.pdf)]
    * WINGNUS [[documentation](https://boudinfl.github.io/pke/build/html/supervised.html#wingnus), [article by (Nguyen and Luong, 2010)](http://www.aclweb.org/anthology/S10-1035.pdf)]


In [21]:
!pip install git+https://github.com/boudinfl/pke.git

Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-g5apznug
  Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-g5apznug
Building wheels for collected packages: pke
  Building wheel for pke (setup.py) ... [?25ldone
[?25h  Created wheel for pke: filename=pke-1.8.1-cp36-none-any.whl size=8764021 sha256=d894f0c9441150d05a71dc3feb676f783e657e4b9af4b3ee91799e3f360d6544
  Stored in directory: /tmp/pip-ephem-wheel-cache-v9ftfral/wheels/8d/24/54/6582e854e9e32dd6c632af6762b3a5d2f6b181c2992e165462
Successfully built pke
Installing collected packages: pke
Successfully installed pke-1.8.1


In [22]:
import pke

### 5.1  SingleRank

This model is an extension of the TextRank model that uses the number of co-occurrences to weigh edges in the graph.

In [23]:
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

# 2. load the content of the document.
extractor.load_document(input=text,
                        language='en',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)

idx = 941
# now print the results
print("\n=====Title=====")
print(df['title'][idx])
print("\n=====Abstract=====")
print(df['abstract'][idx])
print("\n===Keywords===")
for k in keyphrases:
    print(k[0])


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
different multiplicative algorithms
non - negative matrix factorization
conventional least squares error
multiplicative factor
monotonic converge

### 5.2 TopicRank

In [24]:
import string
from nltk.corpus import stopwords

# 1. create a TopicRank extractor.
extractor = pke.unsupervised.TopicRank()

# 2. load the content of the document.
extractor.load_document(input=text)

# 3. select the longest sequences of nouns and adjectives, that do
#    not contain punctuation marks or stopwords as candidates.
pos = {'NOUN', 'PROPN', 'ADJ'}
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
extractor.candidate_selection(pos=pos, stoplist=stoplist)

# 4. build topics by grouping candidates with HAC (average linkage,
#    threshold of 1/4 of shared stems). Weight the topics using random
#    walk, and select the first occuring candidate from each topic.
extractor.candidate_weighting(threshold=0.74, method='average')

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)

idx = 941
# now print the results
print("\n=====Title=====")
print(df['title'][idx])
print("\n=====Abstract=====")
print(df['abstract'][idx])
print("\n===Keywords===")
for k in keyphrases:
    print(k[0])


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
different multiplicative algorithms
monotonic convergence
multiplicative factor
nmf
leibler divergence
expectation
multivariate data
kullback
use