In [1]:
import pandas as pd
# load the dataset
df = pd.read_csv('papers.csv')
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
id            7241 non-null int64
year          7241 non-null int64
title         7241 non-null object
event_type    2422 non-null object
pdf_name      7241 non-null object
abstract      7241 non-null object
paper_text    7241 non-null object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [3]:
print("{} abstracts are missing".format(df[df['abstract']=='Abstract Missing']['abstract'].count()))

3317 abstracts are missing


In [6]:
import pprint
sample = 941
pprint.pprint("ABSTRACT:{}".format(df['abstract'][sample]))

('ABSTRACT:Non-negative matrix factorization (NMF) has previously been shown '
 'to \r\n'
 'be a useful decomposition for multivariate data. Two different multi- \r\n'
 'plicative algorithms for NMF are analyzed. They differ only slightly in \r\n'
 'the multiplicative factor used in the update rules. One algorithm can be \r\n'
 'shown to minimize the conventional least squares error while the other \r\n'
 'minimizes the generalized Kullback-Leibler divergence. The monotonic \r\n'
 'convergence of both algorithms can be proven using an auxiliary func- \r\n'
 'tion analogous to that used for proving convergence of the Expectation- \r\n'
 'Maximization algorithm. The algorithms can also be interpreted as diag- \r\n'
 'onally rescaled gradient descent, where the rescaling factor is '
 'optimally \r\n'
 'chosen to ensure convergence. ')


In [7]:
pprint.pprint("FULL TEXT:{}".format(df['paper_text'][sample][:1000]))

('FULL TEXT:Algorithms for Non-negative Matrix\n'
 'Factorization\n'
 '\n'
 'Daniel D. Lee*\n'
 '*BelJ Laboratories\n'
 'Lucent Technologies\n'
 'Murray Hill, NJ 07974\n'
 '\n'
 'H. Sebastian Seung*t\n'
 'tDept. of Brain and Cog. Sci.\n'
 'Massachusetts Institute of Technology\n'
 'Cambridge, MA 02138\n'
 '\n'
 'Abstract\n'
 'Non-negative matrix factorization (NMF) has previously been shown to\n'
 'be a useful decomposition for multivariate data. Two different '
 'multiplicative algorithms for NMF are analyzed. They differ only slightly '
 'in\n'
 'the multiplicative factor used in the update rules. One algorithm can be\n'
 'shown to minimize the conventional least squares error while the other\n'
 'minimizes the generalized Kullback-Leibler divergence. The monotonic\n'
 'convergence of both algorithms can be proven using an auxiliary function '
 'analogous to that used for proving convergence of the '
 'ExpectationMaximization algorithm. The algorithms can also be interpreted as '
 'd

In [8]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))

def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    ##Convert to list from string
    text = text.split()
    
    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]
    
    return ' '.join(text)

In [9]:
%%time
docs = df['paper_text'].apply(lambda x:pre_process(x))

Wall time: 3min 5s


In [10]:
docs[1][0:103]

'mean field theory layer visual cortex application artificial neural network christopher scofield center'

In [11]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
#docs = docs.tolist()
#create a vocabulary of words, 
cv=CountVectorizer(max_df=0.95,         # ignore words that appear in 95% of documents
                   max_features=10000,  # the size of the vocabulary
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
word_count_vector=cv.fit_transform(docs)

Wall time: 3min 11s


In [12]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

Wall time: 748 ms
Parser   : 134 ms


TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [13]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [14]:
# get feature names
feature_names=cv.get_feature_names()

def get_keywords(idx, docs):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [15]:
idx=941
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Algorithms for Non-negative Matrix Factorization

=====Abstract=====
Non-negative matrix factorization (NMF) has previously been shown to 
be a useful decomposition for multivariate data. Two different multi- 
plicative algorithms for NMF are analyzed. They differ only slightly in 
the multiplicative factor used in the update rules. One algorithm can be 
shown to minimize the conventional least squares error while the other 
minimizes the generalized Kullback-Leibler divergence. The monotonic 
convergence of both algorithms can be proven using an auxiliary func- 
tion analogous to that used for proving convergence of the Expectation- 
Maximization algorithm. The algorithms can also be interpreted as diag- 
onally rescaled gradient descent, where the rescaling factor is optimally 
chosen to ensure convergence. 

===Keywords===
update rule 0.344
update 0.285
auxiliary 0.212
non negative matrix 0.21
negative matrix 0.209
rule 0.192
nmf 0.183
multiplicative 0.175