In [107]:
import pandas as pd

# Load the title/abstract CSV. Malformed rows are skipped with a warning, which
# is what `error_bad_lines=False` did; that keyword was deprecated in pandas 1.3
# and removed in 2.0, so prefer `on_bad_lines` and fall back on older pandas.
try:
    data = pd.read_csv('title_abstract.csv', on_bad_lines='warn')
except TypeError:
    # Older pandas: `on_bad_lines` is not a recognised keyword.
    data = pd.read_csv('title_abstract.csv', error_bad_lines=False)

# Copy the column selection so the assignments below write to an independent
# frame rather than to a slice of `data` (avoids SettingWithCopyWarning).
data_text = data[['text']].copy()
# Keep the row label as a column; later cells look documents up by it.
data_text['index'] = data_text.index
data_text['paper_id'] = data['paperId']
documents = data_text

In [8]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liaojinliang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Corpus-specific terms to ignore in addition to NLTK's English stop words.
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
# Final stop-word collection is a set: the NLTK list merged with the custom terms.
stop_words = set(stopwords.words("english")) | set(new_words)

In [10]:

# Build the cleaned corpus: one lower-cased, lemmatised, stop-word-free string
# per document, for `index` values 0 .. len(documents) - 1.

# Index the raw text by the `index` column once; the original re-filtered the
# whole frame with a boolean mask for every document (quadratic in corpus size).
text_by_index = documents.set_index('index')['text']

# One lemmatiser instance serves every document. (The PorterStemmer that used
# to be constructed on each iteration was never used.)
lem = WordNetLemmatizer()

corpus = []
for i in range(0, len(documents)):
    preText = text_by_index.at[i]

    # Replace every non-letter with a space, then lower-case.
    text = re.sub('[^a-zA-Z]', ' ', preText).lower()

    # NOTE(review): the tag-stripping and digit/special-character regexes that
    # used to follow were no-ops -- after the substitution above the text holds
    # only letters and spaces -- so they are dropped; the output is unchanged.
    # If markup should really be removed, strip it *before* that substitution.

    # Tokenise on whitespace, drop stop words, lemmatise what remains.
    tokens = [lem.lemmatize(word) for word in text.split() if word not in stop_words]
    corpus.append(" ".join(tokens))

In [11]:
# Spot-check one cleaned document from the corpus.
corpus[222]

'understanding xcp equilibrium fairness prove xcp equilibrium solves constrained max min fairness problem identifying unique solution hierarchy optimization problem namely solved max min fair allocation solved xcp additional constraint describe algorithm compute equilibrium derive lower upper bound link utilization xcp reduces max min allocation single link network additional constraint cause flow receive arbitrarily small fraction max min allocation present simulation result confirm analytical finding ieee'

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words model over the cleaned corpus: unigrams up to trigrams, capped
# at 10,000 features, ignoring terms that occur in more than 80% of documents.
# scikit-learn documents `stop_words` as 'english', a list, or None, and newer
# releases validate the parameter type, so pass a (sorted, deterministic) list
# instead of the set.
cv = CountVectorizer(
    max_df=0.8,
    stop_words=sorted(stop_words),
    max_features=10000,
    ngram_range=(1, 3),
)
X = cv.fit_transform(corpus)

In [15]:
# Peek at ten of the terms in the fitted vocabulary.
list(cv.vocabulary_.keys())[:10]

['strategy',
 'managing',
 'content',
 'complexity',
 'algorithm',
 'animation',
 'computer',
 'excellent',
 'medium',
 'capturing']

In [16]:
def get_top_n_words(corpus, n=None, ngram_range=(1, 1), max_features=None):
    """Return the most frequent n-grams in ``corpus`` with their total counts.

    Args:
        corpus: Iterable of document strings.
        n: Number of ``(term, count)`` pairs to return; ``None`` returns all.
        ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``. The
            default ``(1, 1)`` counts single words, as this function always did.
        max_features: Optional vocabulary cap forwarded to ``CountVectorizer``.

    Returns:
        List of ``(term, count)`` tuples sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    # Single pass over the corpus instead of a separate fit and transform.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give each term's total count across all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [20]:
# Tabulate the 50 most frequent words. Column names go to the constructor so
# an empty result still yields a well-formed (empty) frame instead of raising
# a length-mismatch error when `.columns` is assigned afterwards.
top_words = get_top_n_words(corpus, n=50)
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

In [21]:
# Display the word-frequency table.
print(top_df)

           Word  Freq
0          data  3966
1         based  2973
2        system  2802
3         model  2788
4         paper  2336
5        method  2266
6          user  2216
7      approach  2164
8     algorithm  1953
9          time  1818
10        cloud  1814
11  application  1763
12  information  1758
13      process  1724
14       result  1607
15      network  1558
16        study  1549
17      service  1541
18          use  1522
19     resource  1471
20         used  1337
21  performance  1318
22    technique  1290
23      problem  1255
24     analysis  1253
25      present  1250
26     research  1195
27        query  1160
28    different  1092
29   technology  1077
30     proposed  1018
31          set  1017
32    computing  1006
33      however   979
34         task   968
35       design   948
36      propose   946
37         ieee   936
38      support   872
39         work   865
40       number   861
41         real   856
42       social   848
43         cost   793
44     exi

In [25]:
def get_top_n2_words(corpus, n=None):
    """Return the ``n`` most frequent bi-grams in ``corpus`` as (bi-gram, count) pairs.

    The vocabulary is limited to 2000 bi-grams; ``n=None`` returns all of them,
    ordered by total count, highest first.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=2000)
    bigram_vectorizer.fit(corpus)
    counts = bigram_vectorizer.transform(corpus)
    # Column sums: total occurrences of each bi-gram over the whole corpus.
    totals = counts.sum(axis=0)
    ranked = [
        (term, totals[0, column])
        for term, column in bigram_vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Tabulate and show the 50 most frequent bi-grams. Column names go to the
# constructor so an empty result still produces a valid (empty) frame.
top2_words = get_top_n2_words(corpus, n=50)
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])
print(top2_df)

                     Bi-gram  Freq
0              paper present   425
1              process model   412
2            cloud computing   408
3            springer verlag   359
4             right reserved   302
5                  state art   285
6                  real time   251
7                   data set   236
8              paper propose   231
9                 case study   230
10          business process   218
11               data center   215
12       experimental result   211
13         berlin heidelberg   204
14             copyright acm   202
15                real world   194
16                  big data   188
17             verlag berlin   176
18          machine learning   160
19              visual field   156
20        energy consumption   152
21           virtual machine   151
22    springer international   149
23  international publishing   149
24             mobile device   147
25            sensor network   144
26     information retrieval   130
27         anomaly d

In [26]:
# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the ``n`` most frequent tri-grams in ``corpus`` as (tri-gram, count) pairs.

    The vocabulary is limited to 2000 tri-grams; ``n=None`` returns all of
    them, ordered by total count, highest first.
    """
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=2000)
    trigram_vectorizer.fit(corpus)
    counts = trigram_vectorizer.transform(corpus)
    # Column sums: total occurrences of each tri-gram over the whole corpus.
    totals = counts.sum(axis=0)
    ranked = [
        (term, totals[0, column])
        for term, column in trigram_vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Tabulate and show the 50 most frequent tri-grams. Column names go to the
# constructor so an empty result still produces a valid (empty) frame.
top3_words = get_top_n3_words(corpus, n=50)
top3_df = pd.DataFrame(top3_words, columns=["Tri-gram", "Freq"])
print(top3_df)

                                 Tri-gram  Freq
0                  springer verlag berlin   176
1                verlag berlin heidelberg   176
2       springer international publishing   149
3                          john wiley son   104
4                       held owner author    91
5                           wiley son ltd    90
6                    copyright held owner    88
7               springer science business    83
8                 science business medium    83
9                 wireless sensor network    82
10        association computing machinery    72
11                elsevier right reserved    71
12                 business process model    69
13                   copyright john wiley    68
14                    quality service qos    68
15   international publishing switzerland    68
16            international publishing ag    67
17                 support vector machine    63
18                     ltd right reserved    62
19            cloud computing environmen

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
 
# Fit IDF weights on the document-term count matrix produced by `cv`.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Map column positions back to terms. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; prefer `get_feature_names_out()` and
# fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()


In [108]:
def getPaperID(index):
    """Return the paper id of the document whose ``index`` column equals ``index``.

    The last character of the stored ``paper_id`` value is dropped before it is
    returned. NOTE(review): presumably a trailing delimiter left over from the
    CSV export -- confirm against the data.

    Raises:
        IndexError: If no row has that ``index`` value.
    """
    # Select the column by name rather than by position in ``.values`` so the
    # lookup does not depend on column order; the local name also no longer
    # shadows the builtin ``id``.
    paper_id = documents.loc[documents['index'] == index, 'paper_id'].iloc[0]
    return paper_id[:-1]

In [55]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, largest value first.

    Ties on value are broken by column index, larger column first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to ``{feature: score}``.

    ``sorted_items`` is a sequence of ``(feature index, score)`` pairs; each
    index is resolved through ``feature_names`` and each score is rounded to
    three decimals. Insertion order follows ``sorted_items``.
    """
    leading = sorted_items[:topn]
    return {feature_names[column]: round(weight, 3) for column, weight in leading}

# Fetch the document for which keywords need to be extracted


In [79]:
def getKeyWords(index, topn=10, min_score=0.20):
    """Return the highest-scoring tf-idf terms of one cleaned document.

    Args:
        index: Position of the document in ``corpus``.
        topn: How many of the top-scoring terms to consider (default 10).
        min_score: Only terms whose rounded tf-idf score is strictly greater
            than this value are returned (default 0.20).

    Returns:
        List of terms, highest score first.
    """
    doc = corpus[index]
    # tf-idf weights for this single document, using the fitted vectoriser.
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    # (column, score) pairs in descending score order.
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn)
    # Keep only the terms that clear the score threshold.
    return [term for term, score in keywords.items() if score > min_score]
    

In [80]:
# Example: paper id and extracted keywords for document 128.
print(getPaperID(128),getKeyWords(128))


35421333 ['tree', 'splay', 'binary', 'search tree']


In [151]:
import couchdb
import csv

import os

# Take the server URL (including credentials) from the environment when set,
# so secrets need not live in the notebook.
# NOTE(review): the fallback keeps the original hard-coded URL for backward
# compatibility; remove it once COUCHDB_URL is configured.
couch = couchdb.Server(os.environ.get("COUCHDB_URL", "http://admin:password@localhost:5984"))
try:
    database = couch["paperinfo_scopus"]
except couchdb.ResourceNotFound:
    # Only a missing database means "wrong db name"; connection or auth
    # failures now propagate instead of being mislabelled by a bare except.
    # Reset the name so a stale handle from an earlier run is never reused.
    database = None
    print("wrong db name")

In [124]:
import json
def updateDoc(paperid, keywords):
    """Store ``keywords`` under the ``"keyword"`` field of a paper's CouchDB document.

    Args:
        paperid: Document id; converted to ``str`` for the lookup.
        keywords: Value to store under ``"keyword"``.

    Raises:
        KeyError: If the database has no document with that id.
    """
    doc_id = str(paperid)
    doc = database.get(doc_id)
    if doc is None:
        # Database.get returns None for a missing id; fail with a clear
        # message instead of a TypeError on the item assignment below.
        raise KeyError("no document with id %r in database" % doc_id)
    doc["keyword"] = keywords
    database.save(doc)

In [123]:
# Write the extracted keywords for every document back to CouchDB.
for i in range(0, len(documents)):
    updateDoc(getPaperID(i), getKeyWords(i))
# The original concatenated str + int here, which raises TypeError after all
# the updates have already run; format the count instead.
print("complete {} keywords".format(len(documents)))

In [121]:
# Fetch one stored document to check that its "keyword" field was written.
print(database.get(str("0001919357")))

<Document '0001919357'@'5-a9539b7eb11c2ed62786b14ebe691ae7' {'title': 'Guidelines for Presentation and Comparison of Indexing Techniques', 'abstract': 'Descriptions of new indexing techniques are a common outcome of database research, but these descriptions are sometimes marred by poor methodology and a lack of comparison to other schemes. In this paper we describe a framework for presentation and comparison of indexing schemes that we believe sets a minimum standard for development and dissemination of research results in this area.', 'coverDate': '1996-01-01', 'coverDateYear': '1996', 'cite_count': '33', 'paper_type': 'Review', 'CISAuthors': '35586971600,56891817800,7003595103', 'co_author': ['7003595103', '35586971600', '56891817800'], 'type': 'Paper', 'keyword': ['indexing', 'comparison', 'presentation', 'description', 'scheme']}>


In [127]:
# One record per document, in document order: its paper id ("id") and its
# extracted keyword list ("kw").
doc_keywords = [
    {"id": getPaperID(position), "kw": getKeyWords(position)}
    for position in range(len(documents))
]


In [128]:
# Preview the first ten id/keyword records.
print(doc_keywords[0:10])

[{'id': '0000036988', 'kw': ['animation', 'level detail', 'detail', 'student', 'level']}, {'id': '0000764262', 'kw': ['passage', 'ranking', 'document']}, {'id': '0000891764', 'kw': ['deductive', 'relational', 'deductive database', 'logical', 'database']}, {'id': '0001104487', 'kw': ['coding', 'block', 'operation']}, {'id': '0001624306', 'kw': ['document', 'memory', 'ranking', 'array', 'length', 'inverted file']}, {'id': '0001790521', 'kw': ['compression', 'binary', 'image']}, {'id': '0001825807', 'kw': ['compiler', 'mix', 'evaluator', 'partial']}, {'id': '0001919357', 'kw': ['indexing', 'comparison', 'presentation', 'description', 'scheme']}, {'id': '0002124265', 'kw': ['join algorithm', 'join', 'buffer', 'algorithm', 'hash']}, {'id': '0002848777', 'kw': ['similarity', 'measure', 'ranked', 'ranked query', 'query']}]


In [131]:
# Topic id -> list of document indices (used below to index doc_keywords).
# NOTE(review): the cell that fills this dict is not visible in this notebook,
# yet the next cell's output shows it populated -- confirm where it is built.
topic_table = {}

In [150]:
# Show the document indices assigned to each topic, one list per line.
for members in topic_table.values():
    print(members)

[29, 45, 46, 56, 77, 84, 92, 110, 142, 165, 183, 212, 273, 280, 375, 381, 382, 405, 408, 452, 535, 537, 545, 574, 613, 621, 649, 668, 674, 784, 850, 890, 958, 975, 982, 1004, 1023, 1024, 1025, 1038, 1064, 1066, 1113, 1121, 1146, 1147, 1172, 1186, 1240, 1241, 1249, 1260, 1274, 1276, 1282, 1307, 1364, 1409, 1473, 1488, 1497, 1508, 1583, 1688, 1786, 1788, 1800, 1811, 1823, 1907, 1921, 1949, 1953, 1969, 2022, 2049, 2052, 2070, 2129, 2151, 2180, 2197, 2261, 2275, 2319, 2373, 2393, 2435, 2449, 2509, 2516, 2530, 2598, 2619, 2648, 2654, 2678, 2699, 2716, 2720, 2751, 2776, 2819, 2908, 2939, 2965, 3026, 3039, 3075, 3097, 3121, 3137, 3179, 3207, 3247, 3270, 3280, 3331, 3335, 3339, 3418, 3455, 3572, 3624, 3636, 3677, 3679, 3701, 3739, 3740, 3751, 3781, 3822, 3968]
[140, 254, 315, 320, 329, 345, 351, 360, 366, 384, 387, 434, 495, 518, 554, 612, 614, 622, 672, 710, 755, 762, 764, 768, 777, 795, 809, 814, 815, 919, 920, 922, 934, 947, 950, 954, 970, 976, 983, 1021, 1054, 1087, 1108, 1139, 1145, 1154,

In [153]:
from collections import Counter

In [154]:
# Look up the paper id for document index 29.
getPaperID(29)

'0023567672'

In [155]:
# One empty keyword counter per topic id.
keyword_table = {topic: Counter() for topic in topic_table}

In [156]:
# Tally, per topic, how often each keyword occurs among the topic's documents.
# The original also called getPaperID() for every document and discarded the
# result -- a full-frame scan each time -- while rebinding the builtin name
# `id`; that call is removed.
# NOTE: counts accumulate, so re-create keyword_table before re-running this cell.
for i in topic_table:
    for index in topic_table[i]:
        keyword_table[i].update(doc_keywords[index]["kw"])
        

In [206]:
# Print the 15 most common keywords of each topic. The inner loop used to
# reuse the outer loop variable `i`; the comprehension below has its own names
# so the topic id is never shadowed.
for topic in keyword_table:
    print("topic :", topic)
    print([term for term, _count in keyword_table[topic].most_common(15)])

topic : 0
['sequence', 'sensor', 'mutation', 'protein', 'code', 'correlation', 'sensor network', 'family', 'sense', 'data', 'word', 'network', 'binary', 'linear', 'ring']
topic : 1
['clustering', 'sequence', 'interaction', 'library', 'community', 'exertion', 'family', 'gesture', 'protein', 'process', 'social', 'game', 'student', 'process model', 'graph']
topic : 2
['health', 'security', 'social', 'election', 'digital', 'medium', 'information', 'voting', 'technology', 'library', 'data', 'workshop', 'privacy', 'play', 'human']
topic : 3
['cloud', 'energy', 'public', 'public display', 'display', 'agent', 'game', 'voting', 'voter', 'data center', 'election', 'center', 'consolidation', 'player', 'energy consumption']
topic : 4
['query', 'document', 'location', 'compression', 'search', 'topic', 'index', 'tree', 'collection', 'object', 'model', 'user', 'term', 'relevance', 'poi']
topic : 5
['sequence', 'complementary', 'complementary pair', 'alphabet', 'arithmetic', 'completion', 'aperiodic',

In [202]:
# Pick one topic id and give it a fresh, empty keyword counter.
ind =14
keyword_table[ind] = Counter()

In [203]:
# Document indices assigned to the selected topic.
topic_table[ind] = [153, 205, 285, 475, 591, 625, 835, 884, 990, 1035, 1073, 1107, 1115, 1168, 1173, 1184, 1222, 1296, 1336, 1553, 1606, 1612, 1671, 1762, 1763, 1783, 1785, 1845, 1914, 1969, 2054, 2117, 2211, 2315, 2351, 2366, 2484, 2504, 2538, 2573, 2836, 2839, 2847, 2975, 3053, 3219, 3235, 3259, 3314, 3330, 3472, 3498, 3502, 3503, 3743, 3818, 3880]
# Tally the keywords of those documents. The unused getPaperID() lookup (a
# full-frame scan per document that also rebound the builtin `id`) is removed.
# NOTE: counts accumulate; reset keyword_table[ind] before re-running this cell.
for index in topic_table[ind]:
    keyword_table[ind].update(doc_keywords[index]["kw"])

In [204]:
# Show the 20 most common keywords for the selected topic, with their counts.
print(keyword_table[ind].most_common(20))

[('feedback', 9), ('surgical', 9), ('gene', 7), ('feature', 7), ('bone', 5), ('selection', 4), ('feature selection', 4), ('classifier', 4), ('xc', 4), ('training', 4), ('temporal bone', 4), ('simulator', 4), ('navigation', 3), ('function', 3), ('class', 3), ('gene expression', 3), ('expression', 3), ('cancer', 3), ('classification', 3), ('user', 3)]
