# Capstone Project 2: GPCR research trend
## Natural Language Processing of a domain specific literature
***
### Topic modeling: Convert word_class relationship from clustering to dictionary

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

In [18]:
# clustering of words used 35 clusters
word_class = pd.read_csv('word_class35.csv',header=None).dropna()

In [11]:
word_class.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247500 entries, 0 to 247501
Data columns (total 2 columns):
0    247500 non-null object
1    247500 non-null int64
dtypes: int64(1), object(1)
memory usage: 5.7+ MB


In [19]:
# change fourgram pattern to single gram 
word = word_class[0].apply(lambda x: ' '.join(x.split('_')))
word.head()

0      what
1        we
2      know
3       and
4    do not
Name: 0, dtype: object

In [21]:
word_class.columns = ['word','class']
word_class['word'] = word

In [29]:
word_class.head()

Unnamed: 0,word,class
0,what,4
1,we,32
2,know,32
3,and,32
4,do not,32


In [25]:
word_class.to_csv('word_class.csv')

In [34]:
# change dataframe to dictionary with words as keys and topic class as values
word_topic = {}
for i in tqdm(range(word_class.shape[0])):
    word_topic[word_class.iloc[i]['word']] = word_class.iloc[i]['class']

100%|██████████| 247500/247500 [01:26<00:00, 2877.36it/s]


In [36]:
# check conversion
word_topic['we'],word_topic['what']

(32, 4)

### Load preprocessed dataframe

In [45]:
# table with id,year,text,keyword columns
df = pd.read_csv('./Processed_data/year_text_keywords_affiliation.csv').drop('Unnamed: 0',axis=1)

In [46]:
df.head()

Unnamed: 0,Id,year,text,keywords,affiliation
0,24877594,2014.0,what we know and do not know about the canna...,cannabinoid receptor 2 ; endocannabinoid ; i...,[]
1,16889837,2006.0,allosteric agonist of 7tm receptor expand th...,,glaxosmithkline
2,31068464,2019.0,mrgprx4 be a g protein - couple receptor act...,mrgprx4 ; bile acid ; cholestasis ; itch ; p...,[' the johns hopkins university']
3,30610192,2019.0,modulation of gpr39 a g - protein couple rec...,,"[' oregon national primate research center, or..."
4,30289386,2018.0,simulation of spontaneous g protein activati...,g protein ; markov state model ; allostery ;...,[' washington university']


In [48]:
fourgrams = pd.read_csv('./Processed_data/Id_fourgram.csv',index_col=0)
fourgrams.head()

Unnamed: 0,Id,fourgram
0,24877594,what we know and do_not know_about the cannabi...
1,16889837,allosteric agonist of 7tm receptor expand the ...
2,31068464,mrgprx4 be a g_protein_couple receptor activat...
3,30610192,modulation of gpr39 a g_protein_couple recepto...
4,30289386,simulation of spontaneous g_protein activation...


### Assign top10 topics to each record

In [70]:

def predict_topic(text):
    """
    return top10 topics in the text
    use word_topic as global variable
    """
    words = text.split()
    topics = Counter()
    for word in words:
        if word in word_topic:
            topics[word_topic[word]] += 1
        
    top10 = list(dict(topics.most_common(10)).keys())
    return top10    

In [71]:
%%time
topics = fourgrams['fourgram'].apply(predict_topic)


CPU times: user 34.3 s, sys: 217 ms, total: 34.6 s
Wall time: 34.9 s


In [75]:
keyword_topic = df

[32, 27, 29, 31, 4, 24, 10, 9, 17, 28]