# Capstone Project 2: GPCR research trend
## Natural Language Processing of a domain specific literature

In [1]:
import pandas as pd
import numpy as np

import spacy
import nltk
import re
import unicodedata

from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence
import itertools as it
import os

<h2> Load data

In [2]:
df = pd.read_csv('./Clean_data/clean_pub.csv')
df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 323646 entries, 0 to 323645
Data columns (total 10 columns):
Unnamed: 0     323646 non-null int64
Id             323646 non-null int64
abstract       296557 non-null object
title          323646 non-null object
authors        323646 non-null object
journal        323646 non-null object
journal_abv    323646 non-null object
affiliation    323646 non-null object
keywords       37845 non-null object
year           323546 non-null float64
dtypes: float64(1), int64(2), object(7)
memory usage: 24.7+ MB


In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Id,abstract,title,authors,journal,journal_abv,affiliation,keywords,year
0,0,24877594,Abstract It has been well appreciated tha...,What we know and do not know about the cannab...,"['Malfitano AM', 'Basu S', 'Maresz K', 'Bifulc...",Seminars in immunology.,Semin Immunol,"\n Dipartimento di Medicina e Chirurgia, Unive...",KEYWORDS: Cannabinoid receptor 2; Endocann...,2014.0
1,1,16889837,Abstract Approximately 1% of the genome o...,Allosteric agonists of 7TM receptors: expandi...,"['Langmead CJ', 'Christopoulos A']",Trends in pharmacological sciences.,Trends Pharmacol Sci,\n Psychiatry Centre of Excellence for Drug Di...,,2006.0


<h2>Text Preprocessing:</h2>
<br/>
    <li> Combine abstarct and title to text
    <li>  Normalize text:
    <ol>
        <ol>
        <li> convert to lower case
        <li> remove excessive space
        <li> remove puncta
        <li> lemmatize text   
        <li> phrase modeling
           </ol>
        </ol>
    <li> Clean keywords
    <li> Extract affiliation

In [64]:
def lower_text(text):
    try:
        text = text.lower()
    except:
        pass
    return text

def reduce_space(text):
    """
    remove eccessive space and leave only one
    """
    try:
        text = re.sub(r' +', ' ',text)
    except:
        pass
    return text

def remove_puncta(text):
    """ 
    only retain word, greek word, numbers
    """
    try:
        pattern = r'[^a-z0-9α-ѕ\s\-]'
        text = re.sub(pattern, '', text)
    except:
        pass
    return text

nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
def lemmatize_text(text): 
    """ 
    normalize word variant forms to the same word
    no NaN data would be
    """
    try:
        text = nlp(text)
        text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    except:
        pass
    return text


In [5]:
# create a new dataframe to store the processed data
df_new = df[['Id','year']]

In [54]:
# combine abstract and title to text
df['abstract'] = df['abstract'].fillna('')
df_new['text'] = df['title'] + df['abstract']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Normalize text

In [61]:
%%time
# reduce space
# lower case
# remove puncta
# lemmatize_text

def replace_abstract(text):
    try:
        text = text.replace(' abstract ', '')
    except:
        pass
    return text

text = df_new.text.apply(lower_text)
text = text.apply(replace_abstract)
text = text.apply(reduce_space)
text = text.apply(remove_puncta)
text = text.apply(lemmatize_text)

CPU times: user 2h 59min 27s, sys: 4min 8s, total: 3h 3min 36s
Wall time: 3h 2min 41s


In [62]:
df_new['text'] = text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Normalize keywords

In [65]:
%%time
# lower case
# delete the start with keywords 
# reduce space
# lemmatize_text
def replace_keyword(text):
    try:
        text = text.replace('keywords:', '')
    except:
        pass
    return text

text = df.keywords.apply(lower_text)
text = text.apply(replace_keyword)
text = text.apply(reduce_space)
text = text.apply(lemmatize_text)
df_new['keywords'] = text

CPU times: user 5min 53s, sys: 2.11 s, total: 5min 55s
Wall time: 5min 57s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


### Extract affiliation

In [171]:
top22_pharma = ['johnson','roche','pfizer','novartis','merk','lilly','novo nordisk', 'abbvie', 'amgen','sanofi',\
               'glaxosmithkline','astrazeneca','gilead','squibb','csl','takeda','bayer','allergan','kgaa',\
                'boehringer','biogen','abbott']

def get_company(text):
    """ assign affilliation info if affiliation is one of the top22 pharmaceutical companies """
    
    for company in top22_pharma:
        if company in text:
            return company

def get_university(text):
    """ assign affiliation to university"""
    pattern = r',(.*? university)'
    
    university = re.findall(pattern, text)

    return university   

    
def extract_affiliation(text):
    """
    simplify affiliation to:
        top20 pharmaceutical companies
        university (distinguishable in english)
        others
    """

    # if it's top22 pharma
    affiliation = get_company(text)
    
    # if not top22 pharma and is a university
    if not affiliation:
        affiliation = get_university(text)
        
    return affiliation



In [172]:
%%time
text = df.affiliation.apply(lower_text)
text = text.apply(extract_affiliation)

CPU times: user 2.45 s, sys: 39.6 ms, total: 2.49 s
Wall time: 2.51 s


In [173]:
text.head()

0                                                   []
1                                      glaxosmithkline
2                      [ the johns hopkins university]
3    [ oregon national primate research center, ore...
4                             [ washington university]
Name: affiliation, dtype: object

In [174]:
df_new['affiliation'] = text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [175]:
df_new.to_csv('./Processed_data/year_text_keywords_affiliation.csv')

<h2>Text Preprocessing:</h2>
<br>
    <li> concat all text to file: corpus
    <li> transform to digram
    <li> transform to trigram
    <li> transform to fourgram
    

In [176]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence
import itertools as it
import os

In [177]:
currentdir = os.getcwd()
def datadir(file):
    return currentdir+'/'+file
datadir('corpus.txt')

'/Users/shunling/Desktop/CapstoneProjects/NaturalLanguageProcessingGPCR/corpus.txt'

In [None]:
# create corpus
with open('corpus.txt','w') as f:
    for i in range(df_new.text.shape[0]):
        f.write(df_new.text.iloc[i])
        f.write('.')
        f.write('\n')
        
# from unigram to digram
unigrams = LineSentence(datadir('corpus.txt'))
bigram_phrases = Phrases(unigrams) # train model
bigram_model = Phraser(bigram_phrases) # build model
with open('bigrams.txt','w') as f:
    for sent in unigrams:
        bigram_sent = ' '.join(bigram_model[sent]) # apply model
        f.write(bigram_sent)
        f.write('\n')
        
# from bigram to trigram
bigrams = LineSentence(datadir('bigrams.txt'))
trigram_phrases = Phrases(bigrams)
trigram_model = Phraser(trigram_phrases)
with open('trigrams.txt','w') as f:
    for sent in bigrams:
        tri_sent = ' '.join(trigram_model[sent])
        f.write(tri_sent)
        f.write('\n')
        
# from trigram to fourgram
trigrams = LineSentence(datadir('trigrams.txt'))
fourgram_phrases = Phrases(trigrams)
fourgram_model = Phraser(fourgram_phrases)
with open('fourgrams.txt','w') as f:
    for sent in trigrams:
        four_sent = ' '.join(fourgram_model[sent])
        f.write(four_sent)
        f.write('\n')

In [22]:
'\u0405'.lower()

'ѕ'