# Capstone Project 2: GPCR research trends
## Natural language processing of a domain-specific literature

In [1]:
import json
import pandas as pd
from datetime import datetime
import requests, sys, webbrowser, bs4
import re
from Bio import Entrez
import numpy as np
import glob2

## Get all NCBI literature pubmed IDs

In [4]:
def search(query):
    """
    Retrieve publication PubMed IDs matching a query.

    Sends `query` to the 'pubmed' database through Entrez.esearch (sorted by
    relevance, retmax of 330000, XML response) and parses the response with
    Entrez.read.

    Returns the parsed result, a dictionary-like object whose 'IdList'
    entry holds all returned PubMed IDs.
    """
    # NOTE(review): NCBI's current ESearch documentation describes a cap on
    # the number of PubMed UIDs returned per request -- confirm that a
    # retmax this large is still honoured before re-running.
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='330000',
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # release the underlying HTTP response even if parsing fails
        handle.close()

In [5]:
# Run the PubMed search for the phrase 'G protein coupled receptor'
# (intended to match papers with it in the title and/or the abstract)
gpcr_query = 'G protein coupled receptor'
results = search(gpcr_query)

In [9]:
# Write the search results to a JSON file
with open('GPCR_pub.json', 'w') as json_file:
    serialized = json.dumps(results)
    json_file.write(serialized)

In [12]:
# retrieve the PubMed IDs as a list
ID_list = results['IdList']
# display the number of IDs together with one sample entry (index 310000)
len(ID_list),ID_list[310000]

(324859, '3755406')

## Data collection from NCBI pubmed database

In [None]:
url_stem = 'https://www.ncbi.nlm.nih.gov/pubmed/'

# Retrieve paper information one by one according to PubMed ID.
#
# Each saved table contains the following columns:
# Id, abstract, title, authors, journal, journal abbreviation, publication
# date, affiliation, keywords, and a mixed raw record containing multiple
# pieces of information for future processing.
#
# The whole process took 4 days to complete.

chunk_size = 10000                  # papers per saved CSV table
failed_ids_path = 'failed_ids.txt'  # one PubMed ID per line, for retrying
request_timeout = 60                # seconds before a download is abandoned


def _empty_chunk():
    """Return a fresh column -> list mapping for one chunk of papers."""
    return {'Id': [],
            'abstract': [],
            'title': [],
            'authors': [],
            'journal': [],
            'journal_abv': [],
            'date': [],
            'affiliation': [],
            'records': [],
            'keywords': [],
            }


def _prettified(soup, selector, index=0, default='NA'):
    """Return the prettified HTML of match `index` for `selector`, or `default`."""
    matches = soup.select(selector)
    if len(matches) > index:
        return matches[index].prettify()
    return default


def _first_capture(pattern, text):
    """Return the first capture of `pattern` in `text`, or 'NA' if there is none."""
    if text is None:
        return 'NA'
    found = re.findall(pattern, text)
    return found[0] if found else 'NA'


def _parse_paper(content):
    """Extract every column except 'Id' from the HTML of one PubMed page.

    Any field that cannot be located is reported as 'NA'.
    """
    # Parse once and reuse the tree for every field.
    soup = bs4.BeautifulSoup(content)

    # author list: names are read from the author search links
    author_record = _prettified(soup, 'div.auths', default=None)
    if author_record is None:
        authors = 'NA'
    else:
        authors_raw = re.sub(r'%20', ' ', author_record)
        authors = re.findall(r'term=(.*)%5BAuthor', authors_raw)

    # raw citation record; journal, abbreviation and date are derived from
    # it and must be 'NA' when this page has no such record
    records = _prettified(soup, 'div.cit', default=None)

    # affiliation: strip the enclosing tag text from the first <dd> element
    affiliation_html = _prettified(soup, 'dd', default=None)
    affiliation = 'NA' if affiliation_html is None else affiliation_html[4:-5]

    return {'abstract': _prettified(soup, 'div.abstr'),
            'title': _prettified(soup, 'h1', index=1),
            'authors': authors,
            'journal': _first_capture(r'title="(.*)"', records),
            'journal_abv': _first_capture(r'alterm="(.*)." href', records),
            'date': _first_capture(r'</a>\n (.*) doi', records),
            'affiliation': affiliation,
            'records': 'NA' if records is None else records,
            'keywords': _prettified(soup, 'div.keywords'),
            }


df = _empty_chunk()
with open(failed_ids_path, 'a') as failed_log:
    for i, pubmed_id in enumerate(ID_list):
        if i % chunk_size == 0:
            print(datetime.now())
            # save every 10,000 papers in a table, then start an empty one
            if i:
                pd.DataFrame(df).to_csv('chunk' + str(i) + '.csv')
                df = _empty_chunk()

        try:  # download the page
            url = url_stem + str(pubmed_id)
            r = requests.get(url, timeout=request_timeout)
        except requests.RequestException:
            # record the ID so it can be retried later, and skip the row
            failed_log.write(str(pubmed_id))
            failed_log.write('\n')
            failed_log.flush()
            continue

        try:
            fields = _parse_paper(r.content)
        except Exception:
            # best effort: one unparseable page must not abort a multi-day
            # run, so the paper is kept with every field marked 'NA'
            fields = {column: 'NA' for column in df if column != 'Id'}

        # append the ID and all fields together so columns stay aligned
        df['Id'].append(pubmed_id)
        for column, value in fields.items():
            df[column].append(value)

# save the papers collected since the last full chunk, named by the end index
if ID_list:
    pd.DataFrame(df).to_csv('chunk' + str(len(ID_list)) + '.csv')
        
    

## Re-organize files into different folders
## Integrate all tables together

In [None]:
"""
integrate the first 78k papers in one table
"""
def _chunk_sort_key(path):
    """Order chunk files by the number embedded at the end of their name."""
    match = re.search(r'(\d+)\.csv$', path)
    return (int(match.group(1)) if match else -1, path)


def _integrate_chunks(pattern, out_path):
    """Concatenate every CSV file matching `pattern` and write it to `out_path`.

    Files are read in ascending chunk-number order so the combined table is
    the same on every run. Returns the combined DataFrame.

    Raises FileNotFoundError when no file matches `pattern`.
    """
    paths = sorted(glob2.glob(pattern), key=_chunk_sort_key)
    if not paths:
        raise FileNotFoundError('no chunk files match ' + repr(pattern))
    # a single concat avoids re-copying the growing table for every file
    combined = pd.concat([pd.read_csv(path) for path in paths])
    combined.to_csv(out_path)
    return combined


# the first 78k papers
dfa = _integrate_chunks('78k/chun*.csv', '78k.csv')

# integrate the 78-150k papers in one table
dfa = _integrate_chunks('150k/chun*.csv', '78_150k.csv')

# integrate the 150-240k papers in one table
dfa = _integrate_chunks('240k/chun*.csv', '150_240k.csv')

# integrate the 240-320k papers in one table
dfa = _integrate_chunks('320k/chun*.csv', '240_320k.csv')