# Capstone Project 2: GPCR research trend
## Natural Language Processing of a domain specific literature

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re

## Integrate all data

In [3]:
df1 = pd.read_csv('./rawdata/78k.csv')
df2 = pd.read_csv('./rawdata/78_150k.csv')
df3 = pd.read_csv('./rawdata/150_240k.csv')
df4 = pd.read_csv('./rawdata/240_320k.csv')
dfs = pd.concat([df1,df2,df3,df4])
dfs.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 324059 entries, 0 to 84122
Data columns (total 10 columns):
Id             324059 non-null int64
abstract       296782 non-null object
title          323870 non-null object
authors        323871 non-null object
journal        324059 non-null object
journal_abv    324059 non-null object
date           116545 non-null object
affiliation    323922 non-null object
records        323647 non-null object
keywords       37883 non-null object
dtypes: int64(1), object(9)
memory usage: 27.2+ MB


In [5]:
dfs.head(3)

Unnamed: 0,Id,abstract,title,authors,journal,journal_abv,date,affiliation,records,keywords
0,24877594,"<div class=""abstr"">\n <h3>\n Abstract\n </h3>...",<h1>\n What we know and do not know about the ...,"['Malfitano AM', 'Basu S', 'Maresz K', 'Bifulc...",Seminars in immunology.,Semin Immunol,2014 Oct;26(5):369-79.,"\n Dipartimento di Medicina e Chirurgia, Unive...","<div class=""cit"">\n <a abstractlink=""yes"" alse...","<div class=""keywords"">\n <h4>\n KEYWORDS:\n <..."
1,16889837,"<div class=""abstr"">\n <h3>\n Abstract\n </h3>...",<h1>\n Allosteric agonists of 7TM receptors: e...,"['Langmead CJ', 'Christopoulos A']",Trends in pharmacological sciences.,Trends Pharmacol Sci,,\n Psychiatry Centre of Excellence for Drug Di...,"<div class=""cit"">\n <a abstractlink=""yes"" alse...",
2,31068464,"<div class=""abstr"">\n <h3>\n Abstract\n </h3>...",<h1>\n MRGPRX4 is a G protein-coupled receptor...,"['Meixiong J', 'Vasavda C', 'Snyder SH', 'Dong...",Proceedings of the National Academy of Science...,Proc Natl Acad Sci U S A,2019 May 21;116(21):10525-10530.,\n Solomon H. Snyder Department of Neuroscienc...,"<div class=""cit"">\n <a abstractlink=""yes"" alse...","<div class=""keywords"">\n <h4>\n KEYWORDS:\n <..."


## Data Cleaning:

In [4]:
# drop rows with no record or title information
df = dfs.dropna(subset=['records','title'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323646 entries, 0 to 84122
Data columns (total 10 columns):
Id             323646 non-null int64
abstract       296557 non-null object
title          323646 non-null object
authors        323646 non-null object
journal        323646 non-null object
journal_abv    323646 non-null object
date           116416 non-null object
affiliation    323646 non-null object
records        323646 non-null object
keywords       37845 non-null object
dtypes: int64(1), object(9)
memory usage: 27.2+ MB


In [47]:
"""
Cleaning procedure: 
    strip html tags
    get year from record
    
"""

def strip_html_tags(text):
    if type(text) != str:
        return # if no content, deal with NaN
    
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text.replace('\n','')

def get_year_record(record):
    """
    get publication missing date from record
    this will change record column to publication date
    there's no NaN record
    """
    try:
        date = date = re.findall(r'</a>\n (.*) doi', records)[0]
    except:
        date = re.findall(r'</a>\n(.*)\n</div>',record)[0]
    
    if len(date) > 3:
        dates = date.split(';')
        for x in dates[0].split():
            try:
                year = str(int(x))
                return year
            except:
                continue
    
    


In [6]:
%%time
# strip html tags
df.abstract = df.abstract.apply(strip_html_tags)
df.title = df.title.apply(strip_html_tags)
df.keywords = df.keywords.apply(strip_html_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


CPU times: user 2min 20s, sys: 764 ms, total: 2min 21s
Wall time: 2min 21s


In [54]:
# get year data from record and create another column called 'year'
records = df.records.apply(get_year_record)

In [56]:
df['year'] = records
df.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 323646 entries, 0 to 84122
Data columns (total 11 columns):
Id             323646 non-null int64
abstract       296557 non-null object
title          323646 non-null object
authors        323646 non-null object
journal        323646 non-null object
journal_abv    323646 non-null object
date           116416 non-null object
affiliation    323646 non-null object
records        323646 non-null object
keywords       37845 non-null object
year           323546 non-null object
dtypes: int64(1), object(10)
memory usage: 29.6+ MB


In [58]:
df = df.drop(['date','records'],axis=1)

In [59]:
df.to_csv('./Clean_data/clean_pub.csv')

### Check outcome after cleaning

In [61]:
year = df.year.dropna()
year.min(),year.max()

('1946', '2019')

In [62]:
df.head(2)

Unnamed: 0,Id,abstract,title,authors,journal,journal_abv,affiliation,keywords,year
0,24877594,Abstract It has been well appreciated tha...,What we know and do not know about the cannab...,"['Malfitano AM', 'Basu S', 'Maresz K', 'Bifulc...",Seminars in immunology.,Semin Immunol,"\n Dipartimento di Medicina e Chirurgia, Unive...",KEYWORDS: Cannabinoid receptor 2; Endocann...,2014
1,16889837,Abstract Approximately 1% of the genome o...,Allosteric agonists of 7TM receptors: expandi...,"['Langmead CJ', 'Christopoulos A']",Trends in pharmacological sciences.,Trends Pharmacol Sci,\n Psychiatry Centre of Excellence for Drug Di...,,2006
