# CONTIDIONAL FREQUENCY DISTRIBUTION


Let's look at how some specific words are used over time. In order to do this, NLTK's Conditional Frequency Distribution will be used. A conditional frequency distribution is a collection of frequency distributions, each one for a different "condition".
This code checks if the words start with either of the "targets" (words we want to see over the time). 

In [48]:
#import data
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from collections import Counter
import inflection as inf
from inflection import singularize
import re

data = pd.read_csv(r'/home/osnat/Documents/writing in progress/The Last decade of Biomaterials research /Data_Claudia/ahm/ahmfinal.csv')
df= pd.DataFrame(data)

In [49]:
df['date'] = [x[2:6] for x in df['advanced']] #create a dataframe for the date
df[:7]

Unnamed: 0,advanced,date
0,['2012 mar 22708076 fabrication of a hybrid mi...,2012
1,['2012 jul 23061030 dual imaging enabled cance...,2012
2,['2013 jan 23184367 eradicating antibiotic res...,2013
3,['2013 jan 23184402 sirna transfection with ca...,2013
4,['2013 jan 23184404 towards smart tattoos impl...,2013
5,['2013 may 23184424 physicochemical cytotoxic ...,2013
6,['2013 jan 23184425 enhanced skin adhesive pat...,2013


In [50]:
#create a column with pmids
df['pmid'] = [x[11:19] for x in df['advanced']] 
df[:7]

Unnamed: 0,advanced,date,pmid
0,['2012 mar 22708076 fabrication of a hybrid mi...,2012,22708076
1,['2012 jul 23061030 dual imaging enabled cance...,2012,23061030
2,['2013 jan 23184367 eradicating antibiotic res...,2013,23184367
3,['2013 jan 23184402 sirna transfection with ca...,2013,23184402
4,['2013 jan 23184404 towards smart tattoos impl...,2013,23184404
5,['2013 may 23184424 physicochemical cytotoxic ...,2013,23184424
6,['2013 jan 23184425 enhanced skin adhesive pat...,2013,23184425


In [51]:
#now that we have the dates in our dataframe, we clean the data by deleting all digits and puntuaction signs. 
#words with less than 3 characters will also be removed
def preprocess(text):
    text = re.sub('[^a-zA-Z ]','' , text) #remove puntuaction signs and digits 
    text = re.sub(r'\b\w{1,3}\b','' , text) #remove all words with less than 4 letters
    return text

In [52]:
df['advanced'] = df['advanced'].apply(lambda x:preprocess(x))
df[:7]

Unnamed: 0,advanced,date,pmid
0,fabrication hybrid microfluidic system in...,2012,22708076
1,dual imaging enabled cancer targeting nanop...,2012,23061030
2,eradicating antibiotic resistant biofilms w...,2013,23184367
3,sirna transfection with calcium phosphate n...,2013,23184402
4,towards smart tattoos implantable biosensor...,2013,23184404
5,physicochemical cytotoxic dermal release f...,2013,23184424
6,enhanced skin adhesive patch with modulus t...,2013,23184425


In [53]:
#create the list of stopwords
stop = stopwords.words('english')
newstopwords = ["report","summarize","review","demonstated","significantly","efficiently","appeared","loosening", "['","using","based",".","statement","significance","result","results","used","application ","release","effect","study","significant","showed","p","also","model","models"]
stop.extend(newstopwords)

In [54]:
df['advanced'] = df['advanced'].apply(lambda x: ' '.join([inf.singularize(word) for word in x.split() if word not in (stop)]))
df[:7] #check the clean text

Unnamed: 0,advanced,date,pmid
0,fabrication hybrid microfluidic system incorpo...,2012,22708076
1,dual imaging enabled cancer targeting nanopart...,2012,23061030
2,eradicating antibiotic resistant biofilm silve...,2013,23184367
3,sirna transfection calcium phosphate nanoparti...,2013,23184402
4,toward smart tattoo implantable biosensor cont...,2013,23184404
5,physicochemical cytotoxic dermal feature novel...,2013,23184424
6,enhanced skin adhesive patch modulu tunable co...,2013,23184425


In [55]:
import numpy as np
from itertools import chain

# return list from series of space strings
def chainer(s):
    return list(chain.from_iterable(s.str.split()))

# calculate lengths of splits (number of words per abstract!)
lens = df['advanced'].str.split().map(len)

In [56]:
#create new dataframewith one line per word

df = pd.DataFrame({'advanced': chainer(df['advanced']), 
                    'date': np.repeat(df['date'], lens), 
                    'pmid': np.repeat(df['pmid'], lens)
                    })

df[:10]

Unnamed: 0,advanced,date,pmid
0,fabrication,2012,22708076
0,hybrid,2012,22708076
0,microfluidic,2012,22708076
0,system,2012,22708076
0,incorporating,2012,22708076
0,lithographically,2012,22708076
0,patterned,2012,22708076
0,microchannel,2012,22708076
0,fiber,2012,22708076
0,formed,2012,22708076


In [57]:
#get rid of duplicates (= no repetition of terms within abstracts, unique per abstract)


df_u = df.drop_duplicates()

#calculate how many lines were dropped
print(len(df)-len(df_u))

72366


In [89]:
#calculate appearance per year into a new dataframe



df_cal = df_u.sort_values(['date']).groupby(['advanced', 'date']).count().reset_index()
df_cal = df_cal.pivot(index= 'advanced', columns = 'date', values = 'pmid').reset_index()

df_cal[:7]

date,advanced,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,aaghmon,,,,,,,,,1.0
1,aapc,,,,,,,,1.0,
2,abate,,,,,,1.0,,,
3,abbreviated,,,,1.0,,1.0,,2.0,1.0
4,abca,,,1.0,,,,,,
5,abdominal,,,1.0,1.0,2.0,2.0,3.0,,3.0
6,aberrant,,,,,,2.0,,,


In [93]:
#create a concise df with selected keywords from a list

term_list = ('silk', 'hydrogel', 'bone', 'bioprinting') 

df_final = df_cal.loc[df_cal['advanced'].isin(term_list)]
df_final[:7]

date,advanced,2012,2013,2014,2015,2016,2017,2018,2019,2020
1389,bioprinting,,,,1.0,7.0,9.0,14.0,8.0,21.0
1508,bone,3.0,12.0,11.0,30.0,41.0,32.0,34.0,39.0,39.0
5745,hydrogel,12.0,23.0,28.0,34.0,53.0,46.0,58.0,59.0,81.0
11766,silk,1.0,3.0,1.0,7.0,3.0,7.0,13.0,5.0,12.0
