In [None]:
import numpy as np 
import pandas as pd 
import spacy
from wordcloud import WordCloud
import pickle

In [None]:
# Load the headlines CSV, keep only the date and headline columns, and drop exact duplicate rows.
data=pd.read_csv("./india-news-headlines.csv")
data=data[['publish_date','headline_text']].drop_duplicates()
# Parse publish_date as YYYYMMDD. The month directive is %m; %M means *minutes*, which
# would leave the month at its default (January) for every row.
# NOTE(review): assumes publish_date is stored as YYYYMMDD -- confirm against the CSV.
data['publish_date']=pd.to_datetime(data['publish_date'],format="%Y%m%d")
data['year']=data['publish_date'].dt.year
# NOTE(review): the 'en' shortcut is only resolvable on spaCy 2.x installs; newer releases
# expect a full package name (e.g. 'en_core_web_sm') -- confirm the target environment.
nlp= spacy.load('en')

In [None]:
# The following code takes a really long time to run, so pickled versions of its results are saved and reloaded in the cells below.

### Get imp words by year
import sklearn.feature_extraction.text as text
def get_imp(bow,mf,ngram,top_n=100):
    """Return the most frequent n-grams found in a list of documents.

    Note that this uses raw occurrence counts (``CountVectorizer``), not TF-IDF weights.

    Parameters
    ----------
    bow : list of str
        Documents to count n-grams in.
    mf : int
        Passed to ``CountVectorizer`` as ``max_features``.
    ngram : int
        n-gram size; used as both the lower and upper bound of ``ngram_range``.
    top_n : int, default 100
        Number of highest-count n-grams to return.

    Returns
    -------
    pandas.Series
        Total count per n-gram, indexed by the n-gram text, sorted in descending
        order and truncated to ``top_n`` rows.
    """
    # The documents go to fit_transform only. Passing them to the constructor would bind
    # them to its first parameter (`input`), which expects 'filename'/'file'/'content'.
    vectorizer=text.CountVectorizer(ngram_range=(ngram,ngram),max_features=mf,stop_words='english')
    matrix=vectorizer.fit_transform(bow)
    # Newer scikit-learn exposes get_feature_names_out(); fall back to the older
    # get_feature_names() so the function runs on either API.
    if hasattr(vectorizer,"get_feature_names_out"):
        features=vectorizer.get_feature_names_out()
    else:
        features=vectorizer.get_feature_names()
    # Column sums of the document-term matrix give one total count per n-gram.
    counts=np.asarray(matrix.sum(axis=0)).ravel()
    return pd.Series(counts,index=features).sort_values(ascending=False).head(top_n)


### Global trends
# Most frequent unigrams, bigrams and trigrams over every headline in the corpus.
bow=data['headline_text'].tolist()
total_data,total_data_bigram,total_data_trigram=[
    get_imp(bow,mf=5000,ngram=size) for size in (1,2,3)
]


### Yearly trends
# Most frequent unigrams / bigrams / trigrams per publication year, keyed by year.
# Each year's headlines are extracted once and reused for all three n-gram sizes
# instead of re-filtering the frame in three separate loops.
imp_terms_unigram={}
imp_terms_bigram={}
imp_terms_trigram={}
for y in data['year'].unique():
    bow=data.loc[data['year']==y,'headline_text'].tolist()
    imp_terms_unigram[y]=get_imp(bow,mf=5000,ngram=1)
    imp_terms_bigram[y]=get_imp(bow,mf=5000,ngram=2)
    imp_terms_trigram[y]=get_imp(bow,mf=5000,ngram=3)

In [None]:
# Persist every computed result so the expensive n-gram counting need not be re-run.
# Each path is bound to `filename` (never to the result variables themselves), so the
# objects written are the actual Series / dicts rather than a path string.
results_by_path={
    './Pickle files/total_data.pkl':total_data,
    './Pickle files/total_data_bigram.pkl':total_data_bigram,
    './Pickle files/total_data_trigram.pkl':total_data_trigram,
    './Pickle files/imp_terms_unigram.pkl':imp_terms_unigram,
    './Pickle files/imp_terms_bigram.pkl':imp_terms_bigram,
    './Pickle files/imp_terms_trigram.pkl':imp_terms_trigram,
}
for filename,result in results_by_path.items():
    # `with` closes the file even if pickling fails part-way through.
    with open(filename,'wb') as outfile:
        pickle.dump(result,outfile)

In [None]:
# Reload the cached n-gram results from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created yourself.
total_data=pd.read_pickle('./Pickle files/total_data.pkl')
total_data_bigram=pd.read_pickle("./Pickle files/total_data_bigram.pkl")
total_data_trigram=pd.read_pickle("./Pickle files/total_data_trigram.pkl")

# The per-year dictionaries are read with the pickle module directly; `with` guarantees
# the file handle is released even when loading raises.
with open("./Pickle files/imp_terms_unigram.pkl","rb") as f:
    imp_terms_unigram=pickle.load(f)
with open("./Pickle files/imp_terms_bigram.pkl","rb") as f:
    imp_terms_bigram=pickle.load(f)
with open("./Pickle files/imp_terms_trigram.pkl","rb") as f:
    imp_terms_trigram=pickle.load(f)

In [None]:
### Common unigrams across all the years
# common_unigram[y] holds the terms that appear in the index of every yearly
# Series from 2001 through y+1 (a running intersection).
common_unigram={}
shared_so_far=set(imp_terms_unigram[2001].index)
for y in np.arange(2001,2017,1):
    shared_so_far=shared_so_far.intersection(imp_terms_unigram[y+1].index)
    common_unigram[y]=shared_so_far

        

### Common bigrams across all the years
# Running intersection: the entry for year y keeps only the bigrams found in the
# index of each yearly Series from 2001 up to and including y+1.
common_bigram={}
for y in np.arange(2001,2017,1):
    upcoming=set(imp_terms_bigram[y+1].index)
    carried=set(imp_terms_bigram[y].index) if y==2001 else common_bigram[y-1]
    common_bigram[y]=carried & upcoming


### Common trigrams, 1 year window
# Key "y-(y+1)" maps to the trigrams present in the index of both consecutive years.
common_trigram_1yr={}
for y in np.arange(2001,2017,1):
    this_year=set(imp_terms_trigram[y].index)
    next_year=set(imp_terms_trigram[y+1].index)
    common_trigram_1yr["{}-{}".format(y,y+1)]=this_year & next_year

    
### Common trigrams over windows of three consecutive years (y, y+1, y+2)
# Windows start every third year (2001, 2004, ..., 2013), so they do not overlap.
# Key "y-(y+1)-(y+2)" maps to the trigrams present in the index of all three years.
# The first window needs no special case: every window is computed the same way.
common_trigram_2yr={}
for y in np.arange(2001,2015,3):
    window=[set(imp_terms_trigram[y+offset].index) for offset in range(3)]
    common_trigram_2yr[str(y)+"-"+str(y+1)+"-"+str(y+2)]=set.intersection(*window)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Side-by-side bar charts of the 20 most frequent unigrams, bigrams and trigrams.
panels=(
    (total_data,"Unigrams"),
    (total_data_bigram,"Bigrams"),
    (total_data_trigram,"Trigrams"),
)
for position,(counts,heading) in enumerate(panels,start=1):
    plt.subplot(1,3,position)
    counts.head(20).plot(kind="bar",figsize=(25,10),colormap='Set2')
    plt.title(heading,fontsize=30)
    plt.yticks([])  # hide the count axis ticks
    plt.xticks(size=20)

In [None]:
# One horizontal bar chart per year (2001-2017) with that year's top 5 bigrams, on a 9x2 grid.
for slot,year in enumerate(range(2001,2018),start=1):
    plt.subplot(9,2,slot)
    top_bigrams=imp_terms_bigram[year].head(5)
    top_bigrams.plot(kind="barh",figsize=(20,25),colormap='Set2')
    plt.title(year,fontsize=20)
    plt.xticks([])
    plt.yticks(size=20,rotation=5)

In [None]:
# Top 5 trigrams for each year from 2001 to 2017, drawn as horizontal bars on a 9x2 grid.
grid_rows,grid_cols=9,2
for offset in range(1,18):
    plot_year=2000+offset
    plt.subplot(grid_rows,grid_cols,offset)
    yearly_top=imp_terms_trigram[plot_year].head(5)
    yearly_top.plot(kind="barh",figsize=(20,25),colormap="Set2")
    plt.title(plot_year,fontsize=20)
    plt.xticks([])
    plt.yticks(size=15,rotation=5)

In [None]:
## Count of common tokens across the years
# count_common_bi[year] gives, for every bigram in common_bigram[year], its summed
# count over the years 2001 .. year+1.
# The totals are built with aligned Series addition rather than by growing an empty,
# dtype-less pd.Series() one label at a time, so the result keeps a numeric dtype.
count_common_bi={}
for year in range(2001,2017,1):
    words=list(common_bigram[year])
    if year==2001:
        # First window: start from the 2001 counts themselves.
        running=imp_terms_bigram[year].loc[words]
    else:
        # Later windows: extend the previous running total (its index is a superset of `words`).
        running=count_common_bi[year-1].loc[words]
    count_common_bi[year]=running+imp_terms_bigram[year+1].loc[words]

In [None]:
# Top 10 running bigram totals for each cumulative window, drawn on a 9x2 grid.
for slot,first_year in enumerate(range(2001,2017),start=1):
    plt.subplot(9,2,slot)
    ranked=count_common_bi[first_year].sort_values(ascending=False)
    ranked.head(10).plot(kind="barh",figsize=(20,35),colormap="Set2")
    last_year=first_year+1
    # Only the first panel names both ends of its window; the rest are labelled by end year.
    heading=str(first_year)+"-"+str(last_year) if first_year==2001 else "upto-"+str(last_year)
    plt.title(heading,fontsize=30)
    plt.xticks([])
    plt.yticks(size=20,rotation=5)

In [None]:
## Story of 'year old'
# Select headlines that contain both whole words "year" and "old" (in any order; the two
# lookaheads make the anchored match behave like a "contains both" test), then collect
# every token spaCy tags as NOUN or VERB in those headlines.
# na=False keeps the mask strictly boolean if a headline is missing.
index=data['headline_text'].str.match(r'(?=.*\byear\b)(?=.*\bold\b).*$',na=False)
texts=data['headline_text'].loc[index].tolist()
noun=[]
verb=[]
# n_threads is intentionally not passed: it is deprecated in spaCy 2.1+ and no longer
# accepted by later releases.
for doc in nlp.pipe(texts,batch_size=10000):
    try:
        for c in doc:
            if c.pos_=="NOUN":
                noun.append(c.text)
            elif c.pos_=="VERB":
                verb.append(c.text)
    except Exception:
        # Best-effort: a document that cannot be read contributes an empty placeholder.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        noun.append("")
        verb.append("")

In [None]:
# Two panels: the 10 most frequent nouns and verbs found in the 'year old' headlines.
panels=(
    (noun,"Top 10 Nouns in context of 'Year Old'"),
    (verb,"Top 10 Verbs in context of 'Year Old'"),
)
for position,(tokens,heading) in enumerate(panels,start=1):
    plt.subplot(1,2,position)
    frequencies=pd.Series(tokens).value_counts()
    frequencies.head(10).plot(kind="bar",figsize=(20,5),colormap="Set2")
    plt.title(heading,fontsize=30)
    plt.xticks(size=20,rotation=80)
    plt.yticks([])

In [None]:
# Show the first 20 headlines selected by the `index` mask.
data.loc[index,'headline_text'].iloc[:20].tolist()

In [None]:
# Headlines containing both whole words "commits" and "suicide" (case-sensitive here),
# and the nouns spaCy finds in them.
index_s=data['headline_text'].str.match(r'(?=.*\bcommits\b)(?=.*\bsuicide\b).*$',na=False)
# Select with index_s -- the mask built on the line above -- not with the unrelated
# `index` mask, which would analyse a different set of headlines.
text_s=data['headline_text'].loc[index_s].tolist()
noun_s=[]
# n_threads is intentionally not passed: it is deprecated in spaCy 2.1+ and no longer
# accepted by later releases.
for doc in nlp.pipe(text_s,batch_size=1000):
    try:
        for c in doc:
            if c.pos_=='NOUN':
                noun_s.append(c.text)
    except Exception:
        # Best-effort fallback kept from the original: one empty placeholder per token.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        for c in doc:
            noun_s.append("")

In [None]:
# Bar chart of the 20 most frequent nouns collected in noun_s.
# `kind` must be passed by keyword: Series.plot() rejects positional arguments in current pandas.
pd.Series(noun_s).value_counts().head(20).plot(kind="bar",figsize=(15,5),colormap="Set2")
plt.xticks(fontsize=20)
plt.yticks([])
plt.ylabel("Frequency")
plt.title("Frequency of Nouns in the context of 'Commits Suicide'",fontsize=30)

In [None]:
# Case-insensitive mask of headlines containing both "commits" and "suicide".
index_s=data['headline_text'].str.match(r'(?=.*\bcommits\b)(?=.*\bsuicide\b).*$',case=False,na=False)
# Within those headlines, flag the ones mentioning "farmer" / "student" anywhere in the text.
# str.contains is used because str.match only tests the *start* of each string and would
# miss headlines such as "debt-ridden farmer commits suicide".
suicide_headlines=data.loc[index_s,'headline_text']
index_farmer=suicide_headlines.str.contains(r'farmer',case=False,na=False)
index_stu=suicide_headlines.str.contains(r'student',case=False,na=False)

In [None]:
# Share of suicide headlines that mention students, as a whole percentage.
# Formatting the scaled value avoids float artefacts (e.g. 7.000000000000001) that
# appear when a rounded ratio is multiplied by 100.
print("Approximately {:.0f} percent of suicides reported were student related".format(100*np.sum(index_stu)/np.sum(index_s)))

In [None]:
# Share of suicide headlines that mention farmers, as a whole percentage.
# Formatting the scaled value avoids float artefacts (e.g. 7.000000000000001) that
# appear when a rounded ratio is multiplied by 100.
print("Approximately {:.0f} percent of suicides reported were farmer related".format(100*np.sum(index_farmer)/np.sum(index_s)))

In [None]:
# Mask of headlines mentioning "farmer"/"farmers" anywhere in the text (case-insensitive).
# str.contains is used because str.match only tests the start of each headline.
ind_farmer=data['headline_text'].str.contains(r'farmer|farmers',case=False,na=False)

In [None]:
# Collect every token tagged NOUN or VERB by spaCy in the headlines selected by ind_farmer.
text_f=data.loc[ind_farmer,'headline_text'].tolist()
noun_f=[]
verb_f=[]
# n_threads is intentionally not passed: it is deprecated in spaCy 2.1+ and no longer
# accepted by later releases.
for doc in nlp.pipe(text_f,batch_size=1000):
    try:
        for c in doc:
            if c.pos_=='NOUN':
                noun_f.append(c.text)
            elif c.pos_=="VERB":
                verb_f.append(c.text)
    except Exception:
        # Best-effort fallback kept from the original: one empty placeholder per token.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        for c in doc:
            noun_f.append("")
            verb_f.append("")

In [None]:
# Two panels: most frequent nouns and verbs in the farmer headlines.
noun_counts=pd.Series(noun_f).value_counts()
verb_counts=pd.Series(verb_f).value_counts()
panels=(
    # The two most frequent nouns are skipped (presumably the search terms themselves -- verify).
    (noun_counts[2:],"Top 10 Nouns in the context of 'Farmer(s)'"),
    (verb_counts,"Top 10 Verbs in the context of 'Farmer(s)'"),
)
for position,(counts,heading) in enumerate(panels,start=1):
    plt.subplot(1,2,position)
    counts.head(10).plot(kind="bar",figsize=(20,5),colormap="Set2")
    plt.title(heading,fontsize=25)
    plt.xticks(size=20,rotation=80)
    plt.yticks([])