# Hypothesis / target to achieve in this project

1. Look at the top 50 mentions in the dataset
2. Find the top 50 most frequently used hashtags
3. Find the sentence having the mations of pm
4. Use the prepositions to extract relevant information from the tweetseets

# Load the Data

In [34]:
import pandas as pd
import spacy
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
df= pd.read_csv("Demonatization_tweets.csv",encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text
0,1,RT @rssurjewala: Critical question: Was PayTM ...
1,2,RT @Hemant_80: Did you vote on #Demonetization...
2,3,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,4,RT @ANI_news: Gurugram (Haryana): Post office ...
4,5,RT @satishacharya: Reddy Wedding! @mail_today ...


In [4]:
text = "RT @rssurjewala: Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;…"

rt_remove = re.sub('RT',' ',text)

In [5]:
rt_remove 

"  @rssurjewala: Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;…"

In [6]:
df['text']=df['text'].apply(lambda x: re.sub('RT ','',x))
df['text']=df['text'].apply(lambda x: re.sub('&amp ','&',x))
df['text']=df['text'].apply(lambda x: re.sub('<U\+[A-Z0-9]+>','',x))
df['text']=df['text'].apply(lambda x: re.sub('<ed>+','',x))

In [7]:
df

Unnamed: 0.1,Unnamed: 0,text
0,1,@rssurjewala: Critical question: Was PayTM inf...
1,2,@Hemant_80: Did you vote on #Demonetization on...
2,3,"@roshankar: Former FinSec, RBI Dy Governor, CB..."
3,4,@ANI_news: Gurugram (Haryana): Post office emp...
4,5,@satishacharya: Reddy Wedding! @mail_today car...
...,...,...
14935,14936,@saxenavishakha: Ghost of demonetization retur...
14936,14937,N d modi fans-d true nationalists of the count...
14937,14938,@bharat_builder: Lol. Demonetization has fixed...
14938,14939,@Stupidosaur: @Vidyut B team of BJP. CIA baby....


# ________________________________________________________________________________________

# 1. Look at the top 50 mentions in the dataset 

In [8]:
def mention(x):
    found=re.findall(r'@\w+',x)
    if found:
        return found
    return None

In [9]:
arr=df['text'].apply(lambda x : mention(x))

In [10]:
arr

0                       [@rssurjewala]
1                         [@Hemant_80]
2                         [@roshankar]
3                          [@ANI_news]
4        [@satishacharya, @mail_today]
                     ...              
14935                [@saxenavishakha]
14936                             None
14937                [@bharat_builder]
14938          [@Stupidosaur, @Vidyut]
14939                        [@Vidyut]
Name: text, Length: 14940, dtype: object

In [11]:
mentions_arr=[]

for x in arr:
    if x != None:
        mentions_arr.extend(x)

In [12]:
mentions_arr[:50]

['@rssurjewala',
 '@Hemant_80',
 '@roshankar',
 '@ANI_news',
 '@satishacharya',
 '@mail_today',
 '@DerekScissors1',
 '@ambazaarmag',
 '@gauravcsawant',
 '@Joydeep_911',
 '@sumitbhati2002',
 '@narendramodi',
 '@narendramodi',
 '@Joydas',
 '@Jaggesh2',
 '@Atheist_Krishna',
 '@sona2905',
 '@Dipankar_cpiml',
 '@roshankar',
 '@Atheist_Krishna',
 '@pGurus1',
 '@roshankar',
 '@Hemant_80',
 '@roshankar',
 '@Atheist_Krishna',
 '@MahikaInfra',
 '@narendramodi',
 '@Swamy39',
 '@Hemant_80',
 '@roshankar',
 '@kapil_kausik',
 '@roshankar',
 '@kapil_kausik',
 '@AAPVind',
 '@naam_pk',
 '@Hemant_80',
 '@Gadgets360',
 '@_avenu',
 '@DonMu',
 '@DrGPradhan',
 '@minimathur',
 '@mayankjain100',
 '@kapil_kausik',
 '@Atheist_Krishna',
 '@arunjaitley',
 '@narendramodi',
 '@kanimozhi',
 '@roshankar',
 '@DrGPradhan',
 '@minimathur']

# Find the top 50 most frequently used hashtags

In [13]:
df['hashtags']=df['text'].apply(lambda x: re.findall('#\w+', x))

In [14]:
df[['text','hashtags']].head()

Unnamed: 0,text,hashtags
0,@rssurjewala: Critical question: Was PayTM inf...,[#Demonetization]
1,@Hemant_80: Did you vote on #Demonetization on...,[#Demonetization]
2,"@roshankar: Former FinSec, RBI Dy Governor, CB...",[#Demonetization]
3,@ANI_news: Gurugram (Haryana): Post office emp...,[#demonetization]
4,@satishacharya: Reddy Wedding! @mail_today car...,"[#demonetization, #ReddyWedding]"


In [15]:
df['hashtags'].value_counts().head(50)

[]                                                                      5251
[#demonetization]                                                       2882
[#Demonetization]                                                       2488
[#DeMonetization]                                                        808
[#nitishkumar, #Demonetization]                                          257
[#India, #demo]                                                          173
[#GLvMI]                                                                 145
[#CorruptionFreeIndia, #Demonetization]                                  103
[#VijayMallya]                                                            86
[#Demonetization, #NoMoneyYaar]                                           53
[#Demonetization, #Demonetizat]                                           47
[#SonuNigam]                                                              47
[#Demonetization, #Insights]                                              44

#  3. Find the sentence having the mations of pm

In [16]:
# importing spacy and its matcher function
import spacy
from spacy.matcher import Matcher

# load english language model
nlp = spacy.load('en_core_web_sm',disable=['ner','textcat'])

In [17]:
# function to find sentences mentioning president
def find_names(text):

    names = []

    # spacy doc
    doc = nlp(text)

    # pattern
    pattern = [{'LOWER': 'modi'}]


    # Matcher class object
    matcher = Matcher(nlp.vocab)
    matcher.add("names",[pattern])

    matches = matcher(doc)

    # finding patterns in the text
    for i in range(0,len(matches)):

        # match: id, start, end
        token = doc[matches[i][1]:matches[i][2]]
        # append token to list
        names.append(str(token))

    return names

In [18]:
# extracting sentences with president's mentions
extracted_sentences = []
for i in range(df.shape[0]):
    extracted_sentences.append(find_names(df['text'][i]))

# storing the extracted sentences in the dataframe
df['Prime_minister'] = extracted_sentences

In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,hashtags,Prime_minister
0,1,@rssurjewala: Critical question: Was PayTM inf...,[#Demonetization],[]
1,2,@Hemant_80: Did you vote on #Demonetization on...,[#Demonetization],[Modi]
2,3,"@roshankar: Former FinSec, RBI Dy Governor, CB...",[#Demonetization],[]
3,4,@ANI_news: Gurugram (Haryana): Post office emp...,[#demonetization],[]
4,5,@satishacharya: Reddy Wedding! @mail_today car...,"[#demonetization, #ReddyWedding]",[]


In [20]:
mention = []
for i in range(df.shape[0]):
    if df['Prime_minister'][i] != []:
        mention.append(df['Prime_minister'][i])

mention

[['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi', 'Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 ['Modi'],
 [

In [21]:
sentences_pm = []
for i in range(df.shape[0]):
    if df['Prime_minister'][i] != []:
        sentences_pm.append(df['text'][i])

sentences_pm

['@Hemant_80: Did you vote on #Demonetization on Modi survey app?',
 '@Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r',
 '@Dipankar_cpiml: The Modi app on #DeMonetization proves once again that the govt is totally indifferent to the mounting misery and hards\x85',
 '@Hemant_80: Did you vote on #Demonetization on Modi survey app?',
 '@Hemant_80: Did you vote on #Demonetization on Modi survey app?',
 '@Hemant_80: Did you vote on #Demonetization on Modi survey app?',
 '@kanimozhi: Ts is exactly what Pappu &amp; opposition has done to themselves by opposing #Demonetization Now none can stop Modi bandwagon ti\x85',
 '@pGurus1: #Demonetization move of Modi; Who is supporting it, who is opposing it and why? A complete coverage of all parties - https://t\x85',
 '@Joydas: Question in Narendra Modi App where PM is taking feedback if people support his #DeMonetization strategy https://t.co/pYgK8Rmg7r',
 '@ka

In [22]:
sentences_pm[100]

'@Hemant_80: Did you vote on #Demonetization on Modi survey app?'

# 2. Use the prepositions to extract relevant information from the tweets

In [23]:

def rule3(text):

    doc = nlp(text)

    sent = []

    for token in doc:

        
        if (token.pos_=='VERB'):

            phrase =''

            
            for sub_tok in token.lefts:

                if (sub_tok.dep_ in ['nsubj','nsubjpass']) and (sub_tok.pos_ in ['NOUN','PROPN','PRON']):

                    
                    phrase += sub_tok.text

                    
                    phrase += ' '+token.lemma_

                    
                    for sub_tok in token.rights:

                        
                        if (sub_tok.dep_ in ['dobj']) and (sub_tok.pos_ in ['NOUN','PROPN']):

                            phrase += ' '+sub_tok.text
                            sent.append(phrase)

    return sent

In [28]:

row_list = []

for i in range(len(df)):

    sent = df.loc[i,'text']
    output = rule3(sent)
    dict1 = {'text':sent,'Output':output}
    row_list.append(dict1)

df_rule3 = pd.DataFrame(row_list)

In [29]:
df_rule3

Unnamed: 0,text,Output
0,@rssurjewala: Critical question: Was PayTM inf...,[PayTM inform edict]
1,@Hemant_80: Did you vote on #Demonetization on...,[]
2,"@roshankar: Former FinSec, RBI Dy Governor, CB...",[]
3,@ANI_news: Gurugram (Haryana): Post office emp...,[employees provide exchange]
4,@satishacharya: Reddy Wedding! @mail_today car...,[]
...,...,...
14935,@saxenavishakha: Ghost of demonetization retur...,[]
14936,N d modi fans-d true nationalists of the count...,"[modi think https://t.co/9mgMEFu2sl, modi thin..."
14937,@bharat_builder: Lol. Demonetization has fixed...,"[Demonetization fix lot, pelters join crunch]"
14938,@Stupidosaur: @Vidyut B team of BJP. CIA baby....,[CCTV support strikes]


In [35]:
df_show = pd.DataFrame(columns=df_rule3.columns)

for row in range(len(df_rule3)):

    if len(df_rule3.loc[row,'Output'])!=0:
        df_show = df_show.append(df_rule3.loc[row,:])


df_show.reset_index(inplace=True, drop=True)

In [36]:
df_show.head()

Unnamed: 0,text,Output
0,@rssurjewala: Critical question: Was PayTM inf...,[PayTM inform edict]
1,@ANI_news: Gurugram (Haryana): Post office emp...,[employees provide exchange]
2,National reform now destroyed even the essence...,[reform destroy essence]
3,@Joydas: Question in Narendra Modi App where P...,"[PM take feedback, people support strategy]"
4,@Jaggesh2 Bharat band on 28??Those who are pr...,[who protest demonetization]


In [38]:
prep_dict = dict()
dis_dict = dict()
dis_list = []


for i in range(len(df_show)):

    
    sentence = df_show.loc[i,'text']
   
    
    output = df_show.loc[i,'Output']


    for sent in output:

        
        n1, p, n2 = sent.split()[0], sent.split()[1], sent.split()[2:]

        
        dis_dict = {'text':text,'Noun1':n1,'Preposition':p,'Noun2':n2}
        dis_list.append(dis_dict)

        
        prep = sent.split()[1]
        if prep in prep_dict:
            prep_dict[prep]+=1
        else:
            prep_dict[prep]=1

df_sep= pd.DataFrame(dis_list)

In [39]:
df_sep.head()

Unnamed: 0,text,Noun1,Preposition,Noun2
0,RT @rssurjewala: Critical question: Was PayTM ...,PayTM,inform,[edict]
1,RT @rssurjewala: Critical question: Was PayTM ...,employees,provide,[exchange]
2,RT @rssurjewala: Critical question: Was PayTM ...,reform,destroy,[essence]
3,RT @rssurjewala: Critical question: Was PayTM ...,PM,take,[feedback]
4,RT @rssurjewala: Critical question: Was PayTM ...,people,support,[strategy]


In [40]:
df_sep['Preposition'].value_counts()[:10]

@narendramodi    997
support          610
have             451
inform           288
get              265
take             239
claim            187
end              157
make             156
give             151
Name: Preposition, dtype: int64

In [42]:
df_sep[df_sep['Preposition'] == 'support'].head(10)

Unnamed: 0,text,Noun1,Preposition,Noun2
4,RT @rssurjewala: Critical question: Was PayTM ...,people,support,[strategy]
20,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
22,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
34,RT @rssurjewala: Critical question: Was PayTM ...,people,support,[strategy]
45,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
60,RT @rssurjewala: Critical question: Was PayTM ...,nitishkumar,support,[@narendramodi]
65,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
82,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
106,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
114,RT @rssurjewala: Critical question: Was PayTM ...,%,support,[ban]
