In [379]:
import re
import pandas as pd
import numpy as np
import spacy

In [380]:
def clean(text):
    
    text = re.sub('[0-9]+.\t','',str(text))
    text = re.sub('\n ','',str(text))
    text = re.sub('\n',' ',str(text))
    text = re.sub("'s",'',str(text))
    text = re.sub("-",' ',str(text))
    text = re.sub("— ",'',str(text))
    text = re.sub('\"','',str(text))
    text = re.sub("Mr\.",'Mr',str(text))
    text = re.sub("Mrs\.",'Mrs',str(text))
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))
    
    return text

In [381]:
def sentences(text):
    text = re.split('[.?!]', text)
    clean_sent = []
    for sent in text:
        clean_sent.append(sent)
    return clean_sent

In [382]:
# Dataframe
df = pd.read_csv('.\datasets_AI\PM_Modi_speeches.csv')
df = df.drop(columns=['url'], axis=1)
df = df[df.lang == 'en']

# clean speech
df['Speech_clean'] = df['text'].apply(clean)

# seperate sentences
df['sent'] = df['Speech_clean'].apply(sentences)

#reset indices for removed stuff
df.reset_index(inplace=True)
df.drop(['index'], axis = 1, inplace = True)

df.head()

Unnamed: 0,date,title,lang,words,text,Speech_clean,sent
0,"Aug 30, 2020",PM’s address in the 15th Episode of ‘Mann Ki B...,en,21619,"My dear countrymen, Namaskar.\nGenerally, this...","My dear countrymen, Namaskar. Generally, this ...","[My dear countrymen, Namaskar, Generally, thi..."
1,"Aug 29, 2020",PM’s address at inauguration of the College an...,en,10128,Our country’s Agriculture Minister Shri Narend...,Our country’s Agriculture Minister Shri Narend...,[Our country’s Agriculture Minister Shri Naren...
2,"Aug 27, 2020",PM’s address at seminar on Atmanirbhar Bharat ...,en,8497,"My cabinet colleague, Shri Rajnath ji, Chief o...","My cabinet colleague, Shri Rajnath ji, Chief o...","[My cabinet colleague, Shri Rajnath ji, Chief ..."
3,"Aug 15, 2020",PM’s address to the Nation from the ramparts o...,en,50260,"My dear countrymen,\nCongratulations and many ...","My dear countrymen, Congratulations and many b...","[My dear countrymen, Congratulations and many ..."
4,"Aug 13, 2020",PM’s address at the Launch of ‘Transparent Tax...,en,11908,The process of Structural Reforms going on in ...,The process of Structural Reforms going on in ...,[The process of Structural Reforms going on in...


In [383]:
df2 = pd.DataFrame(columns=['sent','date','len'])

row_list = []

for i in range(len(df)):
    for sent in df.at[i,'sent']:
    
        wordcount = len(sent.split())
        date = df.at[i,'date']

        dict1 = {'date' : date, 'sent' : sent, 'len' : wordcount}
        row_list.append(dict1)
    

df2 = pd.DataFrame(row_list)
df2.head()

Unnamed: 0,date,sent,len
0,"Aug 30, 2020","My dear countrymen, Namaskar",4
1,"Aug 30, 2020","Generally, this period is full of festivals; ...",20
2,"Aug 30, 2020","During these times of Corona crises, on the o...",31
3,"Aug 30, 2020","Broadly speaking in a way, there is a feeling...",13
4,"Aug 30, 2020",People are getting along with their day to da...,19


In [384]:
nlp = spacy.load('en_core_web_md')

In [385]:
p = df2['sent'].tolist()
docx = nlp(p[np.random.randint(1, len(df2))])
docy = nlp(p[np.random.randint(1, len(df2))])
x = docx.similarity(docy)

[(docx, docy), x]

[( I propose that a mock Parliament be organized around the 15th August in Delhi comprising one young representatives selected from every district of India who would participate and deliberate on how a new India could be formed in the next five years,
   Similarly, India’s ancient heritage of yoga stands for holistic living that is in tune with nature),
 0.9332822639247893]

In [386]:
def dictfy(d1, t1):
    sendict = dict()
    for key in t1:
        sendict[key] = []
        for word in d1:
            if word.dep_ == key:
                sendict[key].append(word)
    return sendict


In [407]:
def filter(x1):
    pq = list(x1.keys())
    for key in pq:
        if ('obj' not in key) and ('subj' not in key):
            x1.pop(key)

In [408]:
if x > 0.925:
    d1 = dictfy(docx, set([token.dep_ for token in docx]))
    d2 = dictfy(docy, set([token.dep_ for token in docy]))
    print(d1, d2)
else:
    print([(docx, docy), x], "sentences not similar enough")

{'aux': [would, could], 'nsubjpass': [Parliament, India], 'amod': [mock, 15th, young, new, next], 'det': [a, the, every, a, the], 'prep': [around, in, from, of, on, in], 'appos': [August], 'pobj': [district, India, years], 'ccomp': [organized], 'dep': [ ,  ], 'nummod': [one, five], 'conj': [deliberate], 'advmod': [how], 'acl': [selected], 'dobj': [representatives], 'relcl': [participate], 'mark': [that], 'auxpass': [be, be], 'nsubj': [I, Delhi, who], 'pcomp': [comprising, formed], 'ROOT': [propose], 'cc': [and]} {'relcl': [is], 'amod': [ancient, holistic], 'pobj': [yoga, living, tune, nature], 'punct': [,], 'nsubj': [heritage, that], 'dep': [ ], 'poss': [India], 'advmod': [Similarly], 'ROOT': [stands], 'prep': [of, for, in, with], 'case': [’s]}


In [409]:
filter(d1)
filter(d2)
print(d1, d2)

{'nsubjpass': [Parliament, India], 'pobj': [district, India, years], 'dobj': [representatives], 'nsubj': [I, Delhi, who]} {'pobj': [yoga, living, tune, nature], 'nsubj': [heritage, that]}
