In [86]:
import re
import pandas as pd
import numpy as np
import spacy

In [87]:
def clean(text): 
    text = re.sub('[0-9]+.\t' + '...','',str(text))
    text = re.sub('\n ','',str(text))
    text = re.sub('\n',' ',str(text))
    text = re.sub("'s",'',str(text))
    text = re.sub("-",' ',str(text))
    text = re.sub("— ",'',str(text))
    text = re.sub('\"','',str(text))
    text = re.sub("Mr\.",'Mr',str(text))
    text = re.sub("Mrs\.",'Mrs',str(text))
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))
    return text

In [88]:
def sentences(text):
    text = re.split('[.?!]', text)
    clean_sent = []
    for sent in text:
        clean_sent.append(sent)
    return clean_sent

In [89]:
# Dataframe
df = pd.read_csv(r'.\datasets_AI\news.csv')
df = df.drop(columns=['url', 'hostname', 'timestamp'], axis=1)
df = df[df.story == 'dABGVITQs6X1I4MdYGnX9zY59PpVM']
# clean speech
df['Speech_clean'] = df['main_content'].apply(clean)

# seperate sentences
df['sent'] = df['Speech_clean'].apply(sentences)

#reset indices for removed stuff
df.reset_index(inplace=True)
df.drop(['Unnamed: 0', 'index'], axis = 1, inplace = True)

df.head()

Unnamed: 0,id,title,publisher,category,story,main_content,main_content_len,Speech_clean,sent
0,22237,"The Incredibles 2, Cars 3 in the works, Disney...",Digital Spy,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,The Incredibles 2 and Cars 3 are in developmen...,1340.0,The Incredibles 2 and Cars 3 are in developmen...,[The Incredibles 2 and Cars 3 are in developme...
1,22241,The Incredibles are set for another big-screen...,Belfast Telegraph,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,"Incredibles 2, Cars 3 in the works BelfastTele...",1620.0,"Incredibles 2, Cars 3 in the works BelfastTele...","[Incredibles 2, Cars 3 in the works BelfastTel..."
2,22244,Pixar Working On Sequels For Popular Animated ...,Online News Heard Now,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,Posted by News\n\nPixar Working On Sequels For...,1339.0,Posted by News Pixar Working On Sequels For P...,[Posted by News Pixar Working On Sequels For ...
3,22248,"State Of The (Disney) Union: Cars 3, Incredibl...",Contactmusic.com,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,In news you didn’t know you needed until right...,1737.0,In news you didn’t know you needed until right...,[In news you didn’t know you needed until righ...
4,22249,Disney Pixar confirm The Incredibles 2,Total Film,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,The first footage from Incredibles 2 (there's ...,2942.0,The first footage from Incredibles 2 was show...,[The first footage from Incredibles 2 was sho...


In [90]:
df2 = pd.DataFrame(columns=['sent','id','len'])

row_list = []

for i in range(len(df)):
    for sent in df.at[i,'sent']:
    
        wordcount = len(sent.split())
        id = df.at[i,'id']

        dict1 = {'id' : id, 'sent' : sent, 'len' : wordcount}
        row_list.append(dict1)
    

df2 = pd.DataFrame(row_list)
df2.head()

Unnamed: 0,id,sent,len
0,22237,The Incredibles 2 and Cars 3 are in developmen...,12
1,22237,The movie sequels were confirmed by Disney P...,11
2,22237,Advertisement Continue Reading Below Ig...,30
3,22237,Pixar is also currently working on Finding N...,17
4,22237,"Meanwhile, yesterday it was reported that Pi...",15


In [91]:
nlp = spacy.load('en_core_web_md')

In [92]:
p = df2['sent'].tolist()
docx = nlp(p[np.random.randint(1, len(df2))])
docy = nlp(p[np.random.randint(1, len(df2))])
x = docx.similarity(docy)

[(docx, docy), x]

[( The film performed very well at the box office, grossing $631 million worldwide during its original theatrical run,
   Brad Bird has remained open to the possibility of directing a follow up, but has said that he’d only do it if he could honor the original),
 0.834841800489626]

In [93]:
def dictfy(d1, t1):
    sendict = dict()
    for key in t1:
        sendict[key] = []
        for word in d1:
            if word.dep_ == key:
                sendict[key].append(str(word).lower())
    return sendict


In [94]:
def filter(x1):
    pq = list(x1.keys())
    for key in pq:
        if ('obj' not in key) and ('subj' not in key):
            x1.pop(key)

In [95]:
if x > 0.825:
    d1 = dictfy(docx, set([token.dep_ for token in docx]))
    d2 = dictfy(docy, set([token.dep_ for token in docy]))
    print(d1, d2)
else:
    print([(docx, docy), x], "sentences not similar enough")

{'dep': [' '], 'pobj': ['office', 'run'], 'advcl': ['grossing'], 'poss': ['its'], 'dobj': ['million'], 'advmod': ['very', 'well', 'worldwide'], 'compound': ['box', '631'], 'amod': ['original', 'theatrical'], 'det': ['the', 'the'], 'quantmod': ['$'], 'prep': ['at', 'during'], 'nsubj': ['film'], 'punct': [','], 'ROOT': ['performed']} {'cc': ['but'], 'pobj': ['possibility'], 'dobj': ['follow', 'it', 'original'], 'compound': ['brad'], 'nsubj': ['bird', 'he', 'he'], 'ROOT': ['remained'], 'dep': [' '], 'conj': ['said'], 'ccomp': ['do'], 'punct': [','], 'prt': ['up'], 'advcl': ['honor'], 'advmod': ['only'], 'pcomp': ['directing'], 'det': ['the', 'a', 'the'], 'acomp': ['open'], 'aux': ['has', 'has', '’d', 'could'], 'mark': ['that', 'if'], 'prep': ['to', 'of']}


In [96]:
filter(d1)
filter(d2)
print(d1, d2)

{'pobj': ['office', 'run'], 'dobj': ['million'], 'nsubj': ['film']} {'pobj': ['possibility'], 'dobj': ['follow', 'it', 'original'], 'nsubj': ['bird', 'he', 'he']}
