In [1]:
import re
import pandas as pd
import numpy as np
import spacy

In [2]:
def clean(text): 
    text = re.sub('[0-9]+.\t' + '...','',str(text))
    text = re.sub('\n ','',str(text))
    text = re.sub('\n',' ',str(text))
    text = re.sub("'s",'',str(text))
    text = re.sub("-",' ',str(text))
    text = re.sub("—",'',str(text))
    text = re.sub('\"','',str(text))
    text = re.sub("Mr\.",'Mr',str(text))
    text = re.sub("Mrs\.",'Mrs',str(text))
    text = re.sub("[\(\[].*?[\)\]]", "", str(text))
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    return [text]

In [4]:
# Dataframe
df = pd.read_csv(r'.\datasets_AI\news.csv')
df = df.drop(columns=['url', 'hostname', 'timestamp'], axis=1)
df = df[df.story == 'dABGVITQs6X1I4MdYGnX9zY59PpVM']
# clean speech
df['Speech_clean'] = df['main_content'].apply(clean)

#reset indices for removed stuff
df.reset_index(inplace=True)
df.drop(['Unnamed: 0', 'index'], axis = 1, inplace = True)

df.head()

Unnamed: 0,id,title,publisher,category,story,main_content,main_content_len,Speech_clean,sent
0,22237,"The Incredibles 2, Cars 3 in the works, Disney...",Digital Spy,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,The Incredibles 2 and Cars 3 are in developmen...,1340.0,The Incredibles 2 and Cars 3 are in developmen...,[The Incredibles 2 and Cars 3 are in developme...
1,22241,The Incredibles are set for another big-screen...,Belfast Telegraph,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,"Incredibles 2, Cars 3 in the works BelfastTele...",1620.0,"Incredibles 2, Cars 3 in the works BelfastTele...","[Incredibles 2, Cars 3 in the works BelfastTel..."
2,22244,Pixar Working On Sequels For Popular Animated ...,Online News Heard Now,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,Posted by News\n\nPixar Working On Sequels For...,1339.0,Posted by News Pixar Working On Sequels For P...,[Posted by News Pixar Working On Sequels For ...
3,22248,"State Of The (Disney) Union: Cars 3, Incredibl...",Contactmusic.com,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,In news you didn’t know you needed until right...,1737.0,In news you didn’t know you needed until right...,[In news you didn’t know you needed until righ...
4,22249,Disney Pixar confirm The Incredibles 2,Total Film,e,dABGVITQs6X1I4MdYGnX9zY59PpVM,The first footage from Incredibles 2 (there's ...,2942.0,The first footage from Incredibles 2 was show...,[The first footage from Incredibles 2 was sho...


In [5]:
df2 = pd.DataFrame(columns=['sent','id','len'])

row_list = []

for i in range(len(df)):
    for sent in df.at[i,'sent']:
    
        wordcount = len(sent.split())
        id = df.at[i,'id']

        dict1 = {'id' : id, 'sent' : sent, 'len' : wordcount}
        row_list.append(dict1)
    

df2 = pd.DataFrame(row_list)
df2.head()

Unnamed: 0,id,sent,len
0,22237,The Incredibles 2 and Cars 3 are in developmen...,224
1,22241,"Incredibles 2, Cars 3 in the works BelfastTele...",239
2,22244,Posted by News Pixar Working On Sequels For P...,227
3,22248,In news you didn’t know you needed until right...,306
4,22249,The first footage from Incredibles 2 was show...,503


In [6]:
nlp = spacy.load('en_core_web_md')

In [7]:
p = df2['sent'].tolist()
docx = nlp(p[np.random.randint(1, len(df2))])
docy = nlp(p[np.random.randint(1, len(df2))])
x = docx.similarity(docy)

[(docx, docy), x]

[(This is going to make a lot of people very happy. One of Pixar’s finest is getting a new sequel.  That’s right. Cars 3 is on the way. The studio will also be tossing out Incredibles 2. Don’t know who’s going to care too much about that, except toy store owners.  Okay, maybe I’ve gotten my wires crossed.  In all honesty, I’m excited about any new Pixar film and I’m not going to dismiss a Cars 3 out of hand. It’s not like I don’t enjoy the first two, either. As much as The Incredibles? Okay, no. But enough.  Robert Iger gave word of both sequels during an investor’s conference call this afternoon. He didn’t say who would be directing either picture, but a lot of folk will be assuming Brad Bird will be back for the Incredibles sequel.  Or maybe it will be Teddy Newton…  UPDATE: Iger also said that Brad Bird is working on the film’s story now. I guess it is likely to be him directing, then…?  Anyway, official word of the creatives involved will be along soon, I hope. I’d also expect some

In [8]:
def dictfy(d1, t1):
    sendict = dict()
    for key in t1:
        sendict[key] = []
        for word in d1:
            if word.label_ == key:
                sendict[key].append(str(word).lower().strip())
    for key in sendict.keys():
        sendict[key] = list(set(sendict[key]))
    return sendict


In [9]:
if x > 0.925:
    x1 = dictfy([ent for ent in docx.ents], [ent.label_ for ent in docx.ents])
    xres = filter(lambda x: x.tag_ == 'VBG', docx)
    x1['VERB'] = list(set(xres))
    y1 = dictfy([ent for ent in docy.ents], [ent.label_ for ent in docy.ents])
    yres = filter(lambda y: y.tag_ == 'VBG', docy)
    y1['VERB'] = list(set(yres))
    print([x1, y1], x)
else:
    print([(docx, docy), x], "sentences not similar enough")

# [[ent.text, ent.label_] for ent in docy.ents]

[{'CARDINAL': ['two', 'one'], 'ORG': ['pixar', 'incredibles'], 'PRODUCT': ['incredibles 2'], 'ORDINAL': ['first'], 'PERSON': ['iger', 'robert iger', 'brad bird', 'teddy newton'], 'TIME': ['this afternoon'], 'VERB': [assuming, getting, going, going, tossing, directing, going, directing, working]}, {'DATE': ['today', 'ten years ago', 'season 1'], 'ORG': ['pixar', 'cars', 'tomorrowland', 'underminer', 'incredibles', 'disney animation'], 'PERSON': ['john lasseter', 'pixar', 'bob iger', 'jack jack', 'buffy binge', 'holly hunter', 'brad bird', 'craig t nelson', 'sarah vowell', 'jason lee', 'samuel l. jackson', 'edna'], 'ORDINAL': ['first'], 'PRODUCT': ['cars 2'], 'CARDINAL': ['billions'], 'VERB': [bringing, assuming, bringing]}] 0.9935369179431389
