## Import data, inspect, drop NaN rows

In [1]:
import pandas as pd

In [2]:
# Load the dataset (speaker in raw_character_text, dialogue in spoken_words)
# and preview the first rows.
lines = pd.read_csv('simpsons_dataset.csv')
lines.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
# Count missing values in each column.
lines.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [6]:
# (rows, columns) before rows with missing values are dropped.
lines.shape

(158314, 2)

In [7]:
# Discard every row in which any column is missing, then confirm that no
# missing values remain. The original index labels are kept (no reset).
lines = lines.dropna(axis=0, how='any')
lines.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [8]:
# (rows, columns) after the drop; compare with the shape shown before it.
lines.shape

(131853, 2)

## Import spaCy, create a list of Docs and a list of their vectors

In [9]:
import spacy

In [10]:
# Load the large English spaCy pipeline and start with an empty list that
# will hold the parsed Docs.
nlp = spacy.load('en_core_web_lg')
docs = []

In [11]:
# Parse every line of dialogue in one batched pass over nlp.pipe.
# Rebuilding the list from scratch (rather than appending to an existing one)
# keeps this cell idempotent: re-running it cannot duplicate Docs or leave
# `docs` out of step with the rows of `lines`.
docs = list(nlp.pipe(lines['spoken_words'].values))

In [12]:
# One vector per parsed line, in the same order as `docs`.
vectors = [parsed.vector for parsed in docs]

### Add vectors to lines df

In [13]:
# Store each row's vector next to its text. `vectors` was built from `docs`,
# which in turn was built from lines['spoken_words'] in row order, so the
# assignment is positional: the i-th vector belongs to the i-th row of `lines`.
lines['vectors'] = vectors

## Initialize a nearest-neighbors model and fit it on the vectors

In [14]:
from sklearn.neighbors import NearestNeighbors

In [15]:
# Build a ball-tree index over the line vectors. Queries return the 5 nearest
# entries by default; the metric is left at the estimator's default.
# fit() returns the estimator itself, so the call can be chained.
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=5).fit(vectors)
nn

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

## Functions to get vectors/find quotes

In [16]:
def get_vectors_of_string(inp_str):
    """Return the spaCy document vector for ``inp_str``.

    The text is run through the notebook-level ``nlp`` pipeline and the
    resulting Doc's ``.vector`` attribute is returned.
    """
    parsed = nlp(inp_str)
    return parsed.vector

In [18]:
def find_quotes(inp_str, n_neighbors=None):
    """Return the rows of ``lines`` whose vectors are closest to ``inp_str``.

    Parameters
    ----------
    inp_str : str
        Text to look up.
    n_neighbors : int, optional
        How many rows to return. ``None`` (the default) falls back to the
        value the ``nn`` model was constructed with.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``lines``, nearest first.
    """
    vect = get_vectors_of_string(inp_str)
    # kneighbors expects a 2-D batch, so wrap the single query vector in a list.
    # It returns (distances, indices); only the indices are needed here.
    _, indices = nn.kneighbors([vect], n_neighbors=n_neighbors)
    # The indices are positions in the data the model was fitted on, which is
    # row-aligned with `lines`. They must be resolved with .iloc, not .loc,
    # because the index labels of `lines` have gaps after dropna().
    return lines.iloc[indices[0]]

## Check if the functions are cromulent

In [25]:
# Query that condenses an actual line; the matching Hank Scorpio row is the top hit.
find_quotes("matter of fact, they're all in the hammock complex")

Unnamed: 0,raw_character_text,spoken_words,vectors
34275,Hank Scorpio,"That might... Matter of fact, they're all in t...","[-0.0005717829, 0.19351904, -0.08779617, -0.11..."
146256,Lisa Simpson,"Well, I guess we've learned that of all the co...","[0.012387499, 0.08285451, -0.12590693, -0.1315..."
150012,Dr. Marvin Monroe,"Everyone comfy? Good. Now, don't touch any of ...","[0.0024357901, 0.19450384, -0.22521602, -0.034..."
131254,Lisa Simpson,"Well sure life is full of pain and drudgery, b...","[0.04056196, 0.1639796, -0.13865817, -0.102411..."
146613,Lisa Simpson,"That's so sad. Maybe we're the same, two lost ...","[0.055236608, 0.12521951, -0.11474382, -0.0670..."


In [24]:
# Show the full, untruncated text of the top hit (looked up by index label).
lines.loc[34275, 'spoken_words']

"That might... Matter of fact, they're all in the same complex. It's the Hammock Complex, down on Third?"

In [29]:
# Short four-word query; judging by the output, the hits are related by topic
# (salad, winning) rather than by exact wording.
find_quotes("win friends with salad")

Unnamed: 0,raw_character_text,spoken_words,vectors
136565,Lisa Simpson,It came with fries or salad.,"[-0.16468573, 0.16284688, 0.08980615, -0.17369..."
46218,Kent Brockman,"Good luck, kids. Where the hell's my grilled c...","[-0.119722456, 0.16942002, -0.061315738, -0.09..."
82906,ROBOT WARS ANNOUNCER #2,Congratulations to our winning father-and-son ...,"[-0.0970903, 0.18714643, 0.014011727, 0.041308..."
73692,Homer Simpson,"Certainly, Lenford. Make every day a celebrati...","[-0.058565103, 0.15718296, -0.046536416, 0.017..."
132356,Unidentified Baldwin,You guys suck at acting and you ate all the po...,"[-0.2541025, 0.03132107, -0.11368286, -0.12731..."


In [27]:
# Two-word query; the hits include both Gary Chalmers lines that contain "steamed hams".
find_quotes("steamed hams")

Unnamed: 0,raw_character_text,spoken_words,vectors
153503,Homer Simpson,Smothered pork chops.,"[-0.38019222, -0.1277625, 0.34242252, 0.077048..."
49564,Homer Simpson,Steamed Maine cabbages!,"[-0.277914, -0.041433744, -0.075511254, -0.372..."
32674,Gary Chalmers,"For ""steamed hams.""","[-0.20388983, 0.04237333, -0.18209751, -0.1394..."
32664,Gary Chalmers,"You call hamburgers ""steamed hams?""","[-0.22379924, 0.09646686, -0.088719375, 0.0168..."
110504,Moe Szyslak,Ham sandwiches!,"[-0.33784032, -0.18016668, 0.46686664, 0.22452..."


In [30]:
# Paraphrased query: the wording differs from the original line, yet that line
# is still returned as the top hit.
find_quotes("maybe lisa is right that America is the land of opportunity")

Unnamed: 0,raw_character_text,spoken_words,vectors
152056,Homer Simpson,"Please, please kids. Stop fighting. Maybe Lisa...","[-0.09950939, 0.18024185, -0.12073283, -0.0673..."
3707,Dave,"Aw, that's the spirit, Mr. Simpson. Now step o...","[-7.059774e-05, 0.207665, -0.13998467, -0.0446..."
15742,Lizzie Borden,"We've heard enough. Your honor, we find that H...","[-0.01593856, 0.16347897, -0.12644829, -0.1271..."
156711,Seymour Skinner,I think the real question is who is this and w...,"[-0.03951687, 0.26348725, -0.1547953, -0.09261..."
131586,Lisa Simpson,"Dad, the story of the passion is the cornersto...","[-0.022304649, 0.15845042, -0.12650898, -0.110..."


In [31]:
# Show the full, untruncated text of the top hit (looked up by index label).
lines.loc[152056, 'spoken_words']

"Please, please kids. Stop fighting. Maybe Lisa's right about America being a land of opportunity, and maybe Adil has a point about the machinery of capitalism being oiled with the blood of the workers."