## Import data, inspect, drop NaN rows

In [1]:
import pandas as pd

In [2]:
# Read the dialogue table (speaker name + spoken text) and preview the first rows.
csv_path = 'simpsons_dataset.csv'
lines = pd.read_csv(csv_path)
lines.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
# Per-column count of missing values in the raw table.
lines.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [6]:
# (rows, columns) of the table before rows with missing values are dropped.
lines.shape

(158314, 2)

In [7]:
# Discard every row that is missing a value in any column, then confirm that
# no missing values remain.
lines = lines.dropna(axis='index', how='any')
lines.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [8]:
# (rows, columns) of the table after rows with missing values were dropped.
lines.shape

(131853, 2)

## Import spaCy, create a list of docs and a list of their vectors

In [9]:
import spacy

In [10]:
# Load the spaCy pipeline used to embed both the dataset lines and user queries.
nlp = spacy.load('en_core_web_lg')
# Holds the parsed Doc objects produced by the parsing cell below.
docs = list()

In [11]:
# Parse every spoken line with spaCy in one streaming pass.
# `docs` is rebound here rather than appended to, so re-running this cell
# yields the same list instead of duplicating every Doc (which would make the
# list longer than `lines` and break the later column assignment).
# The unused `count` variable was removed.
docs = list(nlp.pipe(lines['spoken_words'].values))

In [12]:
# One document-level embedding per parsed line, in the same order as `docs`.
vectors = [parsed_line.vector for parsed_line in docs]

### Add the vectors to the `lines` DataFrame

In [13]:
# Store each line's embedding in a new 'vectors' column. `vectors` is a plain
# list derived from lines['spoken_words'], so it is matched to rows by position
# and must have exactly one entry per row of `lines`.
lines['vectors'] = vectors

## Initialize a nearest-neighbors model and fit it on the vectors

In [14]:
from sklearn.neighbors import NearestNeighbors

In [15]:
# Index the line embeddings so the closest stored lines can be retrieved for a
# query vector; each query returns its 5 nearest neighbours.
knn_options = {'n_neighbors': 5, 'algorithm': 'ball_tree'}
nn = NearestNeighbors(**knn_options)
nn.fit(vectors)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

## Functions to get/find vectors

In [16]:
def get_vectors_of_string(inp_str):
    """Embed ``inp_str`` with the loaded spaCy pipeline.

    Returns the ``.vector`` of the Doc that ``nlp`` produces for the text.
    """
    parsed = nlp(inp_str)
    return parsed.vector

In [17]:
def find_vector(vect, frame=None):
    """Return the rows whose stored embedding is exactly equal to ``vect``.

    Parameters
    ----------
    vect : array-like
        Embedding to look for.
    frame : pandas.DataFrame, optional
        Table with a ``'vectors'`` column to search. Defaults to the
        notebook-level ``lines`` frame.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` whose ``'vectors'`` entry matches ``vect``
        element for element (empty if nothing matches).
    """
    # Local import: the notebook's import cells do not bring numpy into scope.
    import numpy as np

    if frame is None:
        # The original referenced an undefined `lines_df`; the table is `lines`.
        frame = lines

    target = np.asarray(vect)
    # `stored == vect` on arrays is elementwise and cannot be used in an `if`;
    # array_equal collapses the comparison to a single bool per row.
    is_match = frame['vectors'].apply(lambda stored: np.array_equal(stored, target))
    # astype(bool) keeps the mask boolean even when `frame` has no rows.
    return frame[is_match.astype(bool)]

In [18]:
def find_quotes(inp_str):
    """Look up the stored lines whose embeddings are nearest to ``inp_str``.

    The text is embedded with ``get_vectors_of_string`` and passed to the
    fitted ``nn`` model as a single-query batch; the neighbour positions it
    reports are then used to select rows of ``lines`` positionally.
    """
    query_batch = [get_vectors_of_string(inp_str)]
    # kneighbors returns (distances, indices); only the indices are needed.
    neighbour_positions = nn.kneighbors(query_batch)[1]
    return lines.iloc[neighbour_positions[0]]

In [25]:
# Demo: query with a shortened version of a line from the dataset and display
# the nearest stored lines.
find_quotes("matter of fact, they're all in the hammock complex")

Unnamed: 0,raw_character_text,spoken_words,vectors
34275,Hank Scorpio,"That might... Matter of fact, they're all in t...","[-0.0005717829, 0.19351904, -0.08779617, -0.11..."
146256,Lisa Simpson,"Well, I guess we've learned that of all the co...","[0.012387499, 0.08285451, -0.12590693, -0.1315..."
150012,Dr. Marvin Monroe,"Everyone comfy? Good. Now, don't touch any of ...","[0.0024357901, 0.19450384, -0.22521602, -0.034..."
131254,Lisa Simpson,"Well sure life is full of pain and drudgery, b...","[0.04056196, 0.1639796, -0.13865817, -0.102411..."
146613,Lisa Simpson,"That's so sad. Maybe we're the same, two lost ...","[0.055236608, 0.12521951, -0.11474382, -0.0670..."


In [24]:
# Show the full, untruncated text of the top match (index label 34275).
lines.loc[34275, 'spoken_words']

"That might... Matter of fact, they're all in the same complex. It's the Hammock Complex, down on Third?"