# Load data

In [1]:
import pickle
import numpy as np
import random

In [2]:
#Import actor/movie data
with open('./data/celebs.pickle', 'rb') as handle:
    actor_list = pickle.load(handle, encoding='UTF-8')
with open('./data/movies.pickle', 'rb') as handle:
    movie_list = pickle.load(handle, encoding='UTF-8')
with open('./data/characters.pickle', 'rb') as handle:
    character_dict = pickle.load(handle, encoding='UTF-8')

In [3]:
print(len(actor_list))
random.sample(actor_list, 10)

116542


['Lex Milczarek',
 'Tim Hands',
 'Christina Tchernychova',
 'Rie Hayasaka',
 'Eddie B. Smith',
 'Alex Twigg',
 'Montassar Ayari',
 'Wei Tung',
 'Myquan Jackson',
 'Giedre Gudeikiene']

In [4]:
print(len(movie_list))
random.sample(movie_list, 10)

15073


['Just Like a Woman',
 'Zero Motivation',
 'Bite Marks',
 'Sausage Party',
 'No quiero dormir sola',
 'Alle Anderen',
 'Baby Shower',
 'Bride Wars',
 'Ghost in the Machine',
 'A Reckoning']

In [5]:
print(len(character_dict))
character_dict['Bruce Wayne']

118027


['Christian Bale', 'Nathan Bonk']

# Create master list of entities

In [6]:
#Instantiate vars, transform count vectorizer
character_list = list(character_dict.keys())
all_entities = actor_list + movie_list + character_list

actors_len = len(actor_list)
movies_len = len(movie_list)
character_len = len(character_list)

actors_len + movies_len + character_len #Order matters!

249642

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3)) #All 1-3 character sequences

In [9]:
%%time

vec_transformed = vectorizer.fit_transform(all_entities) #Keep track of transformed object

CPU times: user 4.7 s, sys: 169 ms, total: 4.87 s
Wall time: 4.94 s


In [10]:
vec_transformed #Sparse matrix?

<249642x27931 sparse matrix of type '<class 'numpy.int64'>'
	with 8893321 stored elements in Compressed Sparse Row format>

In [11]:
print(len(vectorizer.vocabulary_))
random.sample(vectorizer.vocabulary_, 10)

27931


{"'": 1702,
 'b': 5970,
 'i': 11336,
 'g': 9813,
 ' ': 0,
 'o': 16594,
 'r': 18498,
 'e': 8121,
 't': 20644,
 'm': 14723,
 'c': 6619,
 'h': 10571,
 'l': 13708,
 "'b": 1809,
 'bi': 6220,
 'ig': 11690,
 'g ': 9814,
 ' b': 375,
 'bo': 6313,
 'oo': 17208,
 "o'": 16650,
 "' ": 1703,
 'br': 6356,
 're': 18791,
 'et': 8956,
 'tt': 21210,
 't ': 20645,
 ' m': 792,
 'mi': 15050,
 'it': 12158,
 'tc': 20850,
 'ch': 6862,
 'he': 10759,
 'el': 8650,
 'll': 14168,
 "'bi": 1815,
 'big': 6234,
 'ig ': 11691,
 'g b': 9831,
 ' bo': 394,
 'boo': 6332,
 "oo'": 17212,
 "o' ": 16651,
 "' b": 1710,
 ' br': 395,
 'bre': 6358,
 'ret': 18823,
 'ett': 8983,
 'tt ': 21211,
 't m': 20676,
 ' mi': 806,
 'mit': 15078,
 'itc': 12168,
 'tch': 20853,
 'che': 6876,
 'hel': 10780,
 'ell': 8671,
 'f': 9245,
 'u': 21543,
 'a': 4719,
 'p': 17682,
 'j': 12445,
 'n': 15560,
 's': 19681,
 "'f": 1871,
 'fo': 9554,
 'ou': 17423,
 'ul': 21989,
 'l ': 13709,
 'ba': 6070,
 'al': 5305,
 "l'": 13770,
 ' p': 917,
 'pa': 17793,
 'au': 

# Cosine similarity function

In [12]:
user_input = "Morgin Freiman"

In [13]:
user_input_transformed = vectorizer.transform([user_input])

In [14]:
dense = user_input_transformed.toarray()
print(len(dense[0]), "items in Morgin Freiman vector")
[t for t in dense[0]][:20]

27931 items in Morgin Freiman vector


[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [15]:
#See all query tokens
[i for i in vectorizer.inverse_transform(dense)[0]]

[' ',
 ' f',
 ' fr',
 'a',
 'an',
 'e',
 'ei',
 'eim',
 'f',
 'fr',
 'fre',
 'g',
 'gi',
 'gin',
 'i',
 'im',
 'ima',
 'in',
 'in ',
 'm',
 'ma',
 'man',
 'mo',
 'mor',
 'n',
 'n ',
 'n f',
 'o',
 'or',
 'org',
 'r',
 're',
 'rei',
 'rg',
 'rgi']

In [16]:
#Search for index of a token
search_token = 'mor'
for token, i in vectorizer.vocabulary_.items():
    if token == search_token:
        print(i)

15192


In [17]:
from sklearn.metrics import pairwise_kernels

In [20]:
#Perform cosine similarity calculation between user input and all entities
sim_scores = pairwise_kernels(
                 vec_transformed, # Big sparse matrix of all entities
                 user_input_transformed, # User's query
                 metric='cosine').flatten().tolist()

In [21]:
sim_scores[:10] #What's this?

[0.24921971504234075,
 0.16461975596431738,
 0.3031746915839468,
 0.19849711139180454,
 0.3536514954068699,
 0.25264557631995566,
 0.17542826450672636,
 0.2205271385679885,
 0.2887979489524622,
 0.4168439339227897,
 0.22281245492773064,
 0.14303239192265577,
 0.1375228328345058,
 0.14303239192265577,
 0.21895949947729493,
 0.19849711139180454,
 0.37921028494152054,
 0.17864740025262407,
 0.141509827535445,
 0.2700894819638062,
 0.2366242621401505,
 0.1403586535110865,
 0.2681421639778321,
 0.2225061833672521,
 0.27565892320998564,
 0.24655683636076894,
 0.1678603981989089,
 0.20756952180649915,
 0.18960514247076027,
 0.17795362132005163,
 0.1894841822399668,
 0.07396705090823068,
 0.11026356928399425,
 0.05721295676906231,
 0.25015639663712996,
 0.2609312292213769,
 0.12507819831856498,
 0.14303239192265577,
 0.1575521978122203,
 0.14586499149789456,
 0.22514075697341696,
 0.27517203630084297,
 0.19184045508446734,
 0.12327841818038446,
 0.3036421933135836,
 0.10156734322380513,
 0.311

# Data cleaning

In [22]:
#Splice into entity-specific sub-groups
actor_scores = sim_scores[:actors_len]
movie_scores = sim_scores[actors_len:actors_len+movies_len]
character_scores = sim_scores[actors_len+movies_len:]

In [23]:
print(actor_list[:3])
print(actor_scores[:3])

["'Big Boo' Brett Mitchell", "'Foul Ball' Paul Jones", "'Freeway' Ricky Ross"]
[0.24921971504234075, 0.16461975596431738, 0.3031746915839468]


In [24]:
#Suspiciously JSON-like formatting
actor_dict = [{"type":"actor", "value":a, "similarity_score":s} for a, s in zip(actor_list, actor_scores)]
movie_dict = [{"type":"movie", "value":m, "similarity_score":s} for m, s in zip(movie_list, movie_scores)]

#For each character score, put in actor's name
actor_character_dict = []
for character, score in zip(character_list,character_scores):
    for actor in character_dict[character]:
        actor_character_dict.append({"type":"actor", "value":actor, "similarity_score":score})

In [25]:
from operator import itemgetter

In [26]:
all_dict = actor_dict + movie_dict + actor_character_dict
    
#Top 100 items sorted by similarity score
all_dict_sorted = sorted(all_dict, key=itemgetter('similarity_score'), reverse=True)[:100]

In [27]:
all_dict_sorted[:5]

[{'type': 'actor',
  'value': 'Morgan Freeman',
  'similarity_score': 0.7353066331851006},
 {'type': 'actor',
  'value': 'John Freimann',
  'similarity_score': 0.7202386185701499},
 {'type': 'actor',
  'value': 'Martin Freeman',
  'similarity_score': 0.6740310804196752},
 {'type': 'actor',
  'value': 'Griffin Freeman',
  'similarity_score': 0.6627289992476327},
 {'type': 'actor',
  'value': 'Crispin Freeman',
  'similarity_score': 0.6394811726804519},
 {'type': 'actor',
  'value': 'Diane Freiman Reynolds',
  'similarity_score': 0.6339279851951471},
 {'type': 'actor',
  'value': 'Nick Swardson',
  'similarity_score': 0.6320816298242098},
 {'type': 'actor',
  'value': 'John Magaro',
  'similarity_score': 0.6251356778481195},
 {'type': 'actor',
  'value': 'Reid Morgan',
  'similarity_score': 0.618852747755276},
 {'type': 'actor',
  'value': 'Rein Hofman',
  'similarity_score': 0.618852747755276},
 {'type': 'actor',
  'value': 'Don McManus',
  'similarity_score': 0.6163112472933509},
 {'ty

# All-in-one function

In [29]:
#Main function - given user input, return object of similar entities
def getSimilarNames(user_input):

    #Perform cosine similarity calculation between user input and all entities
    sim_scores = pairwise_kernels(
                     vec_transformed,
                     vectorizer.transform([user_input]),
                     metric='cosine').flatten().tolist()
    
    actor_scores = sim_scores[:actors_len]
    movie_scores = sim_scores[actors_len:actors_len+movies_len]
    character_scores = sim_scores[actors_len+movies_len:]
    
    actor_dict = [{"type":"actor", "value":a, "similarity_score":s} for a, s in zip(actor_list, actor_scores)]
    movie_dict = [{"type":"movie", "value":m, "similarity_score":s} for m, s in zip(movie_list, movie_scores)]

    actor_character_dict = []
    for character, score in zip(character_list,character_scores):
        for actor in character_dict[character]:
            actor_character_dict.append({"type":"actor", "value":actor, "similarity_score":score})
    
    all_dict = actor_dict + movie_dict + actor_character_dict
    
    #Top 100 items sorted by similarity score
    all_dict_sorted = sorted(all_dict, key=itemgetter('similarity_score'), reverse=True)[:100]
    
    return all_dict_sorted

In [32]:
%%time

#Provide search to fuzzy lookup
getSimilarNames('thor')[:10]

CPU times: user 367 ms, sys: 123 ms, total: 489 ms
Wall time: 498 ms


[{'type': 'actor',
  'value': 'Christian Bale',
  'similarity_score': 1.0000000000000004},
 {'type': 'actor',
  'value': 'Christian Bables',
  'similarity_score': 0.8797861641347275},
 {'type': 'actor',
  'value': 'Christian Bach',
  'similarity_score': 0.8714204019005979},
 {'type': 'actor',
  'value': 'Christian Baha',
  'similarity_score': 0.8714204019005979},
 {'type': 'actor',
  'value': 'Christian Bernal',
  'similarity_score': 0.8378915848902165},
 {'type': 'actor',
  'value': 'Christian Barillas',
  'similarity_score': 0.8325195283889946},
 {'type': 'actor',
  'value': 'Christian Bayerlein',
  'similarity_score': 0.8321925616372222},
 {'type': 'actor',
  'value': 'Christian Borle',
  'similarity_score': 0.8319334382645821},
 {'type': 'actor',
  'value': 'Marlon Wayans',
  'similarity_score': 0.8144170883512026},
 {'type': 'actor',
  'value': 'Christian Blanch',
  'similarity_score': 0.8139985347071098},
 {'type': 'actor',
  'value': 'Michael Falch',
  'similarity_score': 0.8093