# Load data

In [None]:
import pickle
import numpy as np
import random

In [None]:
#Import actor/movie data: three pre-built pickles under ./data
#NOTE(review): pickle.load can execute arbitrary code while unpickling;
#only run this against pickle files from a trusted source.
with open('./data/celebs.pickle', 'rb') as handle:
    actor_list = pickle.load(handle, encoding='UTF-8')
with open('./data/movies.pickle', 'rb') as handle:
    movie_list = pickle.load(handle, encoding='UTF-8')
#character_dict is indexed by character name further down the notebook
with open('./data/characters.pickle', 'rb') as handle:
    character_dict = pickle.load(handle, encoding='UTF-8')

In [None]:
#Sanity check: how many actors were loaded, plus 10 randomly drawn entries
print(len(actor_list))
random.sample(actor_list, k=10)

In [None]:
#Sanity check: how many movies were loaded, plus 10 randomly drawn entries
print(len(movie_list))
random.sample(movie_list, k=10)

In [None]:
#Number of entries (character names) in character_dict
print(len(character_dict))
#Spot-check one entry: the value stored under the key 'Bruce Wayne'
character_dict['Bruce Wayne']

# Create master list of entities

In [None]:
#Entity names in one flat list, always in the order actors -> movies -> characters
character_list = list(character_dict)
all_entities = actor_list + movie_list + character_list

#Group sizes; later cells use them to slice per-entity results back into groups
actors_len, movies_len, character_len = map(
    len, (actor_list, movie_list, character_list)
)

#Total entity count. Order matters: the slicing only works while the
#concatenation order above is kept.
sum((actors_len, movies_len, character_len))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Character-level n-gram counts: every 1-, 2- and 3-character sequence is a feature
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

In [None]:
%%time

#Learn the n-gram vocabulary from all entity names and encode them in one pass.
#Keep the result: every later query is compared against this matrix.
vec_transformed = vectorizer.fit_transform(all_entities) #Keep track of transformed object

In [None]:
#Yes: per the scikit-learn docs, CountVectorizer.fit_transform returns a sparse
#document-term matrix (rows follow all_entities, columns follow the vocabulary)
vec_transformed #Sparse matrix

In [None]:
#Vocabulary size = number of distinct n-grams learned during fit
print(len(vectorizer.vocabulary_))
#random.sample requires a sequence: passing the dict items view directly raises
#TypeError on Python 3.11+ (deprecated since 3.9), so materialise a list first
random.sample(list(vectorizer.vocabulary_.items()), 10)

# Cosine similarity function

In [None]:
#Example query used by the cells below (deliberately misspelled to exercise fuzzy matching)
user_input = "Morgin Freiman"

In [None]:
#Encode the query with the already-fitted vectorizer (transform, not fit_transform)
#so it is expressed over the same vocabulary as vec_transformed
user_input_transformed = vectorizer.transform([user_input])

In [None]:
#Densify the one-row query matrix so it can be inspected directly
dense = user_input_transformed.toarray()
#Label the printout with the actual query rather than a hard-coded name,
#so the message stays correct when user_input is changed
print(len(dense[0]), "items in", user_input, "vector")
#Peek at the first 20 counts; slice before converting so only 20 values are copied
list(dense[0][:20])

In [None]:
#List every vocabulary n-gram that is present in the query vector
list(vectorizer.inverse_transform(dense)[0])

In [None]:
#Look up the column index of a token
search_token = 'mor'
#vocabulary_ is a dict (token -> index): query it directly instead of scanning
#every item for an equal key
token_index = vectorizer.vocabulary_.get(search_token)
#Print nothing when the token is not in the vocabulary (same as before)
if token_index is not None:
    print(token_index)

In [None]:
from sklearn.metrics import pairwise_kernels

In [None]:
#Cosine similarity of every entity row against the single query row,
#flattened into a plain Python list with one score per entity
sim_scores = (
    pairwise_kernels(
        vec_transformed,         #Big sparse matrix of all entities
        user_input_transformed,  #User's query
        metric='cosine',
    )
    .ravel()
    .tolist()
)

In [None]:
#First 10 cosine-similarity scores between the query and the entities.
#Scores follow the row order of vec_transformed, i.e. the order of all_entities.
sim_scores[:10] #One float per entity

# Data cleaning

In [None]:
#Cut the flat score list back into per-entity groups, using the same
#actors -> movies -> characters order in which all_entities was built
actor_scores, movie_scores, character_scores = (
    sim_scores[:actors_len],
    sim_scores[actors_len:actors_len + movies_len],
    sim_scores[actors_len + movies_len:],
)

In [None]:
#Show the first three actors above their scores to confirm the two lists line up
print(actor_list[:3], actor_scores[:3], sep="\n")

In [None]:
#JSON-style records: one dict per (entity name, score) pair
actor_dict = [
    {"type":"actor", "value":name, "similarity_score":score}
    for name, score in zip(actor_list, actor_scores)
]
movie_dict = [
    {"type":"movie", "value":title, "similarity_score":score}
    for title, score in zip(movie_list, movie_scores)
]

#A character's score is credited to each actor listed under that character,
#so these records carry the actor's name rather than the character's
actor_character_dict = [
    {"type":"actor", "value":actor, "similarity_score":score}
    for character, score in zip(character_list, character_scores)
    for actor in character_dict[character]
]

In [None]:
from operator import itemgetter

In [None]:
#Pool all candidate records: direct actor hits, movie hits, then character-derived hits
all_dict = [*actor_dict, *movie_dict, *actor_character_dict]

#Rank by similarity score, best first, then keep only the top 100
all_dict_sorted = sorted(all_dict, key=itemgetter('similarity_score'), reverse=True)
all_dict_sorted = all_dict_sorted[:100]

In [None]:
#Preview the five best-scoring records
all_dict_sorted[:5]

# All-in-one function

In [None]:
#Main function - given user input, return object of similar entities
def getSimilarNames(user_input, top_n=100):
    """Return the entities most similar to ``user_input``, best match first.

    The query is encoded with the notebook-level ``vectorizer`` and compared,
    by cosine similarity, against every row of ``vec_transformed``. Scores come
    back in ``all_entities`` order (actors, movies, characters) and are split
    using ``actors_len`` and ``movies_len``. A character's score is credited to
    every entry of ``character_dict[character]`` and emitted with type "actor".

    Args:
        user_input: Query string to look up.
        top_n: Maximum number of records to return. Defaults to 100, the
            limit that used to be hard-coded. Non-positive values yield an
            empty list.

    Returns:
        A list of at most ``top_n`` dicts with the keys "type", "value" and
        "similarity_score", ordered by descending score. Ties keep the
        actors -> movies -> character-derived order.
    """
    #Local import: heapq is not imported anywhere else in this notebook
    import heapq

    #Perform cosine similarity calculation between user input and all entities
    sim_scores = pairwise_kernels(
        vec_transformed,
        vectorizer.transform([user_input]),
        metric='cosine').ravel().tolist()

    character_start = actors_len + movies_len

    def _candidates():
        #Lazily yield (type, value, score) in the same order the lists were
        #concatenated before, so tie-breaking is unchanged
        for actor, score in zip(actor_list, sim_scores[:actors_len]):
            yield "actor", actor, score
        for movie, score in zip(movie_list, sim_scores[actors_len:character_start]):
            yield "movie", movie, score
        for character, score in zip(character_list, sim_scores[character_start:]):
            for actor in character_dict[character]:
                yield "actor", actor, score

    #heapq.nlargest is documented as equivalent to
    #sorted(iterable, key=key, reverse=True)[:n], but it avoids sorting and
    #materialising a dict for every entity on each query
    best = heapq.nlargest(top_n, _candidates(), key=itemgetter(2))

    #Only the winners are turned into JSON-style records
    return [
        {"type": kind, "value": value, "similarity_score": score}
        for kind, value, score in best
    ]

In [None]:
%%time

#Provide search to fuzzy lookup: show the 30 best matches for the query 'thor'
getSimilarNames('thor')[:30]