In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json
import os
import pickle

DATA_DIR = "./data/"

In [2]:
# Construct a list of movie records, each holding a slug, a title and a
# title-weighted synopsis.
TITLE_WEIGHT = 10  # number of times the title is appended to the synopsis

movies = []
script_path = os.path.join(DATA_DIR, "parsed_scripts")
# sorted() makes the movie order (and therefore the row order of anything
# built from `movies`) reproducible; os.listdir() order is arbitrary.
for movie_json in sorted(os.listdir(script_path)):
    slug, extension = os.path.splitext(movie_json)
    if extension != ".json":
        continue

    # The context manager closes the file handle as soon as it is parsed.
    with open(os.path.join(script_path, movie_json), "r", encoding="utf-8") as json_file:
        data = json.load(json_file)["movie_metadata"]

    # add weight to title; the explicit " " separator keeps the last word of
    # the synopsis from being fused with the first word of the title
    synopsis = data["synopsis"] + " " + ((data["name"] + " ") * TITLE_WEIGHT)

    movies.append({"slug": slug, "title": data["name"], "synopsis": synopsis})

num_movies = len(movies)
print("Loaded {} movie transcripts".format(num_movies))
print("Each movie transcript is a dictionary with the following keys...")
print(movies[0].keys())

Loaded 757 movie transcripts
Each movie transcript is a dictionary with the following keys...
dict_keys(['slug', 'title', 'synopsis'])


In [3]:
# build tf-idf of data
def _feature_names(vectorizer):
    """Return the fitted vocabulary terms, in column order, as a list of str.

    Uses ``get_feature_names_out()`` when the installed scikit-learn provides
    it and falls back to the older ``get_feature_names()`` otherwise.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    return [str(name) for name in names]


synopses = [movie["synopsis"] for movie in movies]

# First pass: learn a vocabulary from the synopses, pruned by document frequency.
tfidf_vec = TfidfVectorizer(max_df=.8, min_df=10, norm="l2")
tfidf_vec.fit(synopses)
vocab = set(_feature_names(tfidf_vec))

# add title text to vocab
# Titles are tokenized with the vectorizer's own analyzer so that every term
# added here is one the vectorizer can actually produce from text; splitting
# on " " would add tokens that can never match a column.
analyze = tfidf_vec.build_analyzer()
for movie in movies:
    vocab.update(analyze(movie["title"]))

# Second pass: re-fit with the fixed, extended vocabulary. max_df / min_df are
# ignored when a vocabulary is supplied, so they are not passed here.
tfidf_vec = TfidfVectorizer(norm="l2", vocabulary=sorted(vocab))
doc_by_vocab = tfidf_vec.fit_transform(synopses).toarray()
index_to_vocab = dict(enumerate(_feature_names(tfidf_vec)))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [4]:
def calc_cossim(query_vec, movie_vec):
    """Return the cosine similarity of two 1-D numpy vectors.

    Args:
        query_vec: 1-D numpy array.
        movie_vec: 1-D numpy array of the same length as ``query_vec``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm (instead of NaN from 0/0).
    """
    norm1 = np.sum(query_vec ** 2) ** .5
    norm2 = np.sum(movie_vec ** 2) ** .5
    denominator = norm1 * norm2
    if denominator == 0:
        # An all-zero vector has no direction; treat it as matching nothing.
        return 0.0
    return query_vec.dot(movie_vec) / denominator

In [5]:
def rank_movies(query):
    """Rank every movie by cosine similarity between its tf-idf row and the query.

    Args:
        query: free-text query string; it is vectorized with ``tfidf_vec``.

    Returns:
        A list with the ``"slug"`` of every entry in ``movies``, ordered by
        descending similarity. Movies with equal scores keep their order in
        ``movies``. A query or movie with an all-zero vector scores 0 rather
        than NaN.
    """
    query_vec = tfidf_vec.transform([query]).toarray()[0]

    # One matrix-vector product replaces a Python-level call per movie row.
    dot_products = doc_by_vocab.dot(query_vec)
    denominators = np.linalg.norm(doc_by_vocab, axis=1) * np.linalg.norm(query_vec)

    # Divide only where the denominator is non-zero; other entries stay 0.
    scores = np.zeros(dot_products.shape, dtype=float)
    np.divide(dot_products, denominators, out=scores, where=denominators > 0)

    # mergesort is stable, so ties are broken deterministically by corpus order.
    order = np.argsort(-scores, kind="mergesort")
    return [movies[index]["slug"] for index in order]

In [6]:
# Try the ranker on a sample query; the cell's value is the list of movie
# slugs ordered by descending score.
rank_movies("rider ghost spirit")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


['ghost-rider',
 'ghost',
 'ghost-world',
 'ghost-ship',
 'the-ghost-and-the-darkness',
 'ghostbusters',
 'how-to-train-your-dragon-2',
 'stir-of-echoes',
 'office-space',
 'lord-of-war',
 'a-serious-man',
 'semi-pro',
 'pitch-black',
 'american-sniper',
 'machete',
 'black-swan',
 'swordfish',
 'friday-the-13th-part-viii-jason-takes-manhattan',
 'killing-zoe',
 'insidious',
 'next-friday',
 'american-graffiti',
 'remember-me',
 'kids',
 'wanted',
 'traffic',
 'x-men-origins-wolverine',
 'oceans-twelve',
 'interstellar',
 'jerry-maguire',
 'i-robot',
 'a-most-violent-year',
 'despicable-me-2',
 'big-eyes',
 'the-french-connection',
 'the-world-is-not-enough',
 'the-addams-family',
 'the-deer-hunter',
 'the-queen',
 'mud',
 'cars-2',
 'wild-wild-west',
 'sunshine-cleaning',
 'armageddon',
 '17-again',
 'high-fidelity',
 'existenz',
 'liar-liar',
 'the-wizard-of-oz',
 'frequency',
 'man-on-fire',
 'hard-rain',
 'leaving-las-vegas',
 'lake-placid',
 'get-low',
 'yes-man',
 'v-for-vendetta

In [7]:
# Bundle the document-term matrix, the movie records and the fitted
# vectorizer into one object so they are saved together.
ranker = {
    "tf-idf": doc_by_vocab,
    "movies": movies,
    "transformer": tfidf_vec,
}

pickle_paths = (
    DATA_DIR + "tf_idf.pkl",
    # also write to app data
    "../app/data/tf_idf.pkl",
)
for pickle_path in pickle_paths:
    # The context manager flushes and closes each file even if dump() raises.
    with open(pickle_path, "wb") as pickle_file:
        # NOTE(review): protocol 2 is kept from the original, presumably so
        # an older Python can load the file; confirm against the app.
        pickle.dump(ranker, pickle_file, protocol=2)