In [1]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader, SVD

import pickle


In [2]:
# Load the movie catalogue (columns: movieId, title, genres); it is used below
# to translate between movie titles and raw movie ids.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
# Load the previously trained SVD model from disk.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("svd_model_final.sav", "rb") as model_file:
    model = pickle.load(model_file)
model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x238861faf70>

In [4]:
# Show the model's item-factor matrix (`model.qi`) as a table: one row per
# item known to the model, one column per latent factor.
pd.DataFrame(model.qi)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.355428,0.086711,0.037307,-0.008528,0.003601,0.062866,0.309012,-0.193954,0.255879,0.075461,...,0.151596,-0.052066,0.026084,-0.180659,0.058057,-0.262858,-0.022482,-0.079360,-0.180894,-0.168850
1,0.039600,0.221500,-0.121480,-0.086343,0.092774,-0.186041,0.039444,-0.327360,-0.186458,0.122287,...,0.104986,-0.148422,-0.020508,0.061155,0.036455,-0.112470,-0.079350,0.345237,-0.280915,-0.056840
2,0.035553,0.296449,-0.052560,-0.096478,0.146144,-0.215660,0.165634,-0.238519,-0.154251,0.202097,...,0.092361,-0.263528,0.009131,0.068094,0.091665,0.028335,-0.060514,0.321258,-0.307469,0.042559
3,-0.052684,0.124052,-0.233798,-0.048548,0.283924,-0.002556,0.208307,-0.004916,-0.117665,0.352862,...,-0.021665,-0.026972,-0.211693,0.036533,0.510261,-0.042089,-0.314043,-0.297400,-0.128107,0.056825
4,-0.113388,0.112877,-0.223308,0.047166,0.287559,-0.182320,0.005369,-0.031051,-0.112730,-0.010081,...,-0.085853,-0.155829,-0.317962,-0.020747,-0.125405,0.170290,-0.004097,-0.137421,0.105730,0.033495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10321,0.064283,-0.066679,0.019670,-0.074904,-0.111898,0.032758,0.007110,-0.237712,0.003386,0.104792,...,0.075875,0.037610,0.044629,0.022485,0.072397,0.073447,-0.038311,0.105090,-0.219770,-0.064506
10322,-0.028149,-0.237368,-0.127044,0.112757,-0.053829,0.175344,0.077252,-0.021320,-0.135519,0.055556,...,0.073472,-0.068247,0.036520,0.144946,0.175616,0.174793,-0.046194,0.096464,0.095503,0.009346
10323,0.019206,0.045777,0.037523,-0.003447,0.081681,-0.028402,-0.083033,0.000999,-0.038110,0.123682,...,0.041664,0.070854,-0.183184,0.075146,-0.126006,-0.009568,0.081285,-0.017439,-0.058702,0.149930
10324,0.155823,-0.050094,-0.100647,0.069713,-0.116233,0.165296,0.029190,0.042868,-0.198549,-0.038005,...,0.076845,-0.066523,-0.074670,0.330238,0.094218,0.220379,0.030029,0.103854,-0.176127,-0.235026


In [5]:
# Dimensions of the item-factor matrix: (number of items, number of factors).
model.qi.shape

(10326, 200)

In [6]:
from scipy.spatial.distance import cosine as cosine_distance

In [7]:
def get_vector_by_movie_title(raw_id: int, trained_model: SVD) -> np.ndarray:
    """Return the latent-factor vector of a movie as a numpy array.

    Despite the function's name, the lookup key is the movie's *raw id*
    (the id used in the ratings data the model was trained on), not its title.

    Args:
        raw_id: Raw id of the movie to look up.
        trained_model: Trained SVD model exposing ``trainset`` and ``qi``.

    Returns:
        The row of ``trained_model.qi`` that belongs to the movie.

    Raises:
        KeyError: If ``raw_id`` is not among the items of the model's trainset.
    """
    # The trainset maps raw ids to the row indices ("inner ids") of `qi`.
    movie_row_idx = trained_model.trainset._raw2inner_id_items[raw_id]
    return trained_model.qi[movie_row_idx]

In [12]:
def get_recs(liked_movie_title: str, model: SVD, top_n: int = 25) -> "pd.DataFrame | None":
    """Return the ``top_n`` movies whose latent vectors are closest to a given movie.

    The title is resolved to a raw movie id through the notebook-level
    ``movies`` frame. The cosine distance is then computed between that movie's
    row of ``model.qi`` and the row of every other movie in the model's trainset.

    Args:
        liked_movie_title: Exact value of the ``title`` column in ``movies``.
        model: Trained SVD model exposing ``qi`` and ``trainset``.
        top_n: Maximum number of recommendations to return (default 25).

    Returns:
        A DataFrame with the columns ``"vector cosine distance"`` and
        ``"Movie Title"``, sorted by ascending distance (closest first).
        ``None`` is returned -- after printing a notice -- when the title is
        not present in ``movies`` or none of its ids is part of the trainset.
    """
    raw_to_inner = model.trainset._raw2inner_id_items

    # A title can map to zero or several movieIds; use the first id the model
    # was actually trained on instead of failing on ambiguous titles.
    candidate_ids = movies.loc[movies["title"] == liked_movie_title, "movieId"].tolist()
    liked_movie_raw_id = next((rid for rid in candidate_ids if rid in raw_to_inner), None)
    if liked_movie_raw_id is None:
        # No usable id: the model has no latent vector for this movie.
        print("Not enough info about movie")
        return None

    movie_vector = get_vector_by_movie_title(liked_movie_raw_id, model)

    # Build the id -> title lookup once; filtering `movies` inside the loop
    # would rescan the whole frame for every movie in the trainset.
    unique_movies = movies.drop_duplicates("movieId")
    title_by_raw_id = dict(zip(unique_movies["movieId"].tolist(), unique_movies["title"].tolist()))

    similarity_table = []
    for other_movie_raw_id, other_row_idx in raw_to_inner.items():
        # Exclude the query movie by identity. Testing `distance != 0` is
        # unreliable with floats and would also hide genuinely identical movies.
        if other_movie_raw_id == liked_movie_raw_id:
            continue
        other_title = title_by_raw_id.get(other_movie_raw_id)
        if other_title is None:
            # The model knows this movie but the catalogue has no title for it.
            continue
        distance = cosine_distance(model.qi[other_row_idx], movie_vector)
        if np.isnan(distance):
            # Undefined distance (e.g. an all-zero vector) cannot be ranked.
            continue
        similarity_table.append((distance, other_title))

    # Ascending distance == most similar first; ties are ordered by title.
    recs = pd.DataFrame(sorted(similarity_table), columns=["vector cosine distance", "Movie Title"])
    return recs.head(top_n)


In [13]:
# Example query: nearest neighbours of a Harry Potter film.
get_recs("Harry Potter and the Goblet of Fire (2005)", model)

Unnamed: 0,vector cosine distance,Movie Title
0,0.022296,Harry Potter and the Order of the Phoenix (2007)
1,0.028553,Harry Potter and the Half-Blood Prince (2009)
2,0.034266,Harry Potter and the Prisoner of Azkaban (2004)
3,0.046243,Harry Potter and the Chamber of Secrets (2002)
4,0.049955,Harry Potter and the Deathly Hallows: Part 1 (...
5,0.052684,Harry Potter and the Deathly Hallows: Part 2 (...
6,0.057276,Harry Potter and the Sorcerer's Stone (a.k.a. ...
7,0.62827,Fantastic Beasts and Where to Find Them (2016)
8,0.629601,Fantastic Beasts and Where to Find Them 2 (2018)
9,0.687343,"Chronicles of Narnia: The Lion, the Witch and ..."


In [14]:
# Example query: nearest neighbours of a romance title.
get_recs("Notebook, The (2004)", model)

Unnamed: 0,vector cosine distance,Movie Title
0,0.591137,Titanic (1997)
1,0.6066,P.S. I Love You (2007)
2,0.626739,Pretty Woman (1990)
3,0.634976,"Holiday, The (2006)"
4,0.640392,"Vow, The (2012)"
5,0.651908,Sweet Home Alabama (2002)
6,0.6566,Dear John (2010)
7,0.661944,Soul Surfer (2011)
8,0.666143,"Sisterhood of the Traveling Pants, The (2005)"
9,0.671852,"Wedding Date, The (2005)"


In [15]:
# Example query for a title the function cannot resolve; it prints the
# "Not enough info about movie" notice instead of returning a table.
get_recs("Scream - Because I Will Kill You! (1999)", model)

Not enough info about movie


In [20]:
# Example query: nearest neighbours of a period-drama title.
get_recs("Little Women (1994)", model)

Unnamed: 0,vector cosine distance,Movie Title
0,0.514125,Sense and Sensibility (1995)
1,0.51689,Emma (1996)
2,0.615469,Much Ado About Nothing (1993)
3,0.61591,Little Women (1949)
4,0.617157,"Secret Garden, The (1993)"
5,0.659792,Pride and Prejudice (1940)
6,0.661413,Mansfield Park (1999)
7,0.662151,Little Women (1933)
8,0.665804,Ever After: A Cinderella Story (1998)
9,0.671469,Where the Heart Is (1990)
