In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [2]:
# Read in the data
nx = pd.read_csv('resources/netflix/adjusted_ratings.csv', index_col=0)
nx.head()

Unnamed: 0,CustId,Rating,MovieId
52762701,1,4,9608
1499499,1,3,312
77896920,1,4,14171
18093543,1,3,3439
8789496,1,2,1754


In [3]:
# read movies
movies = pd.read_csv('resources/netflix/adjusted_movies.csv')
movies.head()

Unnamed: 0,MovieId,ReleaseYear,MovieTitle
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [4]:
X_train, X_test, y_train, y_test = train_test_split(nx, nx.Rating, test_size=0.25, stratify=nx.Rating, random_state=42)

In [8]:
df_movie_features = X_train.pivot(
    index='MovieId',
    columns='CustId',
    values='Rating'
).fillna(0)

mat_movie_features = csr_matrix(df_movie_features.values)
df_movie_features.head()

CustId,1,2,3,4,5,6,7,8,9,10,...,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1)
model_knn.fit(df_movie_features)

In [18]:
def make_recommendations(movie_id: int, amount: int, model: NearestNeighbors, data: pd.DataFrame):
    """
    Makes movie recommendations based on a movie id.
    """
    # get the distances and indices of the 10 nearest neighbors
    distances, indices = model.kneighbors(data.iloc[movie_id, :].values.reshape(1, -1), n_neighbors=amount + 1)
    # get the movie ids of the nearest neighbors
    movie_ids = data.iloc[indices[0], :].index.values
    # get the movie titles of the nearest neighbors
    movie_titles = []
    release_years = []
    for movie_id in movie_ids:
        movie_titles.append(movies[movies['MovieId'] == movie_id]['MovieTitle'].values[0])
        release_years.append(movies[movies['MovieId'] == movie_id]['ReleaseYear'].values[0])
        
    # create a dataframe of the nearest neighbors
    recommendations = pd.DataFrame({'MovieId': movie_ids[1:], 'Title': movie_titles[1:], 'Release Year': release_years[1:], 'Distance': distances[0][1:]})
    return recommendations

In [27]:
make_recommendations(13672, 10, model_knn, df_movie_features)

Unnamed: 0,MovieId,Title,Release Year,Distance
0,16260,Assault on Precinct 13,2005.0,0.456725
1,8393,Ladder 49,2004.0,0.461286
2,15149,Walking Tall,2004.0,0.462661
3,1220,Man on Fire,2004.0,0.464367
4,273,Taxi,2004.0,0.469925
5,10906,Cellular,2004.0,0.472257
6,406,Hostage,2005.0,0.472548
7,16467,Alien vs. Predator,2004.0,0.474119
8,6673,Godsend,2004.0,0.47679
9,17328,The Punisher,2004.0,0.478085


In [23]:
def get_movie_id(title: str, data: pd.DataFrame):
    """
    Gets the movie id of a movie based on its title.
    """
    return data[data['MovieTitle'] == title]['MovieId'].values[0]

In [24]:
get_movie_id('Toy Story', movies)

13672