In [49]:
# Import libraries needed
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt 
import joblib

In [5]:
# Load the ratings table from the ml-latest-small folder and preview it.
# NOTE(review): absolute path into one developer's home directory - the
# notebook cannot be re-run on another machine without editing this line.
ratings = pd.read_csv("/home/paul/Mindscope/Mindscope/Movie-Recomendation-System-FastAPI-React-/backend/ml-latest-small/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Load the movie metadata table from the ml-latest-small folder and preview it.
# NOTE(review): absolute path into one developer's home directory - the
# notebook cannot be re-run on another machine without editing this line.
movies = pd.read_csv("/home/paul/Mindscope/Mindscope/Movie-Recomendation-System-FastAPI-React-/backend/ml-latest-small/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Attach the movie metadata to every rating row. A left join keeps each
# rating even if its movieId has no match in the movies table.
df = pd.merge(ratings, movies, how="left", on="movieId")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [9]:
# Dimensions of the merged frame as (rows, columns)
df.shape

(100836, 6)

In [10]:
# Column dtypes and non-null counts of the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


In [20]:
# Activity thresholds: a user / a movie is kept only when it has at least
# this many ratings. Named here so they can be tuned in one place.
MIN_RATINGS_PER_USER = 100
MIN_RATINGS_PER_MOVIE = 50

# Number of ratings recorded for each user and for each movie
users_count = df["userId"].value_counts()
movies_count = df["movieId"].value_counts()

# Ids that reach the thresholds.
# NOTE(review): `movies` held the movies DataFrame until now; it is rebound to
# an Index of movieIds here because the following cell reads `movies.values`.
# Re-running the merge cell after this one will therefore fail.
users = users_count[users_count >= MIN_RATINGS_PER_USER].index
movies = movies_count[movies_count >= MIN_RATINGS_PER_MOVIE].index

In [21]:
# Keep only the ratings whose user AND movie both passed the activity filter
is_kept_user = df["userId"].isin(users.values)
is_kept_movie = df["movieId"].isin(movies.values)
df = df.loc[is_kept_user & is_kept_movie]
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [22]:
# A user may appear more than once for the same title; keep the first row only
df = df.drop_duplicates(subset=["userId", "title"], keep="first")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [23]:
# Count the missing values in each column
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [24]:
# Remove every row that has a missing value in any column
df = df.dropna(axis="index", how="any")

In [25]:
# Dimensions of the frame after filtering, de-duplication and dropna
df.shape

(31435, 6)

In [27]:
# Title-by-user rating matrix: one row per movie title, one column per user.
# A movie the user has not rated becomes 0.
df_pivot = (
    df.pivot(index="title", columns="userId", values="rating")
    .fillna(0)
)
df_pivot.head(3)

userId,1,4,6,7,10,15,17,18,19,20,...,600,601,602,603,604,605,606,607,608,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,3.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,3.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,4.5


In [30]:
# Store the rating matrix in compressed sparse row form
df_matrix = csr_matrix(df_pivot.to_numpy())
df_matrix

<450x248 sparse matrix of type '<class 'numpy.float64'>'
	with 31435 stored elements in Compressed Sparse Row format>

In [31]:
# Fit a nearest-neighbour index over the rows of the sparse rating matrix
# (one row per movie title), using cosine distance between rating vectors.
nn = NearestNeighbors(metric="cosine")
nn.fit(df_matrix)

In [52]:
# Destination file for the fitted model.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
filepath = "/home/paul/Mindscope/Mindscope/Movie-Recomendation-System-FastAPI-React-/backend/notebooks/knnRecom_model"

# Persist the fitted model: joblib.dump(object_to_save, destination_path)
joblib.dump(nn, filepath)

['/home/paul/Mindscope/Mindscope/Movie-Recomendation-System-FastAPI-React-/backend/notebooks/knnRecom_model']

In [53]:
# Function to return recommended movies - This will be put to test
def get_recommended_movies(movie = "", n_recommendations = 5):
    """Return the movies whose rating vectors are closest to ``movie``.

    The lookup uses the notebook-level ``nn`` model and the ``df_pivot``
    title-by-user rating matrix.

    Parameters
    ----------
    movie : str
        Title exactly as it appears in the index of ``df_pivot``.
    n_recommendations : int, default 5
        How many neighbours to return. The queried movie itself is never
        part of the result.

    Returns
    -------
    list
        ``[movie, [[title, distance], ...]]``. The inner list is ordered from
        the largest distance to the smallest, i.e. the most similar movie is
        the last entry.

    Raises
    ------
    KeyError
        If ``movie`` is not a row of ``df_pivot``.
    ValueError
        If ``n_recommendations`` is smaller than 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")
    if movie not in df_pivot.index:
        raise KeyError(f"{movie!r} is not a title in the rating matrix")

    query_position = df_pivot.index.get_loc(movie)

    # Ask for one extra neighbour because the movie matches itself, but never
    # for more rows than the matrix holds.
    n_neighbors = min(n_recommendations + 1, len(df_pivot))

    # Pass our movie through the model
    distance, movie_info = nn.kneighbors([df_pivot.loc[movie]], n_neighbors, return_distance=True)
    neighbor_positions = np.asarray(movie_info[0])
    neighbor_distances = np.asarray(distance[0])

    # Remove the queried movie by position instead of assuming it is the
    # nearest hit: a movie with an identical rating vector ties with it at
    # distance 0 and may be returned first.
    keep = neighbor_positions != query_position
    neighbor_positions = neighbor_positions[keep][:n_recommendations]
    neighbor_distances = neighbor_distances[keep][:n_recommendations]

    # Reverse so the result runs from the farthest to the nearest neighbour
    recom_movie_info = df_pivot.index[neighbor_positions[::-1]].to_list()
    recom_distance = neighbor_distances[::-1].tolist()

    return [movie, [[title, dist] for title, dist in zip(recom_movie_info, recom_distance)]]
    

> Grumpier Old Men (1995)

> Heat (1995)

In [54]:
# Example query: neighbours of a comedy title
get_recommended_movies("Grumpier Old Men (1995)")

['Grumpier Old Men (1995)',
 [['Tombstone (1993)', 0.49693010033317686],
  ['Dumb & Dumber (Dumb and Dumber) (1994)', 0.4903369491963304],
  ["City Slickers II: The Legend of Curly's Gold (1994)", 0.4745474407376241],
  ['Nutty Professor, The (1996)', 0.4707500358819736],
  ['Tommy Boy (1995)', 0.4098912500027274]]]

In [55]:
# Example query: neighbours of a crime thriller title
get_recommended_movies("Heat (1995)")

['Heat (1995)',
 [['Pulp Fiction (1994)', 0.4241238045319198],
  ['Braveheart (1995)', 0.4220836658185725],
  ['Fargo (1996)', 0.4187649676026869],
  ['Fugitive, The (1993)', 0.41449791490613563],
  ['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 0.3986696203976553]]]

In [56]:
# Example query: the title contains a comma, so it must be passed exactly as stored
get_recommended_movies("Usual Suspects, The (1995)")

['Usual Suspects, The (1995)',
 [['Godfather, The (1972)', 0.296000066874161],
  ['Fargo (1996)', 0.2949988048797817],
  ['Goodfellas (1990)', 0.28834666771532846],
  ['Reservoir Dogs (1992)', 0.26835682299902974],
  ['Pulp Fiction (1994)', 0.22377632151857574]]]

In [57]:
# Print the raw recommendation list that the check below validates.
# NOTE(review): this rebinds `movies`, which earlier cells used first for the
# movies DataFrame and then for an Index of movieIds.
movies = get_recommended_movies("Heat (1995)")
print(movies)

def test_movie_recommendation():
    test_pass = True
    recommends = get_recommended_movies("Heat (1995)")
    if recommends[0] != "Heat (1995)":
        test_pass = False
    recommended_movies = ["Pulp Fiction (1994)", 
                          "Braveheart (1995)",
                          "Fargo (1996)",
                          "Fugitive, The (1993)"]
    recommended_movies_dist = [0.424, 0.422, 0.419, 0.414]
    
    for i in range(2):
        if recommends[1][i][0] not in recommended_movies:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_movies_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge!")
    else:
        print("You haven't passed yet. Keep trying!")
        

# Run the check; it prints a pass/fail message
test_movie_recommendation()

['Heat (1995)', [['Pulp Fiction (1994)', 0.4241238045319198], ['Braveheart (1995)', 0.4220836658185725], ['Fargo (1996)', 0.4187649676026869], ['Fugitive, The (1993)', 0.41449791490613563], ['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 0.3986696203976553]]]
You passed the challenge!


In [58]:
# Load the model back from the file written by joblib.dump above.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
nn_model = joblib.load(filepath)
nn_model

In [60]:
# Function to return recommended movies - This will be put to test
# (redefinition that queries the model reloaded from disk, `nn_model`)
def get_recommended_movies(movie = "", n_recommendations = 5):
    """Return the movies whose rating vectors are closest to ``movie``.

    The lookup uses the reloaded ``nn_model`` and the ``df_pivot``
    title-by-user rating matrix.

    Parameters
    ----------
    movie : str
        Title exactly as it appears in the index of ``df_pivot``.
    n_recommendations : int, default 5
        How many neighbours to return. The queried movie itself is never
        part of the result.

    Returns
    -------
    list
        ``[movie, [[title, distance], ...]]``. The inner list is ordered from
        the largest distance to the smallest, i.e. the most similar movie is
        the last entry.

    Raises
    ------
    KeyError
        If ``movie`` is not a row of ``df_pivot``.
    ValueError
        If ``n_recommendations`` is smaller than 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")
    if movie not in df_pivot.index:
        raise KeyError(f"{movie!r} is not a title in the rating matrix")

    query_position = df_pivot.index.get_loc(movie)

    # Ask for one extra neighbour because the movie matches itself, but never
    # for more rows than the matrix holds.
    n_neighbors = min(n_recommendations + 1, len(df_pivot))

    # Pass our movie through the reloaded model
    distance, movie_info = nn_model.kneighbors([df_pivot.loc[movie]], n_neighbors, return_distance=True)
    neighbor_positions = np.asarray(movie_info[0])
    neighbor_distances = np.asarray(distance[0])

    # Remove the queried movie by position instead of assuming it is the
    # nearest hit: a movie with an identical rating vector ties with it at
    # distance 0 and may be returned first.
    keep = neighbor_positions != query_position
    neighbor_positions = neighbor_positions[keep][:n_recommendations]
    neighbor_distances = neighbor_distances[keep][:n_recommendations]

    # Reverse so the result runs from the farthest to the nearest neighbour
    recom_movie_info = df_pivot.index[neighbor_positions[::-1]].to_list()
    recom_distance = neighbor_distances[::-1].tolist()

    return [movie, [[title, dist] for title, dist in zip(recom_movie_info, recom_distance)]]

In [61]:
# Same query as before, now answered by the model reloaded from disk
get_recommended_movies("Heat (1995)")

['Heat (1995)',
 [['Pulp Fiction (1994)', 0.4241238045319198],
  ['Braveheart (1995)', 0.4220836658185725],
  ['Fargo (1996)', 0.4187649676026869],
  ['Fugitive, The (1993)', 0.41449791490613563],
  ['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 0.3986696203976553]]]