In [7]:
# source code https://www.analyticsvidhya.com/blog/2020/11/create-your-own-movie-movie-recommendation-system/
# datasets https://grouplens.org/datasets/movielens/latest/

In [8]:
# import necessary packages
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# MovieLens data (https://grouplens.org/datasets/movielens/latest/):
# movie metadata and the individual user ratings
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# movie-by-user rating matrix built from the ratings data:
# one row per unique movieId, one column per unique userId;
# a (movie, user) pair without a rating is stored as 0 instead of NaN
dataset = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)

# number of ratings each movie received, and number of ratings each user gave
no_user_voted = ratings.groupby('movieId')['rating'].count()
no_movies_voted = ratings.groupby('userId')['rating'].count()

In [10]:
# this is the function we will use to recommend a desired number of movies based on certain threshold criteria
# user_threshold: a movie qualifies only if more than this many users have voted for it
# movie_threshold: a user's opinions qualify only if they have voted for more than this many movies
# number_to_recommend is the number of movies we want the recommendation system to output
# plot = 'off' is the default setting, can change this to 'on' to visualize the voting based on the thresholds

def function(user_threshold, movie_threshold, movie_name, number_to_recommend, plot = 'off'):
    """Recommend movies similar to ``movie_name`` with item-based KNN (cosine distance).

    Uses the notebook-level objects ``movies``, ``dataset``, ``no_user_voted``
    and ``no_movies_voted`` created in the data-loading cell.

    Parameters
    ----------
    user_threshold : number
        A movie is kept only if strictly more than this many users voted for it.
    movie_threshold : number
        A user is kept only if they voted for strictly more than this many movies.
    movie_name : str
        Text searched for in ``movies['title']``. It is matched literally
        (not as a regular expression) and case-sensitively, so full titles
        containing parentheses such as "Blade (1998)" can be used.
    number_to_recommend : int
        Maximum number of recommendations to return. Fewer rows are returned
        when the filtered dataset does not contain enough other movies.
    plot : str, default 'off'
        Pass 'on' to draw the vote-count scatter plots with the thresholds
        marked as horizontal red lines.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``Title`` and ``Distance`` indexed from 1. Rows
        are the nearest neighbours of the query movie, ordered from the
        largest to the smallest distance (the closest movie is the last row).
        A message string is returned instead when no title matches or when no
        matching movie survives the thresholds.
    """
    if plot == 'on':
        # visualize the number of users who voted with the [user threshold]
        _, ax = plt.subplots(1, 1, figsize=(8, 4))
        ax.scatter(no_user_voted.index, no_user_voted, color='blue')
        ax.axhline(y=user_threshold, color='r')
        ax.set_xlabel('MovieId')
        ax.set_ylabel('No. of users voted')
        plt.show()

        # visualize the number of votes by each user with the [movie threshold]
        _, ax = plt.subplots(1, 1, figsize=(8, 4))
        ax.scatter(no_movies_voted.index, no_movies_voted, color='green')
        ax.axhline(y=movie_threshold, color='r')
        ax.set_xlabel('UserId')
        ax.set_ylabel('No. of votes by user')
        plt.show()

    # check if the movie name input is in the database before doing any heavy work;
    # regex=False so characters like "(" or "+" in a title are matched literally,
    # na=False so missing titles count as "no match" instead of breaking the mask
    movie_list = movies[movies['title'].str.contains(movie_name, regex=False, na=False)]
    if movie_list.empty:
        return "No movies found. Please check your input"

    # keep only movies with more than [user threshold] votes and
    # users who voted for more than [movie threshold] movies
    qualified_movies = no_user_voted[no_user_voted > user_threshold].index
    qualified_users = no_movies_voted[no_movies_voted > movie_threshold].index
    final_dataset = dataset.loc[qualified_movies, qualified_users]

    # the query is the first title match that is still present after filtering
    # (a match removed by the thresholds cannot be looked up in the matrix)
    candidates = movie_list.loc[movie_list['movieId'].isin(final_dataset.index), 'movieId']
    if candidates.empty:
        return ("Movie found, but it does not pass the thresholds. "
                "Please lower user_threshold or movie_threshold")
    query_row = final_dataset.index.get_loc(candidates.iloc[0])

    # store the mostly-zero rating matrix in sparse (CSR) form
    csr_data = csr_matrix(final_dataset.values)

    # using the KNN algorithm to compute similarity with cosine distance metric
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    knn.fit(csr_data)

    # ask for one extra neighbour because the query movie is its own neighbour,
    # but never for more rows than were fitted (sklearn raises otherwise)
    wanted = max(number_to_recommend, 0)
    n_neighbors = min(wanted + 1, csr_data.shape[0])
    distances, indices = knn.kneighbors(csr_data[query_row], n_neighbors=n_neighbors)

    # indices[0] / distances[0] stay 1-D even when a single neighbour is returned
    ranked = sorted(zip(indices[0].tolist(), distances[0].tolist()), key=lambda pair: pair[1])

    # drop the query movie by row position rather than assuming it sorts first
    # (ties in distance could place another movie ahead of it)
    neighbours = [(row, dist) for row, dist in ranked if row != query_row][:wanted]
    # present the farthest of the selected neighbours first, the closest last
    neighbours.reverse()

    # movieId -> title lookup that does not depend on how ``movies`` is indexed
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    recommend_frame = [
        {'Title': title_by_id.loc[final_dataset.index[row]], 'Distance': dist}
        for row, dist in neighbours
    ]

    # number the rows from 1 using the number of recommendations actually found
    return pd.DataFrame(recommend_frame,
                        columns=['Title', 'Distance'],
                        index=range(1, len(recommend_frame) + 1))

In [11]:
# this is Dan's Netflix viewing history, read from NetflixViewingHistoryDan.csv;
# the bare name on the last line displays the frame as the cell output
NVHD = pd.read_csv('NetflixViewingHistoryDan.csv')
NVHD

Unnamed: 0,Title,Date
0,Big Fish,3/28/22
1,Monsters vs. Aliens,3/19/22
2,Brand New Cherry Flavor: Limited Series: Tadpo...,3/10/22
3,Brand New Cherry Flavor: Limited Series: I Exist,3/10/22
4,Starship Troopers,3/9/22
...,...,...
973,Battle for Haditha,3/14/15
974,Blackfish,3/14/15
975,The Immigrant,2/24/15
976,Elsa & Fred,2/23/15


In [14]:
# this is Dan's "Home" Netflix viewing history, read from NetflixViewingHistoryHome.csv;
# the bare name on the last line displays the frame as the cell output
NVHH = pd.read_csv('NetflixViewingHistoryHome.csv')
NVHH  

Unnamed: 0,Title,Date
0,Bridgerton: Season 2: Capital R Rake,3/29/22
1,The Adam Project,3/26/22
2,Bridgerton: Season 1: After the Rain,3/25/22
3,Shameless (U.S.): Season 1: Frank Gallagher: L...,3/24/22
4,Cobra Kai: Season 2: Pulpo,3/20/22
...,...,...
2785,Shake It Up: Season 1: Start It Up!,10/8/12
2786,MythBusters: Collection 1: Exploding Toilet,10/5/12
2787,Iron Man 2,10/4/12
2788,Hachi: A Dog's Tale,10/4/12


In [21]:
# argument order: user_threshold, movie_threshold, movie_name, number_to_recommend, plot (default 'off')
function(1, 1, 'Starship Troopers', 10, plot = 'off')
# alternative example with higher thresholds:
# function(20, 50, 'Guardians of the Galaxy', 10, plot = 'off')

Unnamed: 0,Title,Distance
1,Predator (1987),0.474892
2,Indiana Jones and the Temple of Doom (1984),0.466659
3,"Terminator, The (1984)",0.466263
4,Air Force One (1997),0.464702
5,Total Recall (1990),0.460437
6,Blade (1998),0.457026
7,Face/Off (1997),0.454711
8,Mars Attacks! (1996),0.453924
9,RoboCop (1987),0.450209
10,"Fifth Element, The (1997)",0.431766
