# Movie Recommendor System using k-Nearest Neighbor method

In [None]:
import numpy as np
import os
import math
import matplotlib as plt
from numpy import linalg as LA
import matplotlib.pyplot as plt
from numpy import dot
from numpy.linalg import norm
import pandas as pd
from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Read training data (raw)
1. Read from the .txt files
2. convert to .csv
3. Read the .csv into Pandas dataframe

In [None]:
# DataFrame to store all imported data
if not os.path.isfile('data.csv'):
    data = open('data.csv', mode='w')

files = ['/content/drive/My Drive/NetflixChallenge/archive/combined_data_1.txt',
         #'/content/drive/My Drive/NetflixChallenge/archive/combined_data_2.txt',
     #     '/content/drive/My Drive/NetflixChallenge/archive/combined_data_3.txt',  
       #   '/content/drive/My Drive/NetflixChallenge/archive/combined_data_4.txt'
        ]

# Remove the line with movie_id: and add a new column of movie_id
# Combine all data files into a csv file
for file in files:
    print("Opening file: {}".format(file))
    with open(file) as f:
        for line in f:
            line = line.strip()
            if line.endswith(':'):
                movie_id = line.replace(':', '')
            else:
                size = len(line)
                line = line[:size - 11]
                data.write(movie_id + ',' + line)
                data.write('\n')
data.close()

# Read all data into a pd dataframe
df = pd.read_csv('data.csv', names=['movie_id', 'user_id','rating'])
print("The data count is as follows: ")
print(df.nunique())

# Analyze Training data (raw)
Top 'k' rated movies (this data is extracted for visualization purposes only)

In [None]:
Top_k = 25

group = df.groupby('movie_id')['rating'].count()
Top_k_movies = group.sort_values(ascending=False)[:Top_k]
#print(Top_k_movies)
Top_k_movies.plot.bar();

Top K users (who rated more often)

In [None]:
Top_K_users = 5000
group = df.groupby('user_id')['rating'].count()
Top_K_users = group.sort_values(ascending=False)[:Top_K_users]
print(Top_K_users)

In [None]:
# Keep only Top_K_users for further processing
lite_rating_df = df.join(Top_K_users, rsuffix='_total', how='inner', on='user_id')
print(lite_rating_df.nunique())
lite_rating_df

In [None]:
#Convert dataframe to numpy matrices
X = lite_rating_df[['user_id', 'movie_id']].values
y = lite_rating_df['rating'].values
print(X)
print(y)
print(X.shape)

# Read Test Data

In [None]:
# DataFrame to store all imported data
if not os.path.isfile('Testdata.csv'):
    data = open('Testdata.csv', mode='w')

files = ['/content/drive/My Drive/NetflixChallenge/archive/probe.txt']

# Remove the line with movie_id: and add a new column of movie_id
# Combine all data files into a csv file
for file in files:
  print("Opening file: {}".format(file))
  with open(file) as f:
    for line in f:
        line = line.strip()
        if line.endswith(':'):
            movie_id = line.replace(':', '')
        else:
            size = len(line)
            line = line[:size]
            data.write(movie_id + ',' + line)
            data.write('\n')
data.close()

# Read all data into a pd dataframe
df_Test = pd.read_csv('Testdata.csv', names=['movie_id', 'user_id'])
print("The data count is as follows: ")
print(df_Test.nunique())

# Preprocess Test data

In [None]:
# This pre-processing was necessary because of the pre-processing step that was applied to the Training set
# When pre-processing was applied to the training set, only Top-K users were kept
# So, some of the data was lost from the training set for the users in the Test Set.
# This meant that same pre-processing had to be applied to the Test Set to apply User-User collaborative filtering
lite_rating_df_Test = df_Test.join(Top_K_users, how='inner', on='user_id')
print(lite_rating_df_Test.nunique())
lite_rating_df_Test

Further, prediction was only made for subset of the data in the Test Set for the sake of time and simply to evaluate the efficacy of the method proposed in this project.

In [None]:
TestSet = lite_rating_df_Test[['movie_id', 'user_id']].values
TestSet_truncated = TestSet[TestSet[:,0]<=1000] 
print(TestSet_truncated)
print(TestSet_truncated.shape)

# Predict the ratings of various movies for the users in the Test Data

### Method 1: 
Collaborative Filtering based on User-User Similarity In this method, the user for whom the prediction is to be made is compared with other users in the training set. The method uses Pearson Coefficient to find the similarity of this user with other users. Then, top 'NN' similar users are picked and the final prediction is made based on the similar users. (The 'NN' is defined in the steps below)

In [None]:
from collections import Counter

MAS_cumulative_UserUserMtd1 = 0 # Cumulative Error (Mean Absolute Error)
RMSE_cumulative_UserUserMtd1 = 0 # Cumulatve Error (Root Mean Squared Error)
Total_predictions_UserUserMtd1 = [] #Store all predictions made using Method 1
Actual_rating_ALL = [] # Store all actual predictions
Target_Users_ALL = []
Target_Movie_ALL = []

# Make predictions for all the data in the TestSet_Truncated
#for i in range(0, TestSet_truncated.shape[0]):
for i in range(0,100):
    target_movie = TestSet_truncated[i,:][0]
    target_user = TestSet_truncated[i,:][1]
    Target_Users_ALL.append(target_user)
    Target_Movie_ALL.append(target_movie)

  # Find all the other ratings made by target_user and the corresponding movies
    OtherMoviesRated_target_user = (X[X[:,0]==target_user])[:,1] # movies rated
    OtherRatings_target_user = (y[X[:,0]==target_user]) # ratings for movies rated
    Average_rating_target_user = sum(OtherRatings_target_user )/len(OtherRatings_target_user) #average rating for the target user

  ### Find similarity of the target_user with other users
  # [1] First, find other users who have also watched (one or  more) of the same movies as the target_user
    OtherUsers = [target_user] #Initialize list
    for common_movie in OtherMoviesRated_target_user:
        OtherUsers_temp = (X[X[:,1]==common_movie])[:,0]
        OtherUsers_temp = OtherUsers_temp.tolist()
        OtherUsers = OtherUsers + OtherUsers_temp
        OtherUsers = list(set(OtherUsers)) #Remove duplicate Users form the list
    OtherUsers = np.array(OtherUsers)
    OtherUsersTotal = OtherUsers.shape[0] #Total users who shares movies rated with target_user

  # [2] Next, create a list of movies that have been rated by both (target_user and the other user)
  # Then, use this set to find the similarity between the target user and this user using the Pearson Coefficient
    TotalCommonMovies = [] #a list that stores the movies that the target_user has in common with every other user
    PearsonCoefficient = []
    Average_ratings = [] #a list containing  avg rating for all users
    Rating_for_target_movie = [] #a list containing rating by other users for the target_movie
    for user in OtherUsers:
        movies_rated_ThisUser = (X[X[:,0]==user])[:,1] 
        movies_rated_ThisUser = movies_rated_ThisUser.tolist()
        All_rating_ThisUser = (y[X[:,0]==user]) # ratings for movies rated
        avg_rating_ThisUser = sum(All_rating_ThisUser)/len(All_rating_ThisUser)
        Average_ratings.append(avg_rating_ThisUser)
        All_movies_Set = movies_rated_ThisUser + OtherMoviesRated_target_user.tolist()
        common_movies_Set = ([item for item, count in Counter(All_movies_Set).items() if count > 1])
        TotalCommonMovies.append(len(common_movies_Set))
        target_rating = []
        ThisUser_rating = []
        Xmovies_for_this_user = (X[X[:,0]==user]) # create a subset of X which contains only movies rated by ThisUser
        Yratings_for_this_user = (y[X[:,0]==user]) # Create a subset of y which corresponds to the above subset of X
        Rating_target_movie_ThisUser = (Yratings_for_this_user[Xmovies_for_this_user [:,1]==target_movie]) # rating of the target movie by This User
        if (len(Rating_target_movie_ThisUser)==0):
            Rating_target_movie_ThisUser = 0
        else:
            Rating_target_movie_ThisUser = Rating_target_movie_ThisUser[0]
        Rating_for_target_movie.append(Rating_target_movie_ThisUser)

        for mv in common_movies_Set:
            ratingUser = (Yratings_for_this_user[Xmovies_for_this_user[:,1]==mv])[0]
            ratingTargetUser = OtherRatings_target_user[OtherMoviesRated_target_user==mv][0]
            target_rating.append(ratingTargetUser)
            ThisUser_rating.append(ratingUser)
            target_rating = np.asarray(target_rating)
            ThisUser_rating = np.asarray(ThisUser_rating)
        if (np.unique(target_rating).shape[0]==1 or np.unique(ThisUser_rating).shape[0]==1):
            PearsonCoeff = np.array([[0,0],[0,0]])
        else:
            PearsonCoeff = np.corrcoef(target_rating, ThisUser_rating)
        PearsonCoefficient.append(PearsonCoeff[0,1])

  # [3] Post process the Similarity score.
  # To this end, we know the similarity score between the target_user and every other user who saw the same movies as the target user
  # But the problem is that target_user + UserA may have watched 200 movies together and based on that , the similarity score is 0.2
  # And then target_user + UserB may have watched only 10 movies together and based on that, the similarity score is 0.9
  # Here we cannot simply conclude that UserB is more similar to target user than UserA.
  # So, we have to compensate for the imbalanced data used for finding the similarity scores.
    norm_TotalCommonMovies = [float(z)/sum(TotalCommonMovies) for z in TotalCommonMovies]
    Corrected_PearsonCoefficient = np.multiply(PearsonCoefficient,norm_TotalCommonMovies)

  # [4] Select Top 50 nearest neighbors, whenever avaialble, otherwise just select all the neighbors if neighbors are less than 50
    NN = 50
    if (np.array(Corrected_PearsonCoefficient).shape[0]<50):
        NN = (np.array(Corrected_PearsonCoefficient).shape[0])

    TopNNneighers_target_User = np.argpartition(Corrected_PearsonCoefficient, -NN)[-NN:] # This gives the indices of top NN Pearson Coefficients

  # [5] Predict the ratings of the target_user using the weighted average of NN nearest neighbors 
    num=0
    den=0
    itr=0
    for idx in TopNNneighers_target_User:
        num = num + Corrected_PearsonCoefficient[idx]* (Rating_for_target_movie[idx] - Average_ratings[idx])
        den = den + Corrected_PearsonCoefficient[idx] 
    prediction = Average_rating_target_user + num/den # This is the predicted rating for the target_user
    Total_predictions_UserUserMtd1.append(prediction)

  # [6] Find the error in the prediction
  # Compare the predicted rating to the actual rating
    Actual_rating = (OtherRatings_target_user[OtherMoviesRated_target_user==target_movie])[0]
    Actual_rating_ALL.append(Actual_rating) #Store all actual predictions
    err = (Actual_rating - prediction)
    MAS_cumulative_UserUserMtd1 = MAS_cumulative_UserUserMtd1 + abs(err) 
    RMSE_cumulative_UserUserMtd1 = RMSE_cumulative_UserUserMtd1+ err*err 

### Method 1: Evaluation

In [None]:
# Final error for all predictions
MAS = MAS_cumulative_UserUserMtd1/len(Total_predictions_UserUserMtd1)
RMSE = np.sqrt(RMSE_cumulative_UserUserMtd1/len(Total_predictions_UserUserMtd1))
print("The Mean Absolute Error for User-User Similarity Collaborative Filtering is: ", MAS)
print("The Root Mean Squared Error for User-User Similarity Collaborative Filtering is: ", RMSE)

print(Total_predictions_UserUserMtd1) #Store all predictions made using Method 1
print(Actual_rating_ALL) # Store all actual predictions
#print(Target_Users_ALL)
#print(Target_Movie_ALL)

The Mean Absolute Error for User-User Similarity Collaborative Filtering is:  1.8464365047616664

The Root Mean Squared Error for User-User Similarity Collaborative Filtering is:  2.2133092950241102

### Method 2: Movie - Movie Similarity

Training Set

In [None]:
# Keep only Top 5000 users further processing
lite_rating_df = pd.DataFrame()
group = df.groupby('user_id')['rating'].count()
top_users = group.sort_values(ascending=False)[:5000]
group = df.groupby('movie_id')['rating'].count()
top_movies = group.sort_values(ascending=False)[:100]
lite_rating_df = df.join(top_users, rsuffix='_r', how='inner', on='user_id')
#lite_rating_df = lite_rating_df.join(top_movies, rsuffix='_r', how='inner', on='movie_id')
print(lite_rating_df.nunique())
#lite_rating_df

In [None]:
#Convert dataframe to numpy matrices
X = lite_rating_df[['user_id', 'movie_id']].values
y = lite_rating_df['rating'].values
print(X.shape)

print(X)
print(y)
print(X.shape)

Test set

In [None]:
# This pre-processing was necessary because of the pre-processing step that was applied to the Training set
# When pre-processing was applied to the training set, only Top-K users were kept
# So, some of the data was lost from the training set for the users in the Test Set.
# This meant that same pre-processing had to be applied to the Test Set to apply User-User collaborative filtering

lite_rating_df_Test = pd.DataFrame()
lite_rating_df_Test = df_Test.join(top_users, how='inner', on='user_id')
#lite_rating_df_Test = lite_rating_df_Test.join(top_movies, how='inner',lsuffix='_left', rsuffix='_right', on='movie_id')
print(lite_rating_df_Test.nunique())
#lite_rating_df_Test

In [None]:
TestSet = lite_rating_df_Test[['movie_id', 'user_id']].values
TestSet_truncated = TestSet[TestSet[:,0]<=1000] 
#print(TestSet_truncated)
print(TestSet_truncated.shape)

In [None]:
from collections import Counter

Unique_users = np.unique(X[:,0])
MAS_cumulative_UserUserMtd2 = 0 # Cumulative Error (Mean Absolute Error)
RMSE_cumulative_UserUserMtd2 = 0 # Cumulatve Error (Root Mean Squared Error)
Total_predictions_UserUserMtd2 = [] #Store all predictions made using Method 2
Actual_rating_ALL = [] # Store all actual predictions
Target_Users_ALL = []
Target_Movie_ALL = []
user_movie_matrix = np.zeros((3,5000)) #we have 5000 unique users, so 5000 maximum possible ratings for a particular movie
user_movie_matrix[0,:] = Unique_users

# Make predictions for all the data in the TestSet_Truncated
#for i in range(0, TestSet_truncated.shape[0]):
for i in range(0,100):
    target_movie = TestSet_truncated[i,:][0]
    target_user = TestSet_truncated[i,:][1]
    Target_Users_ALL.append(target_user)
    Target_Movie_ALL.append(target_movie)

  #Find all the other ratings made by target_user and the corresponding movies
    OtherMoviesRated_target_user = (X[X[:,0]==target_user])[:,1] # movies rated
    OtherRatings_target_user = (y[X[:,0]==target_user]) # ratings for movies rated
    Average_rating_target_user = sum(OtherRatings_target_user )/len(OtherRatings_target_user) #average rating for the target user

  #### Find all the users who have rated the target_movie and their ratings for this movie
    All_ratings_target_movie = (y[X[:,1]==target_movie])# ratings
    All_users_target_movie = (X[X[:,1]==target_movie])[:,0]# users
    for pp in range(0,len(All_ratings_target_movie)):
        row = 1
        col = np.where(user_movie_matrix[0,:]==All_users_target_movie[pp])[0][0]
        user_movie_matrix[row,col] = All_ratings_target_movie[pp] 

  #Next, compare target movie with other movies that this target_user has watched.
  # [1] Find the Pearson coefficients
    Pearson = []
    Average_rating_movies = []
    for zz in range(0,len(OtherMoviesRated_target_user)):
        other_movie = OtherMoviesRated_target_user[zz]
        User_who_rated_other_movie = (X[X[:,1]==other_movie])[:,0]#users
        Users_ratings_other_movie =  (y[X[:,1]==other_movie]) #ratings
        Average_other_movie = sum(Users_ratings_other_movie)/len(Users_ratings_other_movie) #average rating for this movie
        Average_rating_movies.append(Average_other_movie)
        for nn in range(0,len(User_who_rated_other_movie)):
            row = 2
            col = np.where(user_movie_matrix[0,:]==User_who_rated_other_movie[nn])[0][0]
            user_movie_matrix[row,col] = Users_ratings_other_movie[nn]
        #Find Pearson coeffiecnt between the target_movie and other_movie
        PearsonCoeff = np.corrcoef(user_movie_matrix[1,:], user_movie_matrix[2,:])
        Pearson.append(PearsonCoeff[0,1])

  # [2] Select Top 50 nearest neighbors, whenever avaialble, otherwise just select all the neighbors if neighbors are less than 50
    NN = 50
    if (np.array(Pearson).shape[0]<50):
        NN = (np.array(Pearson).shape[0])

    TopNNneighers_target_Movie = np.argpartition(Pearson, -NN)[-NN:] # This gives the indices of top NN Pearson Coefficients

  # [3] Predict the ratings of the target_user using the weighted average of NN nearest neighbors (movies)
    num=0
    den=0
    itr=0
    for idx in TopNNneighers_target_Movie:
        num = num + Pearson[idx]* (OtherRatings_target_user[idx] - Average_rating_movies[idx])  
        den = den + Pearson[idx] 
    prediction = Average_rating_target_user + num/den # This is the predicted rating for the target_user
    Total_predictions_UserUserMtd2.append(prediction)

  # [4] Find the error in the prediction
  # Compare the predicted rating to the actual rating
    Actual_rating = (OtherRatings_target_user[OtherMoviesRated_target_user==target_movie])[0]
    Actual_rating_ALL.append(Actual_rating) #Store all actual predictions
    err = (Actual_rating - prediction)
    MAS_cumulative_UserUserMtd2 = MAS_cumulative_UserUserMtd2 + abs(err) 
    RMSE_cumulative_UserUserMtd2 = RMSE_cumulative_UserUserMtd2+ err*err 

### Method 2: Evaluation

In [None]:
# Final error for all predictions
MAS = MAS_cumulative_UserUserMtd2/len(Total_predictions_UserUserMtd2)
RMSE = np.sqrt(RMSE_cumulative_UserUserMtd2/len(Total_predictions_UserUserMtd2))
print("The Mean Absolute Error for Movie-Movie Similarity Collaborative Filtering is: ", MAS)
print("The Root Mean Squared Error for Movie-Movie Similarity Collaborative Filtering is: ", RMSE)

print(Total_predictions_UserUserMtd2) #Store all predictions made using Method 1
print(Actual_rating_ALL) # Store all actual predictions
print(Target_Users_ALL)
print(Target_Movie_ALL)

The Mean Absolute Error for Movie-Movie Similarity Collaborative Filtering is:  0.777954443277451
    
The Root Mean Squared Error for Movie-Movie Similarity Collaborative Filtering is:  1.0604772353326954