In [4]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

In [5]:

ratings_list = [i.strip().split("::") for i in open('ml-1m/ratings.dat', 'r').readlines()]
users_list = [i.strip().split("::") for i in open('ml-1m/users.dat', 'r').readlines()]
movies_list = [i.strip().split("::") for i in open('ml-1m/movies.dat', 'r').readlines()]



In [6]:
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = float)
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = movies_df['MovieID'].apply(pd.to_numeric)

In [7]:
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1.0,1193.0,5.0,978300760.0
1,1.0,661.0,3.0,978302109.0
2,1.0,914.0,3.0,978301968.0
3,1.0,3408.0,4.0,978300275.0
4,1.0,2355.0,5.0,978824291.0


In [8]:
R_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_df.head()

MovieID,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,3943.0,3944.0,3945.0,3946.0,3947.0,3948.0,3949.0,3950.0,3951.0,3952.0
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
R = R_df.values
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [10]:

U, sigma, Vt = svds(R_demeaned, k = 50)

In [11]:
sigma = np.diag(sigma)

In [12]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [13]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False)
                 )

    print(f'User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print(f'Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'MovieID',
               right_on = 'MovieID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

already_rated, predictions = recommend_movies(preds_df, 837, movies_df, ratings_df, 10)

User 0 has already rated 1 movies.
Recommending the highest 0 predicted ratings movies not already rated.


In [14]:
already_rated.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
36,837.0,858.0,5.0,975360036.0,"Godfather, The (1972)",Action|Crime|Drama
35,837.0,1387.0,5.0,975360036.0,Jaws (1975),Action|Horror
65,837.0,2028.0,5.0,975360089.0,Saving Private Ryan (1998),Action|Drama|War
63,837.0,1221.0,5.0,975360036.0,"Godfather: Part II, The (1974)",Action|Crime|Drama
11,837.0,913.0,5.0,975359921.0,"Maltese Falcon, The (1941)",Film-Noir|Mystery
20,837.0,3417.0,5.0,975360893.0,"Crimson Pirate, The (1952)",Adventure|Comedy|Sci-Fi
34,837.0,2186.0,4.0,975359955.0,Strangers on a Train (1951),Film-Noir|Thriller
55,837.0,2791.0,4.0,975360893.0,Airplane! (1980),Comedy
31,837.0,1188.0,4.0,975360920.0,Strictly Ballroom (1992),Comedy|Romance
28,837.0,1304.0,4.0,975360058.0,Butch Cassidy and the Sundance Kid (1969),Action|Comedy|Western
