## Reference : https://beckernick.github.io/matrix-factorization-recommender/

In [1]:
import pandas as pd
import numpy as np

In [2]:
file1 = open("u.genre.txt","r")
file2 = open("u.item.txt","r")
file3 = open("u.data.txt","r")

In [3]:
genre = file1.read()
item = file2.read()
data = file3.read()

In [4]:
genre_list = genre.split('\n')
item_list = item.split('\n')
data_list = data.split('\n')

In [6]:
# Temporary list
dl=[] # data_list
gl = [] # genre_list
il = [] # item_list

# convert file data into array
for i in data_list:
    t = i.split('\t')
    dl.append(t)

for i in genre_list:
    t = i.split('|')
    gl.append(t)
    
for i in item_list:
    t = i.split('|')
    il.append(t)



In [7]:
# Column names for all list
data_col_names = ["UserID", "MovieID" , "Rating", "Timestamp"]
genre_col_names = ["label", "code"]
item_col_names = ["movie id", "movie title", "release date", "video release date", "IMDb URL", "unknown", "Action", "Adventure", "Animation", "Children's",
"Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir","Horror", "Musical", " Mystery", "Romance", "Sci-Fi", "Thriller",
"War","Western"]

In [8]:
# Converting 
ratings_df = pd.DataFrame(dl, columns = data_col_names)
genre_df = pd.DataFrame(gl, columns = genre_col_names)
item_df = pd.DataFrame(il, columns = item_col_names)

In [9]:
tmp = item_df.apply(lambda row: row[row == '1'].index, axis=1)
genre = []
for index, row in item_df.iterrows():
    tmplst = list(tmp[index])
    if 'movie id' in tmplst:
        tmplst.remove('movie id')
    genre.append('|'.join(tmplst))

In [10]:
item_df.insert(loc=5, column='genres', value=genre)

In [11]:
item_df.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,genres,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,Animation|Children's|Comedy,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,Action|Adventure|Thriller,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,Thriller,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,Action|Comedy|Drama,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),Crime|Drama|Thriller,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
movies_df = pd.DataFrame({'MovieID':item_df['movie id'].apply(pd.to_numeric), 'Title':item_df['movie title'], 'Genres': item_df['genres']})

In [13]:
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,GoldenEye (1995),Action|Adventure|Thriller
2,3,Four Rooms (1995),Thriller
3,4,Get Shorty (1995),Action|Comedy|Drama
4,5,Copycat (1995),Crime|Drama|Thriller


In [14]:
ratings_df['UserID'].apply(pd.to_numeric)
ratings_df['MovieID'].apply(pd.to_numeric)
ratings_df['Rating'].apply(pd.to_numeric)
ratings_df['Timestamp'].apply(pd.to_numeric)

0        881250949
1        891717742
2        878887116
3        880606923
4        886397596
           ...    
99995    880175444
99996    879795543
99997    874795795
99998    882399156
99999    879959583
Name: Timestamp, Length: 100000, dtype: int64

In [15]:
# reformating ratings_df to one row per user and one column per movie
R_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_df.head()

MovieID,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,4,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
101,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,3,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [16]:
R = R_df.values
R = R.astype(int)

In [17]:
user_ratings_mean = np.mean(R, axis = 1)

In [18]:
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [19]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)

In [20]:
sigma = np.diag(sigma)

In [21]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [23]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False))

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    print(user_full)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
#     recommendations = (movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])].
#          merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
#                left_on = 'MovieID',
#                right_on = 'MovieID').
#          rename(columns = {user_row_number: 'Predictions'}).
#          sort_values('Predictions', ascending = False).
#                        iloc[:num_recommendations, :-1])

    return user_full #, recommendations



In [25]:
already_rated, predictions = recommend_movies(preds_df, 100, movies_df, ratings_df, 10)

User 100 has already rated 0 movies.
Recommending the highest 10 predicted ratings movies not already rated.


ValueError: too many values to unpack (expected 2)