In [1]:
# Hybrid Recommender Systems 
### Content Boosted Collaborative Filtering using Item Item Similarity and SUpervised learning

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
ratings = pd.read_csv('ratings.csv')

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
ratings.shape

(100004, 4)

In [6]:
movies = pd.read_csv('movies.csv')

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies.shape

(9125, 3)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
trainDF, tempDF = train_test_split(ratings, test_size=0.2, random_state=100)
testDF = tempDF.copy()
tempDF.rating = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
tempDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
18935,126,95,,833286594
53160,384,55282,,1214773174
69828,481,61323,,1437001971
31345,225,367,,845565857
85744,575,1269,,1012595493


In [12]:
tempDF

Unnamed: 0,userId,movieId,rating,timestamp
18935,126,95,,833286594
53160,384,55282,,1214773174
69828,481,61323,,1437001971
31345,225,367,,845565857
85744,575,1269,,1012595493
74526,518,2119,,945367279
87873,585,940,,975365053
1406,15,2108,,1058250482
38887,285,407,,965091466
64228,461,783,,1091049691


In [13]:
testDF = testDF.dropna()
testDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
18935,126,95,3.0,833286594
53160,384,55282,4.0,1214773174
69828,481,61323,4.0,1437001971
31345,225,367,5.0,845565857
85744,575,1269,4.0,1012595493


In [14]:
ratings = pd.concat([trainDF, tempDF]).reset_index()

In [15]:
ratings

Unnamed: 0,index,userId,movieId,rating,timestamp
0,15656,102,1717,4.0,957980283
1,63605,457,64839,4.5,1471384474
2,73657,514,880,1.0,853893138
3,95366,626,1983,4.0,974778174
4,11767,73,113573,3.0,1457597407
5,79509,547,4557,1.0,1021196447
6,27453,200,1088,1.0,1438020434
7,39071,285,1968,3.0,965088034
8,90731,603,2,4.0,868355874
9,31121,222,900,5.0,960919866


In [16]:
ratings.head()

Unnamed: 0,index,userId,movieId,rating,timestamp
0,15656,102,1717,4.0,957980283
1,63605,457,64839,4.5,1471384474
2,73657,514,880,1.0,853893138
3,95366,626,1983,4.0,974778174
4,11767,73,113573,3.0,1457597407


In [17]:
ratings.shape

(100004, 5)

## Matrix Factorization via Singular Value Decomposition

In [18]:
# We want the format of ratings matrix to be one row per user and one column per movie. 
#we can pivot ratings_df to get that and call the new variable R_df.
R_df = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
R_df.head

<bound method NDFrame.head of movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   
6           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
7           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
8           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
10          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
11          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.

In [19]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_df, k = 50)

In [20]:
#diag
sigma = np.diag(sigma)

In [21]:
#I also need to add the user means back to get the predicted 5-star ratings
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [22]:
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.122993,0.038639,0.037181,-0.003138,-0.006465,0.017602,0.039557,0.003053,0.012745,0.009584,...,-0.000184,-0.00203,0.019078,-0.001043,-0.001565,-0.008588,0.028617,0.0,0.0,-0.000367
1,0.76047,1.181292,-0.126992,0.026546,0.212633,0.14626,-0.015018,0.061514,-0.08065,2.020296,...,0.004404,0.00175,0.006564,-0.004064,-0.006096,-0.06343,0.009845,0.0,0.0,0.008807
2,0.623665,0.13124,0.009892,0.010293,0.179465,0.036324,0.036505,0.063805,-0.03109,0.001071,...,0.000846,-0.00239,0.011772,0.000766,0.001148,-0.003026,0.017658,0.0,0.0,0.001692
3,0.763973,1.041922,-0.540214,-0.009712,0.000124,-1.131016,-0.4511,-0.11328,-0.245913,1.882466,...,-0.003343,0.002289,0.048446,0.022485,0.033728,0.121327,0.072668,0.0,0.0,-0.006685
4,1.67938,1.252049,0.596602,0.027421,0.63547,0.391281,0.175542,0.02454,-0.052979,0.508043,...,0.007924,-0.003023,0.001158,-0.000388,-0.000581,0.025664,0.001737,0.0,0.0,0.015847


In [23]:
sigma

array([[ 63.25113225,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,  63.82679527,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,  64.22443521, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ..., 168.74567012,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
        198.81124125,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        , 415.58850449]])

In [24]:
# return the movies with the highest predicted rating that the specified user hasn’t already rated
#Take specific user row from matrix from predictions
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    #Added title and genres
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations, sorted_user_predictions, user_data, user_full

already_rated, predictions, sorted_user_predictions, user_data, user_full = recommend_movies(preds_df, 5, movies, ratings, 10)

User 5 has already rated 100 movies.
Recommending the highest 10 predicted ratings movies not already rated.


In [25]:
already_rated.head()

Unnamed: 0,index,userId,movieId,rating,timestamp,title,genres
38,379,5,1380,5.0,1163373044,Grease (1978),Comedy|Musical|Romance
31,368,5,597,5.0,1163373711,Pretty Woman (1990),Comedy|Romance
1,392,5,2081,5.0,1163373109,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
13,443,5,33166,5.0,1163374211,Crash (2004),Crime|Drama
47,438,5,8636,4.5,1163373593,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX


In [26]:
predictions

Unnamed: 0,movieId,title,genres
1221,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
31,34,Babe (1995),Children|Drama
5537,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4340,5989,Catch Me If You Can (2002),Crime|Drama
1861,2396,Shakespeare in Love (1998),Comedy|Drama|Romance
5308,8360,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...
3316,4246,Bridget Jones's Diary (2001),Comedy|Drama|Romance
469,539,Sleepless in Seattle (1993),Comedy|Drama|Romance
414,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [27]:
user_data

Unnamed: 0,index,userId,movieId,rating,timestamp
1288,376,5,1221,2.5,1163374239
2296,392,5,2081,5.0,1163373109
5690,373,5,1022,4.0,1163373316
5898,423,5,5349,4.5,1163373606
10938,396,5,2424,4.0,1163373193
11077,364,5,500,4.5,1163373718
12437,369,5,788,3.5,1163374993
13135,445,5,34162,4.5,1163374227
14473,353,5,104,4.0,1163374639
14809,441,5,30749,4.5,1163374702


In [28]:
user_full

Unnamed: 0,index,userId,movieId,rating,timestamp,title,genres
38,379,5,1380,5.0,1163373044,Grease (1978),Comedy|Musical|Romance
31,368,5,597,5.0,1163373711,Pretty Woman (1990),Comedy|Romance
1,392,5,2081,5.0,1163373109,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
13,443,5,33166,5.0,1163374211,Crash (2004),Crime|Drama
47,438,5,8636,4.5,1163373593,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
64,386,5,1784,4.5,1163374251,As Good as It Gets (1997),Comedy|Drama|Romance
66,387,5,1923,4.5,1163373726,There's Something About Mary (1998),Comedy|Romance
46,399,5,2694,4.5,1163373293,Big Daddy (1999),Comedy
17,450,5,48385,4.5,1163374357,Borat: Cultural Learnings of America for Make ...,Comedy
24,409,5,3897,4.5,1163374235,Almost Famous (2000),Drama


In [29]:
sorted_user_predictions

movieId
364       3.196379
1721      2.397927
356       2.378139
6377      2.274791
500       2.271439
1580      2.190969
2762      2.188013
586       2.148766
588       2.113278
2355      2.036110
595       1.955655
3897      1.943147
4306      1.934364
4022      1.889624
1923      1.871909
2706      1.795730
34        1.781223
2081      1.773039
8961      1.749024
858       1.748840
3114      1.733657
1         1.679380
5989      1.673257
4995      1.660613
2396      1.628298
597       1.615787
39        1.614191
8360      1.605806
4246      1.561705
1968      1.495308
            ...   
300      -0.365396
29       -0.378255
2021     -0.381467
750      -0.382454
1272     -0.382571
1320     -0.384824
91500    -0.386946
89745    -0.390620
112852   -0.398742
1198     -0.403069
79132    -0.403191
1235     -0.407573
122882   -0.414144
109487   -0.423959
534      -0.440795
96079    -0.446569
2019     -0.451117
1748     -0.454854
58559    -0.463813
99114    -0.478195
1095     -0.480608
2571