In [1]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
from collections import defaultdict


In [2]:
# Import the data stored in the SQLite database into pandas DataFrames

database_name = 'RecommendationDatabase.db'

# Define SQL queries to select all data from each table
sql_query_ratings = "SELECT * FROM Ratings;"
sql_query_movies = "SELECT * FROM Movies"

# Read both tables. The try/finally guarantees the connection is closed even
# when one of the queries raises (for example, if a table is missing).
conn = sqlite3.connect(database_name)
try:
    reviews = pd.read_sql_query(sql_query_ratings, conn)
    movies = pd.read_sql_query(sql_query_movies, conn)
finally:
    conn.close()

In [3]:
# Preview the first five rows of the ratings table
reviews.head(n=5)

Unnamed: 0,user_id,movie_id,ratings,Time Stamp,Date
0,1,1074638,7,1365029107,2013-04-03 18:45:07
1,1,1853728,8,1366576639,2013-04-21 16:37:19
2,2,104257,8,1364690142,2013-03-30 20:35:42
3,2,1259521,8,1364118447,2013-03-24 05:47:27
4,2,1991245,7,1364117717,2013-03-24 05:35:17


In [5]:
# Build the user-by-item matrix: one row per user, one column per movie.
# If a user rated the same movie more than once, the highest rating is kept.
rating_columns = ['user_id', 'movie_id', 'ratings', 'Time Stamp']
user_items = reviews[rating_columns]
max_rating_per_pair = user_items.groupby(['user_id', 'movie_id'])['ratings'].max()
user_by_movie = max_rating_per_pair.unstack()

# Create a small, fully observed data subset (using dummy values):
# six users (rows) by three movies (columns)
user_movie_subset = [
    [7, 10, 8],
    [6, 10, 7],
    [8, 9, 8],
    [8, 10, 10],
    [9, 9, 9],
    [8, 9, 9],
]
ratings_mat = np.asarray(user_movie_subset, dtype=float)
print(ratings_mat)

[[ 7. 10.  8.]
 [ 6. 10.  7.]
 [ 8.  9.  8.]
 [ 8. 10. 10.]
 [ 9.  9.  9.]
 [ 8.  9.  9.]]


In [6]:
def FunkSVD(ratings_mat, latent_features=4, learning_rate=0.0001, iters=100, seed=None):
    '''
    Factorize a ratings matrix into user and movie latent-feature matrices
    using stochastic gradient descent over the observed ratings only.

    INPUT:
    ratings_mat - (numpy array or matrix) users as rows, movies as columns;
                  entries that are NaN or not greater than 0 are treated as missing
    latent_features - (int) number of latent features to learn
    learning_rate - (float) step size of each gradient update
    iters - (int) number of passes over the observed ratings
    seed - (int or None) optional seed for the random initialization; when None
           the global numpy random state is used, as before

    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by movie matrix
    '''

    # Work on a plain float ndarray so np.matrix / integer inputs behave the same
    ratings = np.asarray(ratings_mat, dtype=float)
    n_users, n_movies = ratings.shape

    # A rating "exists" when it is greater than 0 (NaN compares False).
    with np.errstate(invalid='ignore'):
        observed = ratings > 0

    # The MSE denominator uses the same mask as the training loop, so the
    # reported error is the mean over exactly the ratings that were summed.
    num_ratings = int(np.count_nonzero(observed))

    # Locate the observed cells once (row-major order) instead of scanning
    # every user-movie cell on every iteration.
    rated_pairs = np.argwhere(observed)

    # initialize the user and movie matrices with random values
    rng = np.random if seed is None else np.random.RandomState(seed)
    user_mat = rng.rand(n_users, latent_features)
    movie_mat = rng.rand(latent_features, n_movies)

    # keep track of iteration and MSE
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):

        # reset the sum of squared errors for this pass
        sse_accum = 0.0

        # For each user-movie pair that has a rating
        for i, j in rated_pairs:

            # compute the error as the actual minus the dot product of the user and movie latent features
            diff = ratings[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

            # Keep track of the sum of squared errors for the matrix
            sse_accum += diff ** 2

            # Update both factor vectors in the direction of the gradient.
            # The user's features are copied first so the movie update uses the
            # pre-update values (a simultaneous gradient step).
            step = 2 * learning_rate * diff
            previous_user_features = user_mat[i, :].copy()
            user_mat[i, :] += step * movie_mat[:, j]
            movie_mat[:, j] += step * previous_user_features

        # print results; with no observed ratings there is no error to average
        mse = sse_accum / num_ratings if num_ratings else float('nan')
        print("%d \t\t %f" % (iteration + 1, mse))

    return user_mat, movie_mat

In [37]:
# Short run (10 passes) on the dummy matrix to watch the error start to fall
user_mat, movie_mat = FunkSVD(
    ratings_mat,
    latent_features=3,
    learning_rate=0.005,
    iters=10,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 58.316298
2 		 45.423023
3 		 29.489402
4 		 15.032580
5 		 6.056105
6 		 2.221722
7 		 0.992489
8 		 0.649763
9 		 0.550397
10 		 0.514935


In [38]:
# Compare the reconstructed ratings (user factors x movie factors) with the actual ratings
print('These are predicted values:')
print(user_mat @ movie_mat)
print('\n\n These are actual values')
print(ratings_mat)

These are predicted values:
[[7.5445277  9.22418298 8.44648678]
 [6.90341795 8.56793836 7.61312287]
 [7.33378427 9.23219445 8.3538405 ]
 [8.7441918  9.93064899 9.17385406]
 [8.2918516  9.71022864 8.9640128 ]
 [7.9534719  9.33773318 8.51376282]]


 These are actual values
[[ 7. 10.  8.]
 [ 6. 10.  7.]
 [ 8.  9.  8.]
 [ 8. 10. 10.]
 [ 9.  9.  9.]
 [ 8.  9.  9.]]


In [39]:
# Longer run (300 passes) on the same dummy matrix
user_mat, movie_mat = FunkSVD(
    ratings_mat,
    latent_features=3,
    learning_rate=0.005,
    iters=300,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 59.862887
2 		 47.176607
3 		 31.232755
4 		 16.237214
5 		 6.560222
6 		 2.319897
7 		 0.970421
8 		 0.619514
9 		 0.533525
10 		 0.509526
11 		 0.499890
12 		 0.493925
13 		 0.489074
14 		 0.484611
15 		 0.480278
16 		 0.475960
17 		 0.471604
18 		 0.467180
19 		 0.462668
20 		 0.458057
21 		 0.453340
22 		 0.448508
23 		 0.443557
24 		 0.438481
25 		 0.433277
26 		 0.427942
27 		 0.422473
28 		 0.416867
29 		 0.411124
30 		 0.405243
31 		 0.399223
32 		 0.393066
33 		 0.386773
34 		 0.380347
35 		 0.373791
36 		 0.367109
37 		 0.360306
38 		 0.353388
39 		 0.346362
40 		 0.339236
41 		 0.332018
42 		 0.324718
43 		 0.317345
44 		 0.309912
45 		 0.302431
46 		 0.294912
47 		 0.287370
48 		 0.279819
49 		 0.272271
50 		 0.264742
51 		 0.257245
52 		 0.249796
53 		 0.242409
54 		 0.235098
55 		 0.227878
56 		 0.220762
57 		 0.213763
58 		 0.206894
59 		 0.200167
60 		 0.193592
61 		 0.187180
62 		 0.180941
63 		 0.174881
64 

In [40]:
# Compare the reconstructed ratings (user factors x movie factors) with the actual ratings
print('These are predicted values:')
print(user_mat @ movie_mat)
print('\n\n These are actual values')
print(ratings_mat)

These are predicted values:
[[ 7.04869501 10.01881904  7.94770521]
 [ 5.98224582  9.9925347   7.01883657]
 [ 7.88831367  8.9566865   8.11937999]
 [ 7.99627135 10.0001415  10.00420877]
 [ 8.98034724  8.99281195  9.02192699]
 [ 8.08645186  9.03340262  8.90864539]]


 These are actual values
[[ 7. 10.  8.]
 [ 6. 10.  7.]
 [ 8.  9.  8.]
 [ 8. 10. 10.]
 [ 9.  9.  9.]
 [ 8.  9.  9.]]


### Intentionally replacing one rating with NaN to check whether the model predicts the missing value correctly.

In [42]:
# Hide the first user's rating of the first movie so it becomes a value to predict
ratings_mat[0, 0] = float('nan')
ratings_mat

array([[nan, 10.,  8.],
       [ 6., 10.,  7.],
       [ 8.,  9.,  8.],
       [ 8., 10., 10.],
       [ 9.,  9.,  9.],
       [ 8.,  9.,  9.]])

In [43]:
# Run FunkSVD on the matrix that now contains the missing value
user_mat, movie_mat = FunkSVD(
    ratings_mat,
    latent_features=3,
    learning_rate=0.005,
    iters=450,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 58.225546
2 		 44.614486
3 		 28.407816
4 		 14.216852
5 		 5.651738
6 		 2.057806
7 		 0.922878
8 		 0.619225
9 		 0.540845
10 		 0.518235
11 		 0.509650
12 		 0.504961
13 		 0.501510
14 		 0.498489
15 		 0.495606
16 		 0.492744
17 		 0.489850
18 		 0.486899
19 		 0.483877
20 		 0.480777
21 		 0.477594
22 		 0.474322
23 		 0.470958
24 		 0.467499
25 		 0.463939
26 		 0.460276
27 		 0.456506
28 		 0.452625
29 		 0.448631
30 		 0.444521
31 		 0.440291
32 		 0.435940
33 		 0.431465
34 		 0.426864
35 		 0.422136
36 		 0.417280
37 		 0.412294
38 		 0.407180
39 		 0.401937
40 		 0.396565
41 		 0.391067
42 		 0.385444
43 		 0.379698
44 		 0.373832
45 		 0.367850
46 		 0.361757
47 		 0.355557
48 		 0.349256
49 		 0.342859
50 		 0.336375
51 		 0.329810
52 		 0.323173
53 		 0.316471
54 		 0.309715
55 		 0.302915
56 		 0.296079
57 		 0.289220
58 		 0.282348
59 		 0.275474
60 		 0.268610
61 		 0.261768
62 		 0.254958
63 		 0.248194
64 

In [46]:
# Show the reconstruction; entry [0, 0] is the rating that was hidden (true value: 7)
predicted_ratings = np.dot(user_mat, movie_mat)
print('These are predicted values:')
print(predicted_ratings)

# Report the value this run actually produced: FunkSVD starts from a random
# initialization, so a hard-coded number goes stale on every re-run.
print('\nWe know that the actual value is 7; the model predicts {:.2f}.'.format(predicted_ratings[0, 0]))


These are predicted values:
[[ 7.2790984  10.00098605  7.99896173]
 [ 6.00149476 10.00020175  6.99877417]
 [ 7.99319613  8.99726474  8.00729999]
 [ 7.99410936  9.99779305 10.00628666]
 [ 8.99727683  8.99895249  9.00307115]
 [ 8.01233466  9.00457647  8.98729806]]

We know that the actual value is 7 and the predicted is close to it, 7.27.


#### Now, instead of using the dummy values created above, I'm using a subset of the actual dataset (the first 1000 users).

In [47]:
# Set up a matrix of the first 1000 users with their movie ratings.
# A plain ndarray is used because NumPy no longer recommends np.matrix.
first_1000_users = np.array(user_by_movie.head(1000))

# Perform FunkSVD on the matrix of the first 1000 users
user_mat, movie_mat = FunkSVD(first_1000_users, 
                              latent_features=3, 
                              learning_rate=0.005, 
                              iters=500)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 36.759858
2 		 22.863059
3 		 16.410935
4 		 12.577720
5 		 9.966996
6 		 8.071030
7 		 6.642995
8 		 5.538883
9 		 4.667983
10 		 3.970121
11 		 3.404012
12 		 2.940505
13 		 2.558333
14 		 2.241486
15 		 1.977617
16 		 1.757040
17 		 1.572063
18 		 1.416502
19 		 1.285335
20 		 1.174433
21 		 1.080376
22 		 1.000317
23 		 0.931878
24 		 0.873080
25 		 0.822279
26 		 0.778117
27 		 0.739478
28 		 0.705447
29 		 0.675276
30 		 0.648355
31 		 0.624183
32 		 0.602350
33 		 0.582517
34 		 0.564402
35 		 0.547772
36 		 0.532432
37 		 0.518218
38 		 0.504993
39 		 0.492642
40 		 0.481070
41 		 0.470193
42 		 0.459942
43 		 0.450258
44 		 0.441090
45 		 0.432392
46 		 0.424126
47 		 0.416255
48 		 0.408749
49 		 0.401579
50 		 0.394720
51 		 0.388148
52 		 0.381843
53 		 0.375786
54 		 0.369960
55 		 0.364351
56 		 0.358944
57 		 0.353729
58 		 0.348694
59 		 0.343830
60 		 0.339128
61 		 0.334581
62 		 0.330181
63 		 0.325922
64 

In [9]:
# Count the observed (non-NaN) ratings in first_1000_users
observed_mask = ~np.isnan(first_1000_users)
num_ratings = int(observed_mask.sum())
print("The number of actual ratings in the first_1000_users is {}.".format(num_ratings))
print()

# Every remaining user-movie cell had no rating, so FunkSVD supplies a prediction for it
total_cells = first_1000_users.shape[0] * first_1000_users.shape[1]
ratings_for_missing = total_cells - num_ratings
print("The number of ratings made for user-movie pairs that didn't have ratings is {}".format(ratings_for_missing))

The number of actual ratings in the first_1000_users is 4503.

The number of ratings made for user-movie pairs that didn't have ratings is 7500497


#### Now, creating training and validation datasets to find out the performance of the model.

In [10]:
def create_train_test(reviews, order_by, training_size, testing_size):
    '''
    Split the reviews into consecutive training and validation sets after sorting.

    INPUT:
    reviews - (pandas DataFrame) the rows to split
    order_by - (str) column to sort by before splitting
    training_size - (int) number of leading rows placed in the training set
    testing_size - (int) number of rows, right after the training rows,
                   placed in the validation set

    OUTPUT:
    training_df - (pandas DataFrame) the first training_size sorted rows
    validation_df - (pandas DataFrame) the next testing_size sorted rows
    '''
    ordered = reviews.sort_values(order_by)

    # Validation rows start exactly where the training rows end
    validation_end = training_size + testing_size
    validation_rows = ordered.iloc[training_size:validation_end]
    training_rows = ordered.head(training_size)

    return training_rows, validation_rows

# Sort by date: the first 8000 reviews train the model, the next 2000 validate it
train_df, val_df = create_train_test(
    reviews,
    order_by='Date',
    training_size=8000,
    testing_size=2000,
)

print(train_df.shape)

(8000, 5)


In [11]:
# Build the user-item matrix of the training dataset
# (highest rating per user-movie pair, users as rows, movies as columns)

train_user_item = train_df[['user_id', 'movie_id', 'ratings', 'Time Stamp']]
train_ratings_by_pair = train_user_item.groupby(['user_id', 'movie_id'])['ratings'].max()
train_data_df = train_ratings_by_pair.unstack()
train_data_np = np.array(train_data_df)



In [28]:
# Fit FunkSVD to the training data with the chosen hyperparameters
user_mat, movie_mat = FunkSVD(
    train_data_np,
    latent_features=15,
    learning_rate=0.005,
    iters=300,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 10.686129
2 		 5.988577
3 		 4.173013
4 		 3.117209
5 		 2.425768
6 		 1.938376
7 		 1.577909
8 		 1.302431
9 		 1.086813
10 		 0.914914
11 		 0.775854
12 		 0.662038
13 		 0.568004
14 		 0.489714
15 		 0.424105
16 		 0.368797
17 		 0.321917
18 		 0.281970
19 		 0.247761
20 		 0.218330
21 		 0.192902
22 		 0.170849
23 		 0.151659
24 		 0.134910
25 		 0.120253
26 		 0.107395
27 		 0.096091
28 		 0.086132
29 		 0.077341
30 		 0.069567
31 		 0.062680
32 		 0.056567
33 		 0.051133
34 		 0.046294
35 		 0.041976
36 		 0.038119
37 		 0.034665
38 		 0.031570
39 		 0.028790
40 		 0.026289
41 		 0.024036
42 		 0.022004
43 		 0.020168
44 		 0.018506
45 		 0.017000
46 		 0.015634
47 		 0.014392
48 		 0.013263
49 		 0.012234
50 		 0.011295
51 		 0.010438
52 		 0.009655
53 		 0.008938
54 		 0.008281
55 		 0.007678
56 		 0.007125
57 		 0.006616
58 		 0.006149
59 		 0.005718
60 		 0.005321
61 		 0.004955
62 		 0.004617
63 		 0.004305
64 		 

In [29]:
# This function looks up the FunkSVD prediction for a given user ID and movie ID

def predict_rating(user_matrix, movie_matrix, user_id, movie_id):
    '''
    Predict one rating as the dot product of a user's and a movie's latent features.

    INPUT:
    user_matrix - user by latent feature matrix; rows are assumed to follow
                  the order of train_data_df.index
    movie_matrix - latent feature by movie matrix; columns are assumed to
                   follow the order of train_data_df.columns
    user_id - the user to predict for
    movie_id - the movie to predict for

    OUTPUT:
    pred - the predicted rating

    Raises IndexError when user_id or movie_id does not appear in train_data_df.
    '''
    # IDs in the same order as the rows / columns of the training matrix
    known_users = np.array(train_data_df.index)
    known_movies = np.array(train_data_df.columns)

    # Position of the first match; indexing an empty result raises IndexError
    row = np.flatnonzero(known_users == user_id)[0]
    col = np.flatnonzero(known_movies == movie_id)[0]

    # The prediction is that user's row of U dotted with that movie's column of V
    return np.dot(user_matrix[row, :], movie_matrix[:, col])



### Finding the FunkSVD prediction: how would user 2625 rate the movie 169547?

In [32]:
# Print the results nicely
def print_prediction_summary(user_id, movie_id, prediction):
    '''
    Print a one-line summary of a predicted rating, using the movie's title.

    INPUT:
    user_id - the user the prediction was made for
    movie_id - the movie the prediction was made for; looked up in the
               'movie_id' column of the movies DataFrame
    prediction - (float) the predicted rating, printed rounded to 2 decimals

    OUTPUT:
    None - the summary is printed
    '''
    # Read the title value directly instead of slicing the text representation
    # of a Series, which depends on the index width and truncates long titles.
    matching_titles = movies.loc[movies['movie_id'] == movie_id, 'movie']

    if matching_titles.empty:
        # No title on record for this ID: fall back to the ID itself
        movie_name = str(movie_id)
    else:
        movie_name = str(matching_titles.iloc[0])

    print("For user {} we predict a {} rating for the movie {}.".format(user_id, round(prediction, 2), movie_name))


In [38]:
def validation_comparison(val_df, num_preds):
    '''
    Print actual versus predicted ratings for the leading rows of the validation set.

    INPUT:
    val_df - (pandas DataFrame) validation rows with 'user_id', 'movie_id'
             and 'ratings' columns
    num_preds - (int) number of leading validation rows to attempt

    OUTPUT:
    None - one message is printed per row that could be predicted; rows that
           raise IndexError are skipped silently
    '''
    users = np.array(val_df['user_id'])
    items = np.array(val_df['movie_id'])
    actuals = np.array(val_df['ratings'])

    message = "The actual rating for user {} on movie {} is {}.\n While the predicted rating is {}."

    for idx in range(num_preds):
        try:
            # Uses the factor matrices from the most recent FunkSVD fit
            pred = predict_rating(user_mat, movie_mat, users[idx], items[idx])
            print(message.format(users[idx], items[idx], actuals[idx], round(pred)))
        except IndexError:
            # No prediction is available for this row; move on to the next one
            continue
       
# Compare predicted vs. actual ratings for the first 8 validation rows.  How does it look?
validation_comparison(val_df, num_preds=8)

The actual rating for user 7876 on movie 1245112 is 7.
 While the predicted rating is 8.
The actual rating for user 7876 on movie 1649444 is 4.
 While the predicted rating is 5.
The actual rating for user 5544 on movie 2023587 is 6.
 While the predicted rating is 7.
The actual rating for user 3326 on movie 1351685 is 10.
 While the predicted rating is 7.
The actual rating for user 3260 on movie 1602620 is 6.
 While the predicted rating is 9.


*The function prints only five of the eight requested predictions because the remaining user-movie pairs involve a user or a movie that does not appear in the training matrix. The lookup for such a pair fails, so the function skips it instead of making a prediction.*