In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Single seed reused by the train/test split and as the default seed of the SGD
# routine, so that results are reproducible across runs.
RANDOM_STATE = 13

## Load Data

In [3]:
# Load the ratings table: one row per rating, with columns
# userId, movieId, rating and timestamp.
data_df = pd.read_csv("../data/ml-latest-small/ratings.csv")

In [4]:
# Peek at the first five ratings to check the column layout.
data_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
data_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [6]:
# Column dtypes and non-null counts, to check for missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


## Split Training And Test Set

In [7]:
# Hold out a random 20% of the ratings for testing; every remaining row is training data.
test_df = data_df.sample(frac=0.2, random_state=RANDOM_STATE)
train_df = data_df.drop(index=test_df.index)
print(train_df.shape, test_df.shape)

(80669, 4) (20167, 4)


In [8]:
# First five held-out ratings; the original row labels of data_df are preserved.
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
48260,313,293,4.0,1030555572
80555,509,6888,2.5,1435994604
51069,330,1387,4.0,1285905302
83115,526,308,3.5,1502132664
49813,318,134214,4.0,1451995959


## Matrix Factorization
### Algorithm and Regularization
Here I perform stochastic gradient descent to do matrix factorization. Your algorithm should consider the case where new users and movies are added to the dataset you used for training. In other words, the dimensions of your matrices R, q, and p are dynamic.

* For algorithms, the referenced papers are:

    * A1. [Stochastic Gradient Descent](P1%20Recommender-Systems.pdf) Section: Learning Algorithms-Stochastic Gradient Descent

    * A2. Gradient Descent with Probabilistic Assumptions Section 2

    * A3. Alternating Least Squares Section 3.1
* For regularizations, the referenced papers are:

* Regularization
    * R1. Penalty of Magnitudes Section: a Basic Matrix Factorization Model

    * R2. Bias and Intercepts Section: Adding Biases

    * R3. Temporal Dynamics Section 4
    
> Our group (Sec1 Group4) only needs to implement
> 1. A1+R3+P2
> 2. A1+R3+P3

In [9]:
def single_error(target_row, predicted_df):
    """
    Squared error between one observed rating and its predicted value.

    Meant to be passed to ``DataFrame.apply(..., axis=1)``; it is not intended
    to be called directly.

    :param target_row: row holding 'userId', 'movieId' and 'rating'
    :param predicted_df: predicted ratings, indexed by user id (rows) and
        movie id (columns)

    :return: (rating - prediction) ** 2 for this row
    """
    # The ids are cast to int before being used as row/column labels.
    user_key, movie_key = (int(target_row[column]) for column in ('userId', 'movieId'))
    residual = target_row['rating'] - predicted_df.loc[user_key, movie_key]
    return residual ** 2

def stochasticGD(data, train, test, max_iter, f_dim = 10, learning_rate = 0.001,
                 regularization = 0.3, stopping_threshold = 0.001, random_state = RANDOM_STATE):
    """
    Factorize the user x movie rating matrix with stochastic gradient descent.

    Every user and movie that appears in ``data`` receives an ``f_dim``-dimensional
    latent vector, initialized uniformly in [-1, 1). During each epoch the rows of
    ``train`` are visited once, in a freshly shuffled order, and for every rating
    the movie vector and the user vector are moved along
    ``error * other_vector - regularization * own_vector``.

    :param data: DataFrame with 'userId' and 'movieId' columns; defines which users
        and movies get a latent vector (pass train + test so that every id is covered)
    :param train: DataFrame with 'userId', 'movieId' and 'rating' columns used for
        the SGD updates and the training RMSE
    :param test: DataFrame with the same columns as ``train``, used only for the
        test RMSE
    :param max_iter: number of epochs (full passes over ``train``)
    :param f_dim: number of latent factors per user / movie
    :param learning_rate: step size of each SGD update
    :param regularization: weight of the L2 penalty on the latent vectors
    :param stopping_threshold: a vector is only updated when at least one component
        of its gradient exceeds this value in absolute terms
    :param random_state: seed for numpy's global RNG (initialization and shuffling)

    :return: tuple ``(users_df, movies_df, train_rmse, test_rmse)`` where the two
        DataFrames hold the latent vectors indexed by user id / movie id and the two
        lists hold the RMSE measured after epochs 0, 10, 20, ...
    :raises KeyError: if ``train`` contains a user or movie id missing from ``data``
    """
    users = data['userId'].unique()
    movies = data['movieId'].unique()

    # Seed the global RNG, then draw the user factors first and the movie factors
    # second so that a given seed always yields the same initialization.
    np.random.seed(random_state)
    # initialize to rand(-1, 1)
    user_factors = np.random.rand(len(users), f_dim) * 2 - 1
    movie_factors = np.random.rand(len(movies), f_dim) * 2 - 1

    # Translate ids into row positions of the factor matrices once, up front,
    # instead of doing label-based DataFrame lookups for every single sample.
    user_rows = pd.Index(users).get_indexer(train['userId'])
    movie_rows = pd.Index(movies).get_indexer(train['movieId'])
    if (user_rows < 0).any() or (movie_rows < 0).any():
        raise KeyError("train contains userId/movieId values that are missing from data")
    ratings = np.asarray(train['rating'], dtype=float)

    # record training / test RMSE
    train_rmse = []
    test_rmse = []

    # stochastic gradient descent
    for iteration in range(max_iter):
        # visit the rows of `train` (the argument, not a notebook global) in a
        # new random order on every epoch
        visit_order = np.arange(len(ratings))
        np.random.shuffle(visit_order)

        for position in visit_order:
            user_row = user_rows[position]
            movie_row = movie_rows[position]
            # Work on copies: both gradients must be computed from the vectors
            # as they were *before* this sample's update.
            user = user_factors[user_row].copy()
            movie = movie_factors[movie_row].copy()

            # prediction error for this rating
            e_ui = ratings[position] - np.dot(user, movie)

            # update vectors
            gradient_movie = e_ui * user - regularization * movie
            if np.any(np.abs(gradient_movie) > stopping_threshold):
                movie_factors[movie_row] = movie + learning_rate * gradient_movie

            gradient_user = e_ui * movie - regularization * user
            if np.any(np.abs(gradient_user) > stopping_threshold):
                user_factors[user_row] = user + learning_rate * gradient_user

        # calculating rmse
        if iteration % 10 == 0:
            # The dense prediction matrix (users x movies) is only needed for
            # evaluation, so it is built only on the epochs that are evaluated.
            predicted_df = pd.DataFrame(np.dot(user_factors, movie_factors.T),
                                        columns=movies, index=users)
            train_single_rmse = np.sqrt(np.mean(train.apply(single_error, axis = 1, predicted_df = predicted_df)))
            train_rmse.append(train_single_rmse)
            test_single_rmse = np.sqrt(np.mean(test.apply(single_error, axis = 1, predicted_df = predicted_df)))
            test_rmse.append(test_single_rmse)
            print(f"Epoch: {iteration}")
            print(f"Training RMSE: {train_single_rmse}\t Test RMSE: {test_single_rmse}",)

    # hand the factors back as DataFrames indexed by user id / movie id
    users_df = pd.DataFrame(user_factors, index=users)
    movies_df = pd.DataFrame(movie_factors, index=movies)
    return users_df, movies_df, train_rmse, test_rmse

In [10]:
# Smoke run: a single SGD epoch with the default hyper-parameters.
users_df, movies_df, train_rmse, test_rmse = stochasticGD(
    data=data_df, train=train_df, test=test_df, max_iter=1
)

Epoch: 0
Training RMSE: 3.7232815485207023	 Test RMSE: 3.741565397631318


In [11]:
# Dense matrix of predicted ratings: one row per user id, one column per movie id.
predicted_df = users_df.dot(movies_df.T)
predicted_df.head()

Unnamed: 0,1,3,6,47,50,70,101,110,151,157,...,147662,148166,149011,152372,158721,160341,160527,160836,163937,163981
1,0.420647,0.666458,-0.572329,-0.042071,1.906138,-0.078009,-0.821778,0.009163,0.991631,-1.415281,...,0.308248,-0.630935,-0.480102,-0.53081,0.1706,0.59171,0.668683,-0.499309,1.045802,-1.666364
2,-1.729266,-1.348801,-0.545158,0.065139,-1.013823,0.291876,1.967949,0.256932,0.872188,0.668263,...,0.43366,0.863355,0.924489,0.157198,-1.705613,-1.557026,0.896567,0.002667,-0.349469,0.563209
3,0.436003,-0.308128,-1.622448,0.402658,0.22347,2.055105,1.151883,-1.357072,-0.125645,-1.424658,...,1.716859,-0.362625,0.150071,-1.955613,1.56759,0.824677,0.114928,-0.647488,3.061716,-0.431924
4,0.613158,0.426844,1.082139,0.422054,0.291269,-0.197889,-0.220295,-0.452328,-0.85137,1.195279,...,-0.428659,1.655227,0.803953,-0.991743,-0.393152,0.720676,0.087631,-1.385409,-0.944979,0.282721
5,-0.38732,-0.603773,0.037303,-0.360848,0.412845,-1.84177,0.122034,0.912422,-0.426932,-0.563312,...,-0.522372,-1.220036,0.251329,0.826388,0.566947,-0.454132,-0.331613,0.131828,0.057954,0.318165
