In [1]:
import numpy as np
import pandas as pd

In [2]:
import pickle
with open('data.pickle', 'rb') as f:
    base_params = pickle.load(f)
# base_mui, base_bu, base_bi = best_set

In [3]:
#Loading dataset
df_train_full = pd.read_csv("./dataset/train.csv")
df_test = pd.read_csv("./dataset/test.csv")
df_submission = pd.read_csv("./dataset/dummy_submission.csv")

In [4]:
#Splitting into test and train data
np.random.seed(5)
indices = np.random.permutation(df_train_full.shape[0])
training_idx, test_idx = indices[:int(0.85*len(df_train_full))], indices[int(0.85*len(df_train_full)):]
df_train, df_csv = df_train_full.iloc[training_idx,:], df_train_full.iloc[test_idx,:]
#Sorting for better data handling and clarity
df_train=df_train.sort_values(by=['userId','movieId']).reset_index(drop=True)
df_csv=df_csv.sort_values(by=['userId','movieId']).reset_index(drop=True)

In [5]:
#Baseline Prediction


def minibatch(batch_no,df,batch_size):
    '''Generates the batch for mini-batch gradient descent '''
    if (batch_no+1)*batch_size > len(df):
        minibatch_indices = indices[batch_size*(batch_no)::]
    else:        
        minibatch_indices = indices[batch_size*(batch_no):batch_size*(batch_no+1)]
    X_train = df.iloc[minibatch_indices,:]
    X_train = X_train.sort_values(by=['userId','movieId']).reset_index(drop=True)
    return X_train

def loss(parameters,df):
    miu,bu,bi,pu,qi = parameters
    loss = 0
    for i in range(len(df)):
        userid =  df.iloc[i,0]
        movieid = df.iloc[i,1]
        rating = miu+bu[userid]+bi[movieid]+np.dot(pu[userid],qi[movieid])
        if rating>5:
            rating = 5
        if rating<0.5:
            rating =0.5
        loss = loss + (df.iloc[i,2]-(miu+bu[userid]+bi[movieid]+np.dot(pu[userid],qi[movieid])))**2
    loss = loss/len(df)
    return loss    

In [6]:
def Latent_Training(step_size,dim,Epochs,batch_size,df,base_params):
    base_mui, base_bu, base_bi = base_params
    #Initialize values for bu,b'i,miu
    #Since we are given there are 10k users and 10k movie id
    pu = np.ones((10000,dim)) / dim
    qi = np.ones((10000,dim)) / dim
    #Taking miu from baseline
    miu = base_mui
    bu = base_bu
    bi = base_bi
    
    num_batches = df.shape[0]//batch_size + 1
    for epoch in range(Epochs):
        for batch in range(num_batches):
            bui = []
            X = minibatch(batch,df,batch_size)
            userid =  X.iloc[:,0]
            movieid = X.iloc[:,1]
            bui.append((miu+bu[userid]+bi[movieid]))
            bui = np.ravel(np.array(bui).T)
            pu_temp = pu
            qi_temp = qi
            for i in range(len(X)):
                userid =  X.iloc[i,0]
                movieid = X.iloc[i,1]
                prod = np.dot(pu[userid],qi[movieid])
                pu_temp[userid]  += step_size*qi[movieid]*(X.iloc[i,2]-bui[i]-prod)
                qi_temp[movieid] += step_size*pu[userid]*(X.iloc[i,2]-bui[i]-prod)
            pu = pu_temp
            qi = qi_temp
            
            if batch %500 ==0:
#                 print(pu,qi)
                print("Epoch:{} Batch:{} ------ Training Error:{}".format(epoch,batch,loss((miu,bu,bi,pu,qi),X)))
    return pu, qi

In [7]:
np.random.seed(5)
indices = np.random.permutation(df_train.shape[0])

#Hyperparameters
step_size = 0.01
dim = 20
Epochs = 10
batch_size = 1024  

#Regularization parameter tuning
pu, qi = Latent_Training(step_size,dim,Epochs,batch_size,df_train, base_params)
best_pq = (pu,qi)

Epoch:0 Batch:0 ------ Training Error:0.7388040818117134
Epoch:0 Batch:500 ------ Training Error:0.7517068733107805
Epoch:0 Batch:1000 ------ Training Error:0.6785573057103371
Epoch:0 Batch:1500 ------ Training Error:0.6906565619176424
Epoch:0 Batch:2000 ------ Training Error:0.7120948669959121
Epoch:0 Batch:2500 ------ Training Error:0.7168174056862945
Epoch:0 Batch:3000 ------ Training Error:0.6831573671906152
Epoch:0 Batch:3500 ------ Training Error:0.7411393248829192
Epoch:0 Batch:4000 ------ Training Error:0.6393509060116275
Epoch:1 Batch:0 ------ Training Error:0.7370463905934963
Epoch:1 Batch:500 ------ Training Error:0.749567433969079
Epoch:1 Batch:1000 ------ Training Error:0.67611150406611
Epoch:1 Batch:1500 ------ Training Error:0.6877412078322739
Epoch:1 Batch:2000 ------ Training Error:0.7097882761978673
Epoch:1 Batch:2500 ------ Training Error:0.7141817658693204
Epoch:1 Batch:3000 ------ Training Error:0.682735260819086
Epoch:1 Batch:3500 ------ Training Error:0.740733867

In [8]:
import pickle
with open('data_pq.pickle', 'wb') as f:
    pickle.dump(best_pq, f)

In [9]:
base_mui, base_bu, base_bi = base_params
best_params = (base_mui, base_bu, base_bi, pu, qi)
train_loss = loss(best_params,df_train)
val_loss = loss(best_params,df_csv) 
print(train_loss, val_loss)

0.6508727320818251 0.6601834151059541


In [12]:
#Submission
predictions = np.zeros(len(df_test))
for i in range(len(df_test)):
    userid =  df_test.iloc[i,0]
    movieid = df_test.iloc[i,1]
    rating = float("{0:.1f}".format(base_mui+base_bu[userid]+base_bi[movieid]+np.dot(pu[userid],qi[movieid])))

    if rating>5:
        rating = 5
    if rating<0.5:
        rating =0.5
    predictions[i] = rating
df_submission.Prediction = predictions
df_submission.to_csv('./Submission.csv',index=False)

In [10]:
# baseline_train_loss = 0.7006116025819583 
# baseline_val_loss = 0.7062281886121021