# SGD

In [1]:
import os
import warnings
import numpy as np
import numpy.linalg as npla
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
%matplotlib inline

In [41]:
class mfsgd(object):
    """Time-aware matrix factorisation for ratings, trained with plain SGD.

    The rating of item ``i`` by user ``u`` at time ``t`` is modelled as::

        mu + b_i + b_{i,bin(t)} + b_u + a_u*dev + q_i . (p_u + pa_u*dev)

    where ``bin(t)`` is the item time bin of ``t`` and ``dev`` is the signed,
    power-compressed distance between the user time bin of ``t`` and the
    user's mean training time bin (see ``__dev``).

    The input CSV must provide the columns ``userId``, ``movieId``,
    ``rating`` and ``timestamp``.
    """

    def __init__(self, filename, n=10, penalty=0.5, learning_rate=0.01, stopping_criteria=0.01):
        """
        param filename: path of the ratings CSV, read with ``preprocess``
        param n: number of latent factors
        param penalty: regularisation weight applied to every parameter except mu
        param learning_rate: minimum 1e-6
        param stopping_criteria: stored as ``self.sc``; not used by the training loop
        """
        self.data = mfsgd.preprocess(filename)
        self.lr = max(learning_rate, 1e-6)
        self.origlr = self.lr
        # Defaults amount to "no decay": multiply by 1 every 1e6 epochs.
        self.decrement = 1
        self.nepoch = 1e6
        self.sc = stopping_criteria
        self.n = n
        self.penalty = penalty
        self.train_size = None
        self.validation_size = 0
        # Sorted ids -> column positions. For ids that are exactly 1..n_users
        # this is the same layout as indexing with ``userId - 1``, but it also
        # works when ids are sparse or do not start at 1.
        unique_users = np.sort(self.data.loc[:, 'userId'].unique()).tolist()
        self.n_users = len(unique_users)
        self.user_mapping = {uid: pos for pos, uid in enumerate(unique_users)}
        # Items are numbered in order of first appearance in the file.
        unique_items = self.data.loc[:, 'movieId'].unique()
        self.n_items = len(unique_items)
        self.item_mapping = dict(zip(unique_items, range(len(unique_items))))

    def setLearningRateSchedule(self, start=0.01, decrement=0.1, nepoch=100):
        """
        param start: starting learning rate
        param decrement: multiplier to the learning rate per nepoch epochs
        param nepoch: number of epochs between two decrements
        return: self, so calls can be chained
        """
        self.lr = start
        self.origlr = start
        self.decrement = decrement
        self.nepoch = nepoch
        return self

    def fit(self, train_size=0.7, user_nbins=10, item_nbins=3, beta=0.4, n_init=1, n_iter=50):
        """
        Train on the first ``train_size`` fraction of every user's rows and keep
        the parameters of the restart with the lowest training loss.

        param train_size: fraction (<= 1) of each user's rows used for training
        param user_nbins: number of time bins used for the user deviation term
        param item_nbins: number of time bins used for the item bias
        param beta: exponent of the time deviation
        param n_init: number of random restarts
        param n_iter: number of epochs per restart
        return: self
        raises Exception: if train_size > 1 or the training split is empty
        """
        if train_size > 1:
            raise Exception('train_size cannot exceed 1')
        self.r = self.__headPerUser(self.data, train_size)
        if len(self.r) == 0:
            raise Exception('train_size leaves no training data')
        self.train_size = train_size
        self.beta = beta
        timestamps = self.r.loc[:, 'timestamp']
        # Half-open window, so the latest training timestamp falls in the last bin.
        self.time_window = (timestamps.min(), timestamps.max()+1)
        # FIXME, user_nbins can be a dictionary for each user
        self.user_nbins = user_nbins
        self.user_binsize = self.__binify(self.time_window, self.user_nbins)
        mean_time = self.r.groupby('userId')['timestamp'].mean()
        self.avg_user_bin = {k: self.__timestampToBin(v, self.user_binsize) for k, v in mean_time.items()}
        self.item_nbins = item_nbins
        self.item_binsize = self.__binify(self.time_window, self.item_nbins)

        self.train_loss = np.nan
        for _ in range(n_init):
            # Every restart begins from the configured rate, not from the rate
            # the previous restart decayed to.
            self.__resetLR()
            result = self.__trainEach(n_iter)
            if np.isnan(self.train_loss) or result['loss'] < self.train_loss:
                self.mu = result['mu']
                self.q = result['q']
                self.p_user = result['p_user']
                self.pa_user = result['pa_user']
                self.b_user = result['b_user']
                self.a_user = result['a_user']
                self.b_item = result['b_item']
                self.b_item_bin = result['b_item_bin']
                self.train_loss = result['loss']
        self.__resetLR()
        return self

    def validate(self, validation_size=0.1):
        """
        Score the model on rows that follow each user's training rows.

        param validation_size: fraction of each user's rows used for validation;
                               truncated (with a warning) to 1 - train_size
        return: array of predicted ratings, or None when nothing can be validated
        raises Exception: if fit has not been called
        """
        if self.train_size is None:
            raise Exception('model is not trained')
        if self.train_size == 1:
            warnings.warn('train_size = 1, no data can be used to validate')
            return
        if validation_size + self.train_size > 1:
            # The value is formatted into the message: the second positional
            # argument of warnings.warn is the warning category, not a value.
            warnings.warn('validation_size + train_size cannot exceed 1, truncating validation_size to {}'.format(1-self.train_size))
            validation_size = 1 - self.train_size
        if validation_size == 0:
            warnings.warn('validation_size = 0')
            return

        self.validation_size = validation_size
        # Fraction of the rows left over after training that go to validation.
        pct = self.validation_size / (1 - self.train_size)
        self.validation = self.__headPerUser(self.data.drop(self.r.index), pct)
        rmse, r_pred = self.__computeLoss(dataset='validation')
        print('validation rmse:', rmse)
        return r_pred

    def predict(self):
        """
        Score the model on every row used neither for training nor validation.

        return: array of predicted ratings, or None when no rows are left
        raises Exception: if fit has not been called
        """
        if self.train_size is None:
            raise Exception('model is not trained')
        if self.train_size + self.validation_size >= 1:
            warnings.warn('no data can be used to test')
            return
        self.test = self.data.drop(self.r.index) if self.validation_size == 0 else self.data.drop(self.r.index.union(self.validation.index))
        rmse, r_pred = self.__computeLoss(dataset='test')
        print('test rmse:', rmse)
        return r_pred

    def __trainEach(self, n_iter):
        """
        Run one randomly initialised SGD training of n_iter epochs.

        return: dict with the final epoch's unpenalized RMSE under 'loss'
                (nan when n_iter is 0) and the learned parameters
        """
        # mu stays a 1-element array (its stored shape); the arithmetic below
        # reads mu[0] so residuals, losses and predictions are plain scalars.
        mu = np.random.uniform(-1, 1, 1)
        q = np.random.uniform(-1, 1, (self.n, self.n_items))
        p_user = np.random.uniform(-1, 1, (self.n, self.n_users))
        pa_user = np.random.uniform(-1, 1, (self.n, self.n_users))
        b_user = np.random.uniform(-1, 1, self.n_users)
        a_user = np.random.uniform(-1, 1, self.n_users)
        b_item = np.random.uniform(-1, 1, self.n_items)
        b_item_bin = np.random.uniform(-1, 1, (self.item_nbins, self.n_items))

        # FIXME, add Ru**(-1/2), y

        loss = np.nan
        for it in range(n_iter):
            sq_err = 0.0
            for uid, u, i, r, t in self.__rows(self.r):
                # pu, pua and qi are views into the parameter matrices, so each
                # in-place update below is visible to the updates after it.
                pu, pua, qi = p_user[:, u], pa_user[:, u], q[:, i]
                i_bin = self.__timestampToBin(t, self.item_binsize, self.item_nbins)
                bi, bibin = b_item[i], b_item_bin[i_bin, i]
                bu, au = b_user[u], a_user[u]
                dev = self.__dev(self.__timestampToBin(t, self.user_binsize), self.avg_user_bin[uid], self.beta)
                r_hat = mu[0]+bi+bibin+bu+au*dev+qi@(pu+pua*dev)
                res = r - r_hat
                # update based on gradient
                mu -= self.lr * self.__muDeriv(res)
                q[:, i] -= self.lr * self.__qDeriv(res, pu, pua, qi, dev)
                p_user[:, u] -= self.lr * self.__puDeriv(res, pu, qi)
                pa_user[:, u] -= self.lr * self.__puaDeriv(res, pua, qi, dev)
                b_user[u] -= self.lr * self.__buDeriv(res, bu)
                a_user[u] -= self.lr * self.__auDeriv(res, au, dev)
                b_item[i] -= self.lr * self.__biDeriv(res, bi)
                b_item_bin[i_bin, i] -= self.lr * self.__bibinDeriv(res, bibin)

                sq_err += res**2
            # update learning rate every nepoch epochs, never below 1e-6
            if not (it+1) % self.nepoch:
                self.lr = max(self.lr * self.decrement, 1e-6)

            # use the RMSE of the residuals seen during the epoch as loss;
            # recomputing the loss over the data again would double the cost
            loss = np.sqrt(sq_err / len(self.r))

            print('epoch', it+1, '----learning rate: {:.6f}'.format(self.lr), '----unpenalized training loss:', loss)

        return {'loss':loss,
                'mu':mu,
                'q':q,
                'p_user':p_user,
                'pa_user':pa_user,
                'b_user':b_user,
                'a_user':a_user,
                'b_item':b_item,
                'b_item_bin':b_item_bin}

    def __computeLoss(self, dataset='train', **kwargs):
        """
        Evaluate the model on one of the splits.

        param dataset: 'train' (parameters are taken from kwargs and the penalty
                       is added to the loss), 'validation' or 'test' (fitted
                       parameters are used and predictions are collected)
        return: (sqrt(loss / number of rows), predictions or None); the first
                element is nan for an empty split
        raises Exception: for any other dataset name
        """
        r_pred = None
        if dataset == 'train':
            data = self.r
            names = ('mu', 'q', 'p_user', 'pa_user', 'b_user', 'a_user', 'b_item', 'b_item_bin')
            mu, q, p_user, pa_user, b_user, a_user, b_item, b_item_bin = (kwargs[name] for name in names)
        elif dataset in ['test', 'validation']:
            data = self.test if dataset == 'test' else self.validation
            r_pred = np.zeros(len(data))
            mu, q, p_user, pa_user, b_user, a_user, b_item, b_item_bin = self.mu, self.q, self.p_user, self.pa_user, self.b_user, self.a_user, self.b_item, self.b_item_bin
        else:
            raise Exception('ambiguous compute loss inputs')

        if len(data) == 0:
            # Nothing to average over; avoid dividing by zero.
            return np.nan, r_pred

        # Accept mu as a scalar or as the 1-element array produced by training.
        mu0 = np.asarray(mu).reshape(-1)[0]
        loss = 0
        for ind, (uid, u, i, r, t) in enumerate(self.__rows(data)):
            pu, pua, qi = p_user[:, u], pa_user[:, u], q[:, i]
            bi, bibin = b_item[i], b_item_bin[self.__timestampToBin(t, self.item_binsize, self.item_nbins), i]
            bu, au = b_user[u], a_user[u]
            dev = self.__dev(self.__timestampToBin(t, self.user_binsize), self.avg_user_bin[uid], self.beta)
            r_hat = mu0+bi+bibin+bu+au*dev+qi@(pu+pua*dev)
            res = (r-r_hat)**2
            if dataset == 'train':
                loss += res + self.penalty*(bi**2+bibin**2+bu**2+au**2+npla.norm(pu)**2+npla.norm(pua)**2+npla.norm(qi)**2)
            else:
                loss += res
                r_pred[ind] = r_hat

        return np.sqrt(loss / len(data)), r_pred

    def __rows(self, frame):
        """
        Yield (user id, user column, item column, rating, timestamp) for every
        row of frame, in frame order.
        """
        columns = zip(frame['userId'], frame['movieId'], frame['rating'], frame['timestamp'])
        for uid, mid, rating, t in columns:
            uid = int(uid)
            yield uid, self.user_mapping[uid], self.item_mapping[int(mid)], rating, t

    @staticmethod
    def __headPerUser(frame, fraction):
        """
        Return the first int(len * min(1, fraction)) rows of every user.

        Rows keep their original index labels and are ordered by userId, then by
        their original order. A position mask is used instead of groupby.apply
        so the 'userId' column is always retained in the result.
        """
        fraction = min(1, fraction)
        grouped = frame.groupby('userId')
        position = grouped.cumcount()
        quota = (grouped['userId'].transform('count') * fraction).astype(int)
        # mergesort is stable, so rows of one user stay in their original order.
        return frame[position < quota].sort_values('userId', kind='mergesort')

    # Gradients of the penalised squared error; res is (rating - prediction).
    # FIXME, update qDeriv
    def __muDeriv(self, res):
        """Gradient with respect to the global mean (not penalised)."""
        return -res

    def __qDeriv(self, res, pu, pua, qi, dev):
        """Gradient with respect to the item factors qi."""
        return -res * (pu+pua*dev) + self.penalty * qi

    def __puDeriv(self, res, pu, qi):
        """Gradient with respect to the static user factors pu."""
        return -res * qi + self.penalty * pu

    def __puaDeriv(self, res, pua, qi, dev):
        """Gradient with respect to the time-scaled user factors pua."""
        return -res * qi * dev + self.penalty * pua

    def __buDeriv(self, res, bu):
        """Gradient with respect to the user bias bu."""
        return -res + self.penalty * bu

    def __auDeriv(self, res, au, dev):
        """Gradient with respect to the user time-drift coefficient au."""
        return -res * dev + self.penalty * au

    def __biDeriv(self, res, bi):
        """Gradient with respect to the item bias bi."""
        return -res + self.penalty * bi

    def __bibinDeriv(self, res, bibin):
        """Gradient with respect to the per-time-bin item bias bibin."""
        return -res + self.penalty * bibin

    # FIXME, add y and R(u)^(-1/2)
    def __dev(self, t, avg, b):
        """Signed distance between bins t and avg, with magnitude raised to b."""
        return np.sign(t-avg) * np.abs(t-avg)**b

    def __binify(self, window, nbins):
        """Width of one bin when window = (start, end) is cut into nbins bins."""
        return (window[1] - window[0]) / nbins

    def __timestampToBin(self, t, binsize, nbins=None):
        """
        Index of the bin of width binsize, counted from the start of the
        training time window, that contains t.

        param nbins: when given, the index is clamped to [0, nbins - 1] so that
                     timestamps outside the training window can still be used
                     to index a table with nbins rows
        """
        index = int((t - self.time_window[0]) // binsize)
        if nbins is not None:
            index = min(max(index, 0), nbins - 1)
        return index

    def __resetLR(self):
        """Restore the learning rate to its configured starting value."""
        self.lr = self.origlr
        return

    @staticmethod
    def preprocess(filename):
        """Read the ratings CSV at filename into a DataFrame."""
        data = pd.read_csv(filename)
        return data

In [24]:
# Location of the MovieLens ratings file. Raw strings keep the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
f = os.path.join(r'G:\mawenwen\Columbia\Fall 2019\Applied Data Science\proj4', r'fall2019-project4-sec1-grp4-master\data\ml-latest-small', 'ratings.csv')

In [42]:
# Build a model with 20 latent factors and a regularisation weight of 1.5.
s = mfsgd(filename=f, n=20, penalty=1.5) # learning rate should not be > 0.1 as it results in overflow in loss calculation
# Start at a learning rate of 0.05 and multiply it by 0.1 every 12 epochs.
s.setLearningRateSchedule(start=0.05, decrement=0.1, nepoch=12)

<__main__.mfsgd at 0x24d13e02358>

In [43]:
# Train for 30 epochs on the first 80% of each user's rows, using 8 user time
# bins, 3 item time bins and 0.3 as the exponent of the time deviation.
s.fit(train_size=0.8, user_nbins=8, item_nbins=3, beta=0.3, n_iter=30)

epoch 1 ----learning rate: 0.050000 ----unpenalized training loss: [1.01370978]
epoch 2 ----learning rate: 0.050000 ----unpenalized training loss: [0.89587431]
epoch 3 ----learning rate: 0.050000 ----unpenalized training loss: [0.87254245]
epoch 4 ----learning rate: 0.050000 ----unpenalized training loss: [0.86095344]
epoch 5 ----learning rate: 0.050000 ----unpenalized training loss: [0.85403175]
epoch 6 ----learning rate: 0.050000 ----unpenalized training loss: [0.84950416]
epoch 7 ----learning rate: 0.050000 ----unpenalized training loss: [0.8463757]
epoch 8 ----learning rate: 0.050000 ----unpenalized training loss: [0.84413241]
epoch 9 ----learning rate: 0.050000 ----unpenalized training loss: [0.84247949]
epoch 10 ----learning rate: 0.050000 ----unpenalized training loss: [0.84123547]
epoch 11 ----learning rate: 0.050000 ----unpenalized training loss: [0.84028288]
epoch 12 ----learning rate: 0.005000 ----unpenalized training loss: [0.83954274]
epoch 13 ----learning rate: 0.005000 -

<__main__.mfsgd at 0x24d13e02358>

In [44]:
# Score rows held out from training (10% of each user's rows); prints the RMSE.
r_validate = s.validate(validation_size=0.1) # return predicted ratings

validation rmse: [0.98889374]


In [45]:
# Score every row used neither for training nor validation; prints the RMSE.
r_test = s.predict() # return predicted ratings

test rmse: [1.05286189]
