In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, mean_squared_error;
from sklearn.model_selection import train_test_split,ShuffleSplit,GridSearchCV;
from fastFM.mcmc import FMClassification, FMRegression;
from sklearn.preprocessing import OneHotEncoder;
from fastFM import als;
from sklearn.feature_extraction.text import CountVectorizer;
import gc;
import pickle;
import random;
import matplotlib.pyplot as plt;
import time;

Doumentation : http://ibayer.github.io/fastFM/

### Model A: Basic model with just userId and movieId

In [3]:
def encode(df):
    
    '''
    Input : train and test sets
    Output : One hot encoded datasets
    '''
    
    encoder = OneHotEncoder(handle_unknown='ignore').fit(df)
    #trainX = encoder.transform(trainX)
    #testX = encoder.transform(testX)
    
    return encoder

In [107]:
ratings = np.genfromtxt('./ml-1m/ratings.dat',delimiter="::")
ratings =  pd.DataFrame(ratings)

ratings.columns = ['userId','movieId','rating','timestamp']
ratings = ratings.drop('timestamp', axis=1)
y = ratings['rating'].values
X = ratings.drop('rating', axis=1)

#trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.20, random_state=1234)
#trainX, testX = encode(trainX,testX)

encoder = encode(X)

##fixed
#trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.10, random_state=1234)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)

cv = ShuffleSplit(n_splits=3, test_size= int(0.2 * trainX.shape[0]), random_state = 2018)


estimator = als.FMRegression()

params = {
'n_iter' : np.arange(10,100,30),
'rank' :  np.arange(2,12,4),
'l2_reg_w': np.logspace(-6, -1, 3),
'l2_reg_V' : np.logspace(-6, -1, 3)
}

###Gridsearch over parameters
regressor = GridSearchCV(estimator=estimator , cv=cv, param_grid=params)
regressor.fit(trainX, trainY)

###get RMSE
mean_squared_error(regressor.predict(testX),testY)**0.5

0.8771354587448218

### Model B : Adding user data, genre, year of movie

In [114]:
users = pd.read_csv('./ml-1m/users.dat', sep='::', names=['userId', 'gender', 'age', 'occupation', 'zip'], \
                    header=None)
users.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [175]:
gc.collect()

2082

In [146]:
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', names=['movieId', 'title', 'genres'], header=None)
movies.head()
movies['year'] = movies.title.apply(lambda x : x[-5:-1])

sparse_genres = pd.DataFrame(CountVectorizer().fit_transform(movies.genres\
                                .map(lambda x: x.replace('|', ' '))).todense())

movies = pd.concat([movies[['movieId']], sparse_genres], axis=1) 


ratings = np.genfromtxt('./ml-1m/ratings.dat',delimiter="::")
ratings =  pd.DataFrame(ratings)

ratings.columns = ['userId','movieId','rating','timestamp']
ratings = ratings.drop('timestamp', axis=1)

ratings = pd.merge(pd.merge(ratings, users, on='userId'), movies, on='movieId')


y = ratings['rating'].values
X = ratings.drop('rating', axis=1)

for feature in X.columns:
    _,X[feature] = np.unique(X[feature], return_inverse=True)

trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.20, random_state=1234)
encoder = encode(X)

##fixed
#trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.10, random_state=1234)
trainX = encoder.transform(trainX)
testX = encoder.transform(testX)



cv = ShuffleSplit(n_splits=3, test_size= int(0.2 * trainX.shape[0]), random_state = 2018)
estimator2 = als.FMRegression()

params = {
'n_iter' : np.arange(10,100,30),
'rank' :  np.arange(2,12,4),
'l2_reg_w': np.logspace(-6, -1, 3),
'l2_reg_V' : np.logspace(-6, -1, 3)
}

###Gridsearch over parameters
regressor2 = GridSearchCV(estimator=estimator2 , cv=cv, param_grid=params)
regressor2.fit(trainX, trainY)

###get RMSE
mean_squared_error(regressor2.predict(testX),testY)**0.5

0.8783405327095336

In [156]:
pickle.dump(regressor,open("./dump_regressor1","wb"))
pickle.dump(regressor2,open("./dump_regressor2","wb"))

In [158]:
regressor1 = pickle.load(open("./dump_regressor","rb"))

GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=2018, test_size=160033, train_size=None),
       error_score='raise',
       estimator=FMRegression(init_stdev=0.1, l2_reg=0, l2_reg_V=0.1, l2_reg_w=0.1, n_iter=100,
       random_state=123, rank=8),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_iter': array([10, 40, 70]), 'rank': array([ 2,  6, 10]), 'l2_reg_w': array([1.00000e-06, 3.16228e-04, 1.00000e-01]), 'l2_reg_V': array([1.00000e-06, 3.16228e-04, 1.00000e-01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

```
Factorization Machines have been described as state of the art for many recommendation systems. 

Yet, experience has shown these models to suffer from slowtraining and local minima. Use a large(ish) dataset and characterize where FMs are easy to fit and accurate and where they are not.

1. Start with models that have no side information, and are only user and item ratings.
Specifically, subsample datasets from small to large, and subsample users/items
from sparsely-populated to well-populated, and train and test FMs. Where do they
work the best? Where do they fail? Can you set good rules of thumbs for their
training and use?
2. Next use side information about users or items. Answer the same questions as
above.

```

### Regressor 1 : No side information

In [25]:
def load_data_simple(file_):
    
    '''
    Input 
    
        File_ : file name of the ratings file
    
    Output : test - train dfs
    '''

    ratings = np.genfromtxt(file_,delimiter="::")
    ratings =  pd.DataFrame(ratings)
    
    ratings.columns = ['userId','movieId','rating','timestamp']
    ratings = ratings.drop('timestamp', axis=1)
    
    y = ratings['rating'].values
    X = ratings.drop('rating', axis=1)
    
    encoder = encode(X)
    

    
    ##fixed
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.10, random_state=1234)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    
    return (X.userId.nunique(),X.movieId.nunique()),trainX, testX, trainY, testY

#### Subsample data from small to large

In [None]:
#can grid search do this directly?

#train vs test error as data increase? vs a benchmark model, with and without early stopping vs SVD
#time taken to run the model
#Will try all k, epoch hyperspace together 

##FM without early stopping ?



# For the best iteration - check for local optima

#local optima - by sgd vs gd and changing random state?






In [15]:
def subsample_user_item(fname, sample_by_="All", model_type = "simple"):
    
    '''
    
    Input : trainX, testX, trainY, testY : dfs from the load_Data_Simple function
    
    sample_by_ : 
        All   -   small to large
        Items -   less to many
        Users -   less to many 
    
    '''
    if model_type == "simple":
        cnt,trainX, testX, trainY, testY = load_data_simple(fname)  
    else:
        cnt,trainX, testX, trainY, testY = load_data_side_info(fname[0],fname[1],fname[2]) 
        
    if sample_by_ == "All":
        #cv = ShuffleSplit(n_splits=10, test_size= int(0.2 * trainX.shape[0]), random_state = 2018)
        
        model_run = {}
        #keep lambda at
        _l = trainX.shape[0]
        
        ###10% for cv
        setcv = set(random.sample(range(_l), int(_l * .10)))
        potentialtrain = set(range(_l)) - setcv
        
        for i in np.arange(.2,1.01,0.2):
            #local split 
            settrain = random.sample(potentialtrain, int(len(potentialtrain) * i))
            print("model_" + sample_by_ +"_"+ str(i))
            model_run["model_" + sample_by_ +"_"+ str(i)] = fm_model(setcv,settrain,trainX,testX,trainY,testY)
            gc.collect()
        #return statistics
        return model_run

    if sample_by_ == "Items":
        # number of items in the dataset : 
        nusers = cnt[0]
        nitems = cnt[1]
        ###**modify**
        
        model_run = {}
        #keep lambda at
        _l = trainX.shape[0]
        
        ###10% for cv
        setcv = set(random.sample(range(_l), int(_l * .10)))
        potentialtrain = set(range(_l)) - setcv
        
        
        for i in [1,5,25,125,625,np.inf]:    #i defines atmost # of movies/items to be sampled
            #local split 
            settrain = set()
            #adds additional 1 min per model
            for j in range(nusers,nusers+nitems): #loop through movie columns
            #list(potentialtrain)# too much time to slice
                train_indices = np.argwhere(trainX[:,j])[:,0] #get non-zero indices which are legal 
                
                cand_indices = potentialtrain.intersection(set(train_indices))
                settrain.update(random.sample(cand_indices, min(i,len(cand_indices))))
                
            
            print("CV size",len(setcv)/(len(settrain)+len(setcv)))
            
            print("model_" + sample_by_ +"_"+ str(i))
            model_run["model_" + sample_by_ +"_"+ str(i)] = fm_model(setcv,settrain,trainX,testX,trainY,testY)        
            gc.collect()
        
        return model_run
    
    
    if sample_by_ == "Users":
        # number of items in the dataset : 
        nusers = cnt[0]
        #nitems = cnt[1]
        ###**modify**
        
        model_run = {}
        #keep lambda at
        _l = trainX.shape[0]
        
        ###10% for cv
        setcv = set(random.sample(range(_l), int(_l * .10)))
        potentialtrain = set(range(_l)) - setcv
        
        
        for i in [1,2,4,16,32,np.inf]:    #i defines atmost # of movies/items to be sampled
            #local split 
            settrain = set()
            #adds additional 1 min per model
            for j in range(nusers): #loop through movie columns
            #list(potentialtrain)# too much time to slice
                train_indices = np.argwhere(trainX[:,j])[:,0] #get non-zero indices which are legal 
                
                cand_indices = potentialtrain.intersection(set(train_indices))
                settrain.update(random.sample(cand_indices, min(i,len(cand_indices))))
                

            print("CV size",len(setcv)/(len(settrain)+len(setcv)))
            
            print("model_" + sample_by_ +"_"+ str(i))
            model_run["model_" + sample_by_ +"_"+ str(i)] = fm_model(setcv,settrain,trainX,testX,trainY,testY)        
            gc.collect()
        
        return model_run


In [42]:
def fm_model(setcv,settrain,trainX,testX,trainY,testY):

    '''
    
    return:
        dict of dict  to store size, time for run, train error, test error etc.
    '''
    
    ##Optimise with cv

    model_summary = {}
    _ = "_"
    
    
    ##Get baseline once
    #https://surprise.readthedocs.io/en/stable/
                #basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly
    
    
    
    itercnt = 0
    ##24 models : hyperspace
    for n_iter in np.arange(50,101,25):
        for rank in  np.arange(2,15,3):
            for l2_reg in np.logspace(-4, 1, 5): 
                ##future plan : further granular level regularization
                    
                start = time.time()
                
                #als : coordinate descent
                estimator_x = als.FMRegression(n_iter = int(n_iter), rank = int(rank), l2_reg = l2_reg)
                estimator_x.fit(trainX[list(settrain)],trainY[list(settrain)])
                
                #estimator_x.fit(trainX[list(settrain)],trainY[list(settrain)])
                
                
                
                ###get RMSE
                train_rmse = mean_squared_error(estimator_x.predict(trainX[list(settrain)]),trainY[list(settrain)])**0.5
                cv_rmse =    mean_squared_error(estimator_x.predict(trainX[list(setcv)]),trainY[list(setcv)])**0.5
                
                #not being used for evaluation *********
                test_rmse = mean_squared_error(estimator_x.predict(testX),testY)**0.5
                
                #average rating of the train set
                baseline = np.mean(trainY[list(settrain)])
                train_baseline = mean_squared_error(np.repeat(baseline,len(testY)),testY)**0.5
                
                #Model summary to be returned for all model
                itercnt += 1
                model_ = {
                'train_rmse' : train_rmse,
                'test_rmse' : test_rmse,
                'cv_rmse' : cv_rmse,
                'train_baseline_rmse' : train_baseline,
                'model_obj' : estimator_x,
                'n_iter' : n_iter,
                'rank' :  rank, #k
                'l2_reg' : l2_reg,
                'train_size':len(settrain),
                #'l2_reg_V':l2_reg_V, #future
                #'l2_reg_w' : l2_reg_w,
                    
                'time' : time.time() - start
                    }
                
                print("\n Model " + str(itercnt) + "done." + "\t CV RMSE: " + str(cv_rmse))
                model_summary["model_seq_ "+str(itercnt) + _ + str(n_iter) +_+ str(rank) +_+ str(l2_reg)] = model_


    
    return model_summary

In [13]:
gc.collect()

1204

#### Subsample data from small to large

In [26]:
###store function return
#trainX, testX, trainY, testY = load_data_simple('./ml-1m/ratings.dat')
basic_model = subsample_user_item('./ml-1m/ratings.dat')
#pickle object for later use

model_All_0.2

 Model 1done.	 CV RMSE: 2.478559963716515

 Model 2done.	 CV RMSE: 1.485147089217713

 Model 3done.	 CV RMSE: 1.0879465799151953

 Model 4done.	 CV RMSE: 0.9652737012755883

 Model 5done.	 CV RMSE: 2.0180079526330337

 Model 6done.	 CV RMSE: 1.5913491622411378

 Model 7done.	 CV RMSE: 1.2900936026463659

 Model 8done.	 CV RMSE: 1.0302302013398854

 Model 9done.	 CV RMSE: 1.8230058032498637

 Model 10done.	 CV RMSE: 1.6620316042378767

 Model 11done.	 CV RMSE: 1.4481483260207597

 Model 12done.	 CV RMSE: 1.0827463293339732

 Model 13done.	 CV RMSE: 2.7997272069163643

 Model 14done.	 CV RMSE: 1.5683450980617133

 Model 15done.	 CV RMSE: 1.1043621199599958

 Model 16done.	 CV RMSE: 0.9641183216173389

 Model 17done.	 CV RMSE: 2.238196745801011

 Model 18done.	 CV RMSE: 1.7153528520763002

 Model 19done.	 CV RMSE: 1.318828446764848

 Model 20done.	 CV RMSE: 1.0310090537919052

 Model 21done.	 CV RMSE: 1.965771861554259

 Model 22done.	 CV RMSE: 1.7833070775883046

 Model 23

In [32]:
pickle.dump(basic_model,open("./dump_basic_model","wb"))

In [33]:
#check best RMSE
m = {}
mi = 10.0

for key in basic_model.keys():
    for k2 in basic_model[key].keys():
        if basic_model[key][k2]["cv_rmse"] <= mi:
            mi = basic_model[key][k2]["cv_rmse"]
            m[basic_model[key][k2]["cv_rmse"]] = key+"_"+k2
        
        
print("Best model: ",m[mi],"\t","best cv error",mi)     

Best model:  model_All_1.0_model_seq_ 36_100_6_1.0 	 best cv error 0.8656917896423594


#### Subsample user/items from sparsely populated to well populated

##### Items

In [27]:
#subsample user/items
#Same performance graphs for low number of items to high, same for users.

##- where do they fail? where do they work best?
basic_model_sub_items = subsample_user_item('./ml-1m/ratings.dat','Items')
#basic_model_sub_users = subsample_user_item('./ml-1m/ratings.dat','Users')

CV size 0.9607146287580444
model_Items_1

 Model 1done.	 CV RMSE: 1.3694701811217187

 Model 2done.	 CV RMSE: 1.320876687383279

 Model 3done.	 CV RMSE: 1.164024502510841

 Model 4done.	 CV RMSE: 1.1112400903834134

 Model 5done.	 CV RMSE: 1.3694278036638885

 Model 6done.	 CV RMSE: 1.320601609877505

 Model 7done.	 CV RMSE: 1.164099796858004

 Model 8done.	 CV RMSE: 1.1110549575926465

 Model 9done.	 CV RMSE: 1.3692149451355689

 Model 10done.	 CV RMSE: 1.3201027201463964

 Model 11done.	 CV RMSE: 1.1640791123431269

 Model 12done.	 CV RMSE: 1.1110065192998009

 Model 13done.	 CV RMSE: 1.367972004425819

 Model 14done.	 CV RMSE: 1.300597213170513

 Model 15done.	 CV RMSE: 1.160631665715515

 Model 16done.	 CV RMSE: 1.1112404406604677

 Model 17done.	 CV RMSE: 1.367860051871458

 Model 18done.	 CV RMSE: 1.3005040077418824

 Model 19done.	 CV RMSE: 1.160688884485152

 Model 20done.	 CV RMSE: 1.1110555089993142

 Model 21done.	 CV RMSE: 1.3676626645087957

 Model 22done.	 CV RMSE: 1.3001

CV size 0.09999911129675135
model_Items_inf

 Model 1done.	 CV RMSE: 0.8894064922661781

 Model 2done.	 CV RMSE: 0.8811486205492839

 Model 3done.	 CV RMSE: 0.8761648500417049

 Model 4done.	 CV RMSE: 0.8721195881985351

 Model 5done.	 CV RMSE: 0.8825192597053337

 Model 6done.	 CV RMSE: 0.8812291476653358

 Model 7done.	 CV RMSE: 0.8747651431189952

 Model 8done.	 CV RMSE: 0.8626779771362061

 Model 9done.	 CV RMSE: 0.9028473298863007

 Model 10done.	 CV RMSE: 0.9008236623206356

 Model 11done.	 CV RMSE: 0.8894449166289823

 Model 12done.	 CV RMSE: 0.8629279197371013

 Model 13done.	 CV RMSE: 0.8961339256466629

 Model 14done.	 CV RMSE: 0.8805283172206883

 Model 15done.	 CV RMSE: 0.8762263895955366

 Model 16done.	 CV RMSE: 0.8720965951591964

 Model 17done.	 CV RMSE: 0.886739682883953

 Model 18done.	 CV RMSE: 0.8833398851111052

 Model 19done.	 CV RMSE: 0.8754480229463377

 Model 20done.	 CV RMSE: 0.8627044967116283

 Model 21done.	 CV RMSE: 0.9089605518296667

 Model 22done.	 CV R

In [31]:
pickle.dump(basic_model_sub_items,open("./dump_basic_model_sub_items","wb"))

In [28]:
#check best RMSE
m = {}
mi = 10.0

for key in basic_model_sub_items.keys():
    for k2 in basic_model_sub_items[key].keys():
        if basic_model_sub_items[key][k2]["cv_rmse"] <= mi:
            mi = basic_model_sub_items[key][k2]["cv_rmse"]
            m[basic_model_sub_items[key][k2]["cv_rmse"]] = key+"_"+k2
        
        
print("Best model: ",m[mi],"\t","best cv error",mi)     

Best model:  model_Items_inf_model_seq_ 8_50_4_1.0 	 best cv error 0.8626779771362061


##### Users

In [29]:
basic_model_sub_users = subsample_user_item('./ml-1m/ratings.dat','Users')

CV size 0.9371213225343022
model_Users_1

 Model 1done.	 CV RMSE: 1.4757187828208054

 Model 2done.	 CV RMSE: 1.406855552533479

 Model 3done.	 CV RMSE: 1.1756997073724276

 Model 4done.	 CV RMSE: 1.0935054747553064

 Model 5done.	 CV RMSE: 1.4758751133768713

 Model 6done.	 CV RMSE: 1.4064121261726847

 Model 7done.	 CV RMSE: 1.1755697600960564

 Model 8done.	 CV RMSE: 1.0914237640358124

 Model 9done.	 CV RMSE: 1.4756229345318919

 Model 10done.	 CV RMSE: 1.4054699031320517

 Model 11done.	 CV RMSE: 1.1754443582228566

 Model 12done.	 CV RMSE: 1.0905740345797992

 Model 13done.	 CV RMSE: 1.4736380832663967

 Model 14done.	 CV RMSE: 1.3768998987638943

 Model 15done.	 CV RMSE: 1.175203810594718

 Model 16done.	 CV RMSE: 1.0935008434426992

 Model 17done.	 CV RMSE: 1.4737134264992648

 Model 18done.	 CV RMSE: 1.3767478525788563

 Model 19done.	 CV RMSE: 1.1749597861786043

 Model 20done.	 CV RMSE: 1.0914226867043093

 Model 21done.	 CV RMSE: 1.47345612520456

 Model 22done.	 CV RMSE: 1

CV size 0.09999911129675135
model_Users_inf

 Model 1done.	 CV RMSE: 0.8809537257541866

 Model 2done.	 CV RMSE: 0.8802820783007432

 Model 3done.	 CV RMSE: 0.8758564835677176

 Model 4done.	 CV RMSE: 0.8715321985669836

 Model 5done.	 CV RMSE: 0.8902861071767592

 Model 6done.	 CV RMSE: 0.8880975483820251

 Model 7done.	 CV RMSE: 0.8784470578110511

 Model 8done.	 CV RMSE: 0.865361672135349

 Model 9done.	 CV RMSE: 0.9101816832408643

 Model 10done.	 CV RMSE: 0.9061032402332597

 Model 11done.	 CV RMSE: 0.8915932891521112

 Model 12done.	 CV RMSE: 0.8640353470493134

 Model 13done.	 CV RMSE: 0.882817298689741

 Model 14done.	 CV RMSE: 0.8812900419263673

 Model 15done.	 CV RMSE: 0.8756543511142498

 Model 16done.	 CV RMSE: 0.8715252432324337

 Model 17done.	 CV RMSE: 0.8923836998200364

 Model 18done.	 CV RMSE: 0.8899498328808857

 Model 19done.	 CV RMSE: 0.8786310680840406

 Model 20done.	 CV RMSE: 0.8649227004596103

 Model 21done.	 CV RMSE: 0.9129080834821295

 Model 22done.	 CV RM

In [30]:
pickle.dump(basic_model_sub_users,open("./dump_basic_model_sub_users","wb"))

In [34]:
#check best RMSE
m = {}
mi = 10.0

for key in basic_model_sub_users.keys():
    for k2 in basic_model_sub_users[key].keys():
        if basic_model_sub_users[key][k2]["cv_rmse"] <= mi:
            mi = basic_model_sub_users[key][k2]["cv_rmse"]
            m[basic_model_sub_users[key][k2]["cv_rmse"]] = key+"_"+k2
        
        
print("Best model: ",m[mi],"\t","best cv error",mi)     

Best model:  model_Users_inf_model_seq_ 36_100_6_1.0 	 best cv error 0.8637394805622824


### Regressor 2: Side information

In [18]:
def load_data_side_info(ratings_file_,user_file_,movie_file_):
    
    '''
    Input 
    
        File_ : file name of the ratings file
    
    Output : test - train dfs
    '''

    users = pd.read_csv(user_file_, sep='::', names=['userId', 'gender', 'age', 'occupation', 'zip'], \
                    header=None)

    movies = pd.read_csv(movie_file_, sep='::', names=['movieId', 'title', 'genres'], header=None)
    movies.head()
    movies['year'] = movies.title.apply(lambda x : x[-5:-1])
    
    sparse_genres = pd.DataFrame(CountVectorizer().fit_transform(movies.genres\
                                    .map(lambda x: x.replace('|', ' '))).todense())
    
    movies = pd.concat([movies[['movieId']], sparse_genres], axis=1) 
    
    
    ratings = np.genfromtxt(ratings_file_,delimiter="::")
    ratings =  pd.DataFrame(ratings)
    
    ratings.columns = ['userId','movieId','rating','timestamp']
    ratings = ratings.drop('timestamp', axis=1)
    
    ratings = pd.merge(pd.merge(ratings, users, on='userId'), movies, on='movieId')
    
    y = ratings['rating'].values
    X = ratings.drop('rating', axis=1)
    for feature in X.columns:
        _,X[feature] = np.unique(X[feature], return_inverse=True)
    
    encoder = encode(X)
    trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.20, random_state=1234)
    
    
    ##fixed
    #trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.10, random_state=1234)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    
    return (X.userId.nunique(),X.movieId.nunique()),trainX, testX, trainY, testY

#### Add feature 1 : performe the test1 and test2

#### Running it for 150 iterations

In [38]:
###store function return
#trainX, testX, trainY, testY = load_data_simple('./ml-1m/ratings.dat')
files = (('./ml-1m/ratings.dat'),('./ml-1m/users.dat'),('./ml-1m/movies.dat'))
add_info_model = subsample_user_item(files,model_type = "side_info")
#pickle object for later use

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


model_All_0.2

 Model 1done.	 CV RMSE: 1.8361533643150856

 Model 2done.	 CV RMSE: 1.4190197950185062

 Model 3done.	 CV RMSE: 1.1317654798743346

 Model 4done.	 CV RMSE: 0.9849017276244169

 Model 5done.	 CV RMSE: 2.4580026018095493

 Model 6done.	 CV RMSE: 1.5830739508574916

 Model 7done.	 CV RMSE: 1.268978146697139

 Model 8done.	 CV RMSE: 1.044802179011477

 Model 9done.	 CV RMSE: 1.8815711273305318

 Model 10done.	 CV RMSE: 1.7017528980470988

 Model 11done.	 CV RMSE: 1.4064450187758057

 Model 12done.	 CV RMSE: 1.0971355755595265

 Model 13done.	 CV RMSE: 2.146025364368289

 Model 14done.	 CV RMSE: 1.510842409168484

 Model 15done.	 CV RMSE: 1.1491794866409097

 Model 16done.	 CV RMSE: 0.985844958843531

 Model 17done.	 CV RMSE: 2.5961793613528292

 Model 18done.	 CV RMSE: 1.6571141460662981

 Model 19done.	 CV RMSE: 1.316940493779122

 Model 20done.	 CV RMSE: 1.0467791973244598

 Model 21done.	 CV RMSE: 2.0848317692516636

 Model 22done.	 CV RMSE: 1.8549660131527201

 Model 23d


 Model 4done.	 CV RMSE: 0.8895153104777785

 Model 5done.	 CV RMSE: 0.9412029283354142

 Model 6done.	 CV RMSE: 0.9304536036355996

 Model 7done.	 CV RMSE: 0.911453312333971

 Model 8done.	 CV RMSE: 0.8864063772439899

 Model 9done.	 CV RMSE: 1.0194074490434102

 Model 10done.	 CV RMSE: 0.9928474595555475

 Model 11done.	 CV RMSE: 0.9413568015289463

 Model 12done.	 CV RMSE: 0.8929804139676623

 Model 13done.	 CV RMSE: 0.913688194844385

 Model 14done.	 CV RMSE: 0.9082610894736621

 Model 15done.	 CV RMSE: 0.8997842834392341

 Model 16done.	 CV RMSE: 0.8880172702663953

 Model 17done.	 CV RMSE: 0.9503195969633562

 Model 18done.	 CV RMSE: 0.9361064630940142

 Model 19done.	 CV RMSE: 0.9122744484111537

 Model 20done.	 CV RMSE: 0.8855476802905381

 Model 21done.	 CV RMSE: 1.0423540036182752

 Model 22done.	 CV RMSE: 1.0057491325320174

 Model 23done.	 CV RMSE: 0.9447858482355536

 Model 24done.	 CV RMSE: 0.8914790727211762

 Model 25done.	 CV RMSE: 0.9155484558308289

 Model 26done.	 C

In [39]:
#check best RMSE
m = {}
mi = 10.0

for key in add_info_model.keys():
    for k2 in add_info_model[key].keys():
        if add_info_model[key][k2]["cv_rmse"] <= mi:
            mi = add_info_model[key][k2]["cv_rmse"]
            m[add_info_model[key][k2]["cv_rmse"]] = key+"_"+k2
        
        
print("Best model: ",m[mi],"\t","best cv error",mi)     

Best model:  model_All_1.0_model_seq_ 56_150_4_1.0 	 best cv error 0.875836691458815


In [41]:
pickle.dump(add_info_model,open("./add_info_model","wb"))

#### increasing hyperspace, reducing iterations to 100

In [44]:
###store function return
#trainX, testX, trainY, testY = load_data_simple('./ml-1m/ratings.dat')
files = (('./ml-1m/ratings.dat'),('./ml-1m/users.dat'),('./ml-1m/movies.dat'))
add_info_model_morekreg = subsample_user_item(files,model_type = "side_info")
#pickle object for later use

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


model_All_0.2

 Model 1done.	 CV RMSE: 1.828518510476075

 Model 2done.	 CV RMSE: 1.4386607472110768

 Model 3done.	 CV RMSE: 1.1671672998722333

 Model 4done.	 CV RMSE: 1.001282230148569

 Model 5done.	 CV RMSE: 0.9232263636185501

 Model 6done.	 CV RMSE: 1.7632220624881716

 Model 7done.	 CV RMSE: 1.6110061000171694

 Model 8done.	 CV RMSE: 1.3541147258555573

 Model 9done.	 CV RMSE: 1.112966826353428

 Model 10done.	 CV RMSE: 0.9440743672604938

 Model 11done.	 CV RMSE: 1.799611073805364

 Model 12done.	 CV RMSE: 1.6867433879321703

 Model 13done.	 CV RMSE: 1.4910621561131312

 Model 14done.	 CV RMSE: 1.1932522855788679

 Model 15done.	 CV RMSE: 0.9538086825148885

 Model 16done.	 CV RMSE: 1.9200610589945633

 Model 17done.	 CV RMSE: 1.7812199339282107

 Model 18done.	 CV RMSE: 1.578837664609271

 Model 19done.	 CV RMSE: 1.2618391560037474

 Model 20done.	 CV RMSE: 0.9688001363183516

 Model 21done.	 CV RMSE: 1.8873967986618156

 Model 22done.	 CV RMSE: 1.7979085615920658

 Model 23


 Model 34done.	 CV RMSE: 0.9208305950247402

 Model 35done.	 CV RMSE: 0.8763600291094793

 Model 36done.	 CV RMSE: 1.2277646414314705

 Model 37done.	 CV RMSE: 1.2005028852559152

 Model 38done.	 CV RMSE: 1.0774396458870727

 Model 39done.	 CV RMSE: 0.9575723024434802

 Model 40done.	 CV RMSE: 0.8784657676046138

 Model 41done.	 CV RMSE: 1.2931613730036682

 Model 42done.	 CV RMSE: 1.2591099167275233

 Model 43done.	 CV RMSE: 1.145075185874487

 Model 44done.	 CV RMSE: 0.9970978159998808

 Model 45done.	 CV RMSE: 0.886021343528304

 Model 46done.	 CV RMSE: 1.3572452554228787

 Model 47done.	 CV RMSE: 1.3331622126909075

 Model 48done.	 CV RMSE: 1.2204559167821944

 Model 49done.	 CV RMSE: 1.0386723690200996

 Model 50done.	 CV RMSE: 0.892821102000648

 Model 51done.	 CV RMSE: 0.9933170291877568

 Model 52done.	 CV RMSE: 0.9524353837299722

 Model 53done.	 CV RMSE: 0.9185540284138387

 Model 54done.	 CV RMSE: 0.901269490767689

 Model 55done.	 CV RMSE: 0.8875673102712153

 Model 56done


 Model 67done.	 CV RMSE: 1.0493606698805287

 Model 68done.	 CV RMSE: 0.996720615080953

 Model 69done.	 CV RMSE: 0.916527486007083

 Model 70done.	 CV RMSE: 0.8540838671576749

 Model 71done.	 CV RMSE: 1.1357729575298818

 Model 72done.	 CV RMSE: 1.1186259728543406

 Model 73done.	 CV RMSE: 1.0518895605631233

 Model 74done.	 CV RMSE: 0.9437555998169164

 Model 75done.	 CV RMSE: 0.8590930371564496


In [45]:
#check best RMSE
m = {}
mi = 10.0

for key in add_info_model_morekreg.keys():
    for k2 in add_info_model_morekreg[key].keys():
        if add_info_model_morekreg[key][k2]["cv_rmse"] <= mi:
            mi = add_info_model_morekreg[key][k2]["cv_rmse"]
            m[add_info_model_morekreg[key][k2]["cv_rmse"]] = key+"_"+k2
        
        
print("Best model: ",m[mi],"\t","best cv error",mi)     

Best model:  model_All_1.0_model_seq_ 65_100_8_10.0 	 best cv error 0.8540233748897804


In [46]:
pickle.dump(add_info_model_morekreg,open("./add_info_model_morekreg","wb"))