In [2]:
import pandas as pd;
import numpy as np;
import time;
import sagemaker as sg;
import scipy;
import csv;
import xlearn as xl;
import random;
from tqdm import tqdm;
from sklearn.model_selection import train_test_split;
import numpy as np;
from sklearn.metrics import mean_squared_error;
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


```
dataLink :  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
groupLens: https://grouplens.org/datasets/movielens/
readMe: http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html
```

### Summary
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv. More details about the contents and use of all these files follows.

This is a development dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available benchmark datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at http://grouplens.org/datasets/.

In [12]:
#genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
#genome_tags = pd.read_csv('ml-latest/genome-tags.csv')
#links = pd.read_csv('ml-latest-small/links.csv')
#movies = pd.read_csv('ml-1m/movies.dat')
#ratings = pd.read_csv('ml-1m/ratings.dat')
#users = pd.read_csv('ml-1m/users.dat')

##### Preparing data in libsvm & libffm format

In [124]:
###Convert data to libsvm format


### reading & convert ratings file

def convert_ratings_to_fm(fin,fout,feature_index,rating_index,_model = "fm"):
    '''
    Input : ratings file with columns in the following order
            1) user_id
            2) movie_id
            3) rating
            4) timestamp - Ignoring this column for now
            
    Arguments : fin : input ratings file
                fout : output file name - column indices to be included
                column index containing the rating
                _model : ffm/fm
    
    
    Output:
            ratings matrix transformed to libsvm
    
    '''
    
    rat_file = open(fin,'r')  #input file
    text_file = open(fout,'w') #output file
    
    
    if _model=="ffm":
        add_field = 1
    else:
        add_field = 0
    
    
    #Initialize ::
    val0 = rat_file.readline();
    split_row0 = val0.split('::')
    datastring = ""  #stores the final string
    indx_cntr = 0
    d_field = {}

    ###User
    d_field[feature_index[0]] = {split_row0[0] : indx_cntr}
    indx_cntr = indx_cntr + 1
    ###movie
    d_field[feature_index[1]] = {split_row0[1] : indx_cntr}
    indx_cntr = indx_cntr + 1
    ###first string
    ###rating
    datastring += str(int(split_row0[rating_index]))
    ###user   
    datastring += "," + ("0" + ":") * add_field + str(d_field[0][split_row0[0]]) + ":" + "1"
    ###movie
    datastring += "," + ("1" + ":") * add_field + str(d_field[1][split_row0[1]]) + ":" + "1"
    
    datastring += "\n"
    text_file.write(datastring) 
    
    #iterate over all the lines
    for val in rat_file.readlines():

        #split each row
        split_row = val.split('::')
        #rating
        datastring = str(int(split_row[rating_index]))
        for col in feature_index: #ignoring timestamp, rating
           
            #if a new user/movie found, add it to dictionary
            if d_field[col].get(split_row[col],None) == None:
                d_field[col][split_row[int(col)]] = indx_cntr
                indx_cntr += 1

            datastring += "," + (str(col) + ":") * add_field + str(d_field[col][split_row[col]]) + ":" + "1"
        datastring += "\n"
        text_file.write(datastring)        
         
    text_file.close()     

##### Running a basic model to check if the code is running

In [126]:
convert_ratings_to_fm("ml-1m/ratings.dat","ratings_v1",[0,1],2,_model = "fm")
fm_model = xl.create_fm()
fm_model.setTrain("./fout")
#fm_model.setValidate("./small_test.txt")
param = {'task':'reg', 'lr':0.2, 'lambda':0.002}

fm_model.fit(param, "./model.out")

In [128]:
convert_ratings_to_fm("ml-1m/ratings.dat","ratings_v1",[0,1],2,_model = "ffm")
ffm_model = xl.create_ffm()
ffm_model.setTrain("./testffm.txt")
#fm_model.setValidate("./small_test.txt")
param = {'task':'reg', 'lr':0.2, 'lambda':0.002}

ffm_model.fit(param, "./model.out")

In [140]:
convert_ratings_to_fm("ml-1m/ratings.dat","ratings_v1",[0,1],2,_model = "fm")
convert_ratings_to_fm("ml-1m/ratings.dat","ratings_v1",[0,1],2,_model = "ffm")

##### Creating a train - test split 

In [222]:
def train_test_split(fname,test_percent,delimiter,seed = 242):
    
    '''
    Input : File for ratings
    
    Output : test/train files with "train_" + fname & "test_" + fname, returns True if split is correct, 
             False otherwise
    
    Argument : fname : File which has the ratings
               test_percent : percentage of ratings in test
               seed : seed
               delimiter : delimeter of the ratings file
                  
    '''
    
    file = open(fname)
    x = file.readlines()
    
    test_indices = random.sample(k = int(len(x) * test_percent), population = range(len(x)))
    train_indices = set(range(len(x))) - set(test_indices)
    
    tmp = fname.split(".")[0]
    
    f_train = open(tmp + "train.dat","w")
    [f_train.write(x[i]) for i in train_indices]
    f_train.close()
    
    
    
    f_test = open(tmp + "test.dat","w")
    [f_test.write(x[i]) for i in test_indices]
    f_test.close()
    
       
    x = np.genfromtxt(tmp + "train.dat",delimiter = delimiter)
    y = np.genfromtxt(tmp + "test.dat",delimiter = delimiter)
    
    
    ##checks if train user items are superset of y
    return set(np.unique(x[:,0])).issuperset(np.unique(y[:,0]))

In [224]:
#create test control
train_test_split("ml-1m/ratings.dat",.20,"::")

True

In [None]:
#get_rmse
mean_squared_error(y_true, ypred)



### (A) : Attempting a simple Factorization machine on 1 Million dataset - with just User X Movie

In [None]:



param = {
        }



fm_A_model.setTest("ml-1m/ratingstest.dat")
#fm_A_model.setSigmoid()
fm_A_model.predict("Models/model_fm_A.out", "output.txt")

In [None]:
param = {
     'task':'reg', # ‘binary’ for classification, ‘reg’ for Regression
     'k':2,           # Size of latent factor
     'lr':0.1,        # Learning rate for GD
     'lambda':0.0002, # L2 Regularization Parameter
     'Metric':'auc',  # Metric for monitoring validation set performance
     'epoch':25  ,     # Maximum number of Epochs
     'opt':'sgd'      #optimization method
     
     
     # hyperameters
     ,'alpha':0.002, 'beta':0.8, 'lambda_1':0.001, 'lambda_2': 1.0
      }

In [None]:
fm_A_model = xl.create_fm()
fm_A_model.setTrain("ml-1m/ratingstrain.dat")
fm_A_model.cv(param)

In [None]:
def score(params):
    print "Training with params : "
    print params
    
    fm_A_model = xl.create_fm()
    fm_A_model.setTrain("ml-1m/ratingstrain.dat")
    fm_A_model.cv(param)
    
    
    
    
    print "\tScore {0}\n\n".format(score)
    return {'loss': score, 'status': STATUS_OK}

def optimize(trials):
    space = {
         'task':'reg', # ‘binary’ for classification, ‘reg’ for Regression
         'k':2,           # Size of latent factor
         'lr':0.1,        # Learning rate for GD
         'lambda':0.0002, # L2 Regularization Parameter
         'Metric':'auc',  # Metric for monitoring validation set performance
         'epoch':25  ,     # Maximum number of Epochs
         'opt':'sgd'      #optimization method
         
         
         # hyperameters
         ,'alpha':0.002, 'beta':0.8, 'lambda_1':0.001, 'lambda_2': 1.0
          }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

    print best



X, y = trains,target
print "Splitting data into train and valid ...\n\n"
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234)

#Trials object where the history of search will be stored
trials = Trials()

optimize(trials)


##### Benchmark it with Matrix factorization

In [229]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


# Load the movielens-100k dataset (download it if needed),
#data = Dataset.load_builtin('ml-100k')

# We'll use the famous SVD algorithm.
algo = SVD()


# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] n
Ok then, I'm out!


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

ffm_model = xl.create_ffm()
ffm_model.setTrain("./small_train.txt")
ffm_model.setValidate("./small_test.txt")
ffm_model.setPreModel("./pre_model")
param = {'task':'binary', 'lr':0.2, 'lambda':0.002}

ffm_model.fit(param, "./model.out")