In [1]:
import pandas as pd;
import numpy as np;
import time;
import sagemaker as sg;
import scipy;
import csv;
import xlearn as xl;
import random;


```
dataLink :  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
groupLens: https://grouplens.org/datasets/movielens/
readMe: http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html
```

### Summary
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv. More details about the contents and use of all these files can be found in the README linked above.

This is a development dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available benchmark datasets if that is your intent.

This and other GroupLens data sets are publicly available for download at http://grouplens.org/datasets/.

In [2]:
# Genome tables are intentionally not loaded; the calls are kept for reference.
#genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
#genome_tags = pd.read_csv('ml-latest/genome-tags.CSV')

# Read the four CSV tables of the small dataset, one DataFrame per file,
# in the same order as the names on the left-hand side.
links, movies, ratings, tags = map(
    pd.read_csv,
    (
        'ml-latest-small/links.csv',
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    ),
)

##### Preparing data in libsvm & libffm format

In [3]:
### Convert data to libsvm (FM) or libffm (FFM) format

def convert_to_fm(filename,df,_type,features,_model = "fm",label = "rating"):
    """Write ``df`` to ``filename`` as one-hot encoded FM / FFM training text.

    Every distinct value of every feature column receives its own global,
    zero-based feature index: columns are numbered one after another in the
    order given by ``features``, and values inside a column are numbered in
    order of first appearance. Each row becomes one comma-separated line::

        <label>,<index>:1,<index>:1,...              (_model != "ffm")
        <label>,<field>:<index>:1,<field>:<index>:1  (_model == "ffm")

    where ``<field>`` is the zero-based position of the feature column.

    Parameters
    ----------
    filename : str or path-like
        Destination text file; it is overwritten if it already exists.
    df : pandas.DataFrame
        Source data; must contain every column named in ``features``.
    _type : any
        Unused. Kept so that existing positional calls keep working.
    features : list of str
        Columns to export, including the ``label`` column.
    _model : str, default "fm"
        ``"ffm"`` adds a numeric field prefix to each feature token; any
        other value produces plain ``index:1`` tokens.
    label : str, default "rating"
        Name of the target column inside ``features``.

    Raises
    ------
    ValueError
        If ``label`` is not one of the selected columns, or if any selected
        column contains missing values (they cannot be encoded).
    """
    # Restrict the frame to the requested columns (label included).
    df = df[features]

    if label not in df.columns:
        raise ValueError("label column %r must be listed in features" % (label,))
    if df.isna().any().any():
        raise ValueError("convert_to_fm cannot encode missing values; drop or fill them first")

    # Feature columns, in the order they were requested.
    field_list = [col for col in df.columns if col != label]

    # Encode each feature column as a list of global indices. ``next_index``
    # is the first free index, so index ranges of different columns never
    # overlap.
    next_index = 0
    encoded_columns = []
    for col in field_list:
        value_to_index = {}
        codes = []
        for value in df[col].tolist():
            # setdefault's argument is evaluated before insertion, so a new
            # value gets the next unused index of this column.
            codes.append(value_to_index.setdefault(value, next_index + len(value_to_index)))
        encoded_columns.append(codes)
        next_index += len(value_to_index)

    use_fields = _model == "ffm"

    def label_text(value):
        # Whole-number labels are written as integers (e.g. "4"); fractional
        # labels such as 3.5 are kept intact instead of being truncated.
        number = float(value)
        if number.is_integer():
            return str(int(number))
        return repr(number)

    # The context manager guarantees the file is closed even if a row fails.
    with open(filename, 'w') as text_file:
        for row in zip(df[label].tolist(), *encoded_columns):
            tokens = [label_text(row[0])]
            for field_no, feature_index in enumerate(row[1:]):
                if use_fields:
                    # Field ids are numeric positions, not column names.
                    tokens.append("%d:%d:1" % (field_no, feature_index))
                else:
                    tokens.append("%d:1" % feature_index)
            text_file.write(",".join(tokens) + "\n")

##### Running a basic model

In [14]:
# Hyper-parameters passed to fit(): regression task, learning rate and lambda.
param = {'task':'reg', 'lr':0.2, 'lambda':0.002}

# Export the user/movie/rating columns of `ratings` to the training text file
# (default `_model`, i.e. plain index:value tokens).
convert_to_fm(
    filename='test.txt',
    df=ratings,
    _type="text",
    features=['userId', 'movieId', 'rating'],
)

# Train the model returned by xl.create_fm() on that file.
fm_model = xl.create_fm()
fm_model.setTrain("./test.txt")
#fm_model.setValidate("./small_test.txt")
fm_model.fit(param, "./model.out")

In [16]:
# Export the ratings with _model="ffm" so that convert_to_fm adds the field
# prefix to every feature token; without it the file is written in the plain
# FM layout even though it is fed to the model from xl.create_ffm().
convert_to_fm('testffm.txt',ratings,"text",['userId', 'movieId', 'rating'],_model="ffm")
ffm_model = xl.create_ffm()
ffm_model.setTrain("./testffm.txt")
#ffm_model.setValidate("./small_test.txt")
param = {'task':'reg', 'lr':0.2, 'lambda':0.002}

# NOTE(review): "./model.out" is also the output path of the FM cell, so this
# run overwrites that model file -- confirm this is intended.
ffm_model.fit(param, "./model.out")

##### Benchmark against matrix factorization