In [6]:
import numpy as np
import pandas as pd
#import load_problems
import cPickle as pickle
from sklearn.metrics import roc_auc_score, mean_squared_error

In [83]:
from fastFM.mcmc import FMClassification, FMRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
from pyfm import pylibfm

In [85]:
import sys

In [45]:
# Read in data #先不用
def loadData(filename,path="ml-100k/"):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        for line in f:
            (user,movieid,rating,ts)=line.split('\t')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

In [None]:
LIBFM_PATH = '/moosefs/ipython_env/python_libfm/bin/libFM'
PYLIBFM_PATH = '/moosefs/ipython_env/python_pylibFM/'

import sys
if PYLIBFM_PATH not in sys.path:
    sys.path.insert(0, PYLIBFM_PATH)
import pylibfm


In [11]:
def fitpredict_logistic(trainX, trainY, testX, classification=True, **params):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = LogisticRegression(**params)
        clf.fit(trainX, trainY)
        return clf.predict_proba(testX)[:, 1]
    else:
        clf = Ridge(**params)
        clf.fit(trainX, trainY)
        return clf.predict(testX)


In [14]:
def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = FMClassification(rank=rank, n_iter=n_iter)
        return clf.fit_predict_proba(trainX, trainY, testX)
    else:
        clf = FMRegression(rank=rank, n_iter=n_iter)
        return clf.fit_predict(trainX, trainY, testX) 

In [12]:
def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'w') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'w') as f:
        dump_svmlight_file(testX, numpy.zeros(testX.shape[0]), f=f)
    task = 'c' if classification else 'r'
    console_output = !$LIBFM_PATH -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -out output.libfm
    
    libfm_pred = pandas.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred


In [44]:
def fitpredict_pylibfm(trainX, trainY, testX, classification=True, rank=8, n_iter=10):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    task = 'classification' if classification else 'regression'
    fm = pylibfm.FM(num_factors=rank, num_iter=n_iter, verbose=False, task=task)
    if classification:
        fm.fit(trainX, trainY)
    else:
        fm.fit(trainX, trainY * 1.)
    return fm.predict(testX)

In [88]:
from collections import OrderedDict
import time

all_results = OrderedDict()
try:
    with open('./saved_results.pkl') as f:
        all_results = pickle.load(f)
except:
    pass

def test_on_dataset(trainX, testX, trainY, testY, task_name, classification=True, use_pylibfm=True):
    algorithms = OrderedDict()
    algorithms['logistic'] = fitpredict_logistic
    #algorithms['libFM']    = fitpredict_libfm
    algorithms['fastFM']   = fitpredict_fastfm
    if use_pylibfm:
        algorithms['pylibfm']  = fitpredict_pylibfm
    
    results = pandas.DataFrame()
    for name, fit_predict in algorithms.items():
        start = time.time()
        predictions = fit_predict(trainX, trainY, testX, classification=classification)
        spent_time = time.time() - start
        results.ix[name, 'time'] = spent_time
        if classification:
            results.ix[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
        else:
            results.ix[name, 'RMSE'] = numpy.mean((testY - predictions) ** 2) ** 0.5
            
    all_results[task_name] = results
    with open('saved_results.pkl', 'w') as f:
        pickle.dump(all_results, f)
        
    return results

In [28]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, parse_dates=True) 

In [29]:
rating_cols = ['user_id', 'items_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=rating_cols)

In [30]:
movie_cols = ['items_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure',
              'Animation',"Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
              'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi', 
              'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movie_cols ,encoding='latin-1')

In [36]:
movie_ratings = pd.merge(movies, ratings)
df = pd.merge(users, movie_ratings)

In [7]:
#df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

In [37]:
n_users = df.user_id.unique().shape[0]
n_items = df.items_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)

Number of users = 943 | Number of movies = 1682


In [38]:
#from sklearn import cross_validation as cv
#train, test_data = cv.train_test_split(df, test_size=0.25)
#之後請改用model_selection

In [51]:
df.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,items_id,title,release_date,video_release_date,imdb_url,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating,unix_timestamp
0,1,24,M,technician,85711,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,...,0,0,0,0,0,0,0,0,5,874965758
1,1,24,M,technician,85711,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,...,0,0,0,0,0,1,0,0,3,876893171
2,1,24,M,technician,85711,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,...,0,0,0,0,0,1,0,0,4,878542960
3,1,24,M,technician,85711,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,...,0,0,0,0,0,0,0,0,3,876893119
4,1,24,M,technician,85711,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),...,0,0,0,0,0,1,0,0,3,889751712


In [54]:
df.dtypes

user_id                 int64
age                     int64
sex                    object
occupation             object
zip_code               object
items_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
unknown                 int64
Action                  int64
Adventure               int64
Animation               int64
Children's              int64
Comedy                  int64
Crime                   int64
Documentary             int64
Drama                   int64
Fantasy                 int64
Film-Noir               int64
Horror                  int64
Musical                 int64
Mystery                 int64
Romance                 int64
Sci-Fi                  int64
Thriller                int64
War                     int64
Western                 int64
rating                  int64
unix_timestamp          int64
dtype: object

In [76]:
a = df.dtypes != "object"
df2 = df[a[a.values == True].index].head()
df2 = df2.drop(["video_release_date"],axis = 1)

In [78]:
trainX, testX, trainY, testY = train_test_split(df2.drop(["items_id"],axis = 1), df2.items_id, test_size = 0.3)

In [90]:
trainX.head()

Unnamed: 0,user_id,age,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating,unix_timestamp
0,1,24,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,5,874965758
1,1,24,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,3,876893171
2,1,24,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,4,878542960


In [89]:
test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k, ids', classification=False)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


Unnamed: 0,time,RMSE
logistic,77.526744,2.437634
fastFM,73.389579,3.417266
pylibfm,79.80395,3.302464
