In [None]:
import numpy as np
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import json
import pickle
import pandas as pd
import joblib

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

%pylab inline

### get data

In [None]:
cat ./ml-100k/README

In [None]:
# Remove broken symbols: run u.item through iconv (utf-8 -> utf-8, with -c)
# and write the cleaned copy to u.item2, which is the file loaded later on.
! iconv -f utf-8 -t utf-8 -c ml-100k/u.item >  ml-100k/u.item2

## user part 

In [None]:
# Peek at the first three lines of the raw user file.
! head -3 ./ml-100k/u.user

In [None]:
# Load the pipe-separated user table; column names come from splitting the header string.
user_columns = 'user id | age | gender | occupation | zip code'.split(' | ')
df_user = pd.read_csv('./ml-100k/u.user', sep='|', names=user_columns)

# Keep only the first character of the zip code as a coarse "living area" feature,
# then drop the full zip code.
df_user['living_area'] = df_user['zip code'].map(lambda zip_code: zip_code[0])
del df_user['zip code']

df_user.head()

In [None]:
# Display the user table (age, gender, occupation, living_area per user id).
df_user

In [None]:
# encode features

In [None]:
# Label-encode every user attribute and remember how many distinct values each one has.
features_list = ['age', 'gender', 'occupation', 'living_area']
le = LabelEncoder()

encoded_columns = []
s_users = []
for feature in features_list:
    # fit_transform refits the shared encoder, so classes_ describes the current feature only
    encoded_columns.append(le.fit_transform(df_user[feature].values))
    s_users.append(len(le.classes_))

# Transpose so that each row is one user and each column one encoded feature.
users_mat = np.array(encoded_columns).T
print(users_mat.shape)

In [None]:
# Number of distinct encoded values per user feature, in features_list order.
s_users

In [None]:
# Map each user id to its row of label-encoded features.
# Built with zip so no loop variable named `id` is left behind to shadow the builtin.
users = dict(zip(df_user['user id'].values, users_mat))

In [None]:
# Display the user id -> encoded feature row mapping.
users

## item part 

In [None]:
# Peek at the first three lines of the cleaned item file.
! head -3 ./ml-100k/u.item2

In [None]:
# Column layout of u.item2: five descriptive fields followed by the columns g0..g18.
descriptive_columns = ['id', 'title', 'release_date', 'video_release_date', 'url']
genre_columns = ['g{}'.format(i) for i in range(19)]
df_item = pd.read_csv('./ml-100k/u.item2', sep='|', names=descriptive_columns + genre_columns)

# The year is the last dash-separated token of the release date;
# a missing date is stringified first, so it ends up as 'nan'.
df_item['year'] = df_item['release_date'].map(lambda release: str(release).split('-')[-1])

In [None]:
# Display the item table, including the derived `year` column.
df_item

In [None]:
# Turn each release year into a "decade" bucket; missing years ('nan') use the sentinel 1600.
# NOTE(review): round(..., -1) rounds to the NEAREST multiple of ten rather than
# flooring, so e.g. 1996 lands in the 2000 bucket - confirm this is intended.
df_item['decade'] = [
    int(round(int('1600' if year == 'nan' else year), -1))
    for year in map(str, df_item['year'].values)
]

In [None]:
# Label-encode the decade and the g0..g18 columns, recording the number of
# distinct values per feature (reuses the shared LabelEncoder `le`).
features_list = ['decade'] + ['g{}'.format(i) for i in range(19)]

encoded_columns = []
s_item = []
for feature in features_list:
    encoded_columns.append(le.fit_transform(df_item[feature].values))
    s_item.append(len(le.classes_))

# Transpose so that each row is one item and each column one encoded feature.
items_mat = np.array(encoded_columns).T
print(items_mat.shape)

In [None]:
# Map each item id to its row of label-encoded features.
# Built with zip so no loop variable named `id` is left behind to shadow the builtin.
items = dict(zip(df_item['id'].values, items_mat))

## ratings 

In [None]:
# Peek at the first three lines of the raw ratings file.
! head -3 ./ml-100k/u.data

In [None]:
# Load the tab-separated ratings; column names come from splitting the header string.
rating_columns = 'user id | item id | rating | timestamp'.split(' | ')
df_data = pd.read_csv('./ml-100k/u.data', sep='\t', names=rating_columns)

In [None]:
# Display the ratings table (user id, item id, rating, timestamp).
df_data

In [None]:
# Binary target: a rating counts as positive only when it is above 4.5.
is_top_rating = df_data['rating'] > 4.5
df_data['target'] = is_top_rating

# `data` holds the (user id, item id) pair of every rating, `target` its boolean label.
data = df_data[['user id', 'item id']].to_numpy()
target = df_data['target'].values

positive_share = np.mean(target == True)
print('Mean target: {}'.format(positive_share))

In [None]:
# First (user id, item id) pair of the ratings array.
data[0]

In [None]:
# Row positions of the positive and of the negative ratings.
positive_idx = np.flatnonzero(target == True)
negative_idx = np.flatnonzero(target != True)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out half of the positives for testing, then draw exactly as many training
# negatives as there are training positives; the remaining negatives go to the test side.
pos_idx_tr, pos_idx_te = train_test_split(positive_idx, test_size=0.5, random_state=42)
neg_idx_tr, neg_idx_te = train_test_split(
    negative_idx,
    train_size=len(pos_idx_tr),
    random_state=42,
)

In [None]:
# Row positions of the positive ratings assigned to the training split.
pos_idx_tr

In [None]:
def build_matrix(pos_idx, neg_idx, random_state=None):
    """Assemble a shuffled feature matrix and label vector for the given samples.

    Reads the notebook-level `data` array (one (user id, item id) pair per
    rating) and the `users` / `items` dicts (id -> label-encoded feature row).

    Parameters
    ----------
    pos_idx : sequence of int
        Row positions in `data` of the positive samples (label 1).
    neg_idx : sequence of int
        Row positions in `data` of the negative samples (label 0).
    random_state : int or None, optional
        Seed for the final shuffle. With the default None the global NumPy
        random state is used, exactly as before.

    Returns
    -------
    X : ndarray
        One row per sample: user features, then (user id, item id), then item
        features. Every column is 1-based.
    Y : ndarray
        1.0 for positive samples and 0.0 for negative ones, in the same order as X.
    """
    sample_idx = list(pos_idx) + list(neg_idx)

    rows_user = []
    rows_item = []
    rows_pair = []
    for idx in sample_idx:
        u, i = data[idx]
        # encoded features are 0-based; shift them so every value is 1-based
        rows_user.append(users[u] + 1)
        rows_item.append(items[i] + 1)
        # u and i are already 1-based
        rows_pair.append(data[idx])

    # np.hstack needs a real sequence: a lazy `map` object is rejected by recent NumPy.
    X = np.hstack([np.array(rows_user), np.array(rows_pair), np.array(rows_item)])

    Y = np.zeros(len(sample_idx))
    Y[:len(pos_idx)] = 1

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    perm = rng.permutation(X.shape[0])
    return X[perm], Y[perm]

In [None]:
# Hard-coded cardinalities of the user id and item id columns.
n_users, n_items = 943, 1682

# Shuffled train / test matrices built from the index splits.
X_tr, Y_tr = build_matrix(pos_idx_tr, neg_idx_tr)
X_te, Y_te = build_matrix(pos_idx_te, neg_idx_te)

# sizes of categorical features, in the column order produced by build_matrix:
# user features, user id, item id, item features
s_features = [*s_users, n_users, n_items, *s_item]

# Apply CP-decomposition prediction model

In [None]:
#### The dataset was obtained after the same preprocessing of the original MovieLens 100k dataset performed in Exponential Machines (Novikov et al. 2016)
#### to allow for direct comparison.
#### Code adapted from Exponential Machines (2016) Novikov et al.
#### url:https://github.com/Bihaqo/exp-machines
import matplotlib.pyplot as plt 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from CP_Machine import CP_Machine,simple_batcher
import time

In [None]:
# Feature sizes must describe the matrices that were actually built from the data:
# they fix the size of every model core and the offsets into the logistic-regression
# coefficients. The sizes that used to be hard-coded here are kept only as a sanity
# check; they no longer silently overwrite the derived values.
reference_s_features = [7,2,21,19,943,1682,10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
s_features = [int(size) for size in s_features]
if s_features != reference_s_features:
    warnings.warn(
        's_features derived from the data {} differ from the reference sizes {}; '
        'using the derived sizes'.format(s_features, reference_s_features)
    )
num_features = len(s_features)

In [None]:
# Lower-case aliases used by the modelling cells below.
x_train, y_train = X_tr, Y_tr
x_test, y_test = X_te, Y_te

In [None]:
# (number of training samples, number of feature columns)
x_train.shape

In [None]:
# One label per training sample.
y_train.shape

In [None]:
##### Logistic Regression #####
# The matrices are 1-based, so shift them back by one before one-hot encoding.
# The encoder is fitted on train and test stacked together so that both splits
# share a single column layout.
oh = OneHotEncoder()
oh.fit(np.vstack((x_train, x_test)) - 1)
X_tr_sp, X_te_sp = (oh.transform(split - 1) for split in (x_train, x_test))

logreg = LogisticRegression()
logreg.fit(X_tr_sp, y_train)

# Score the test split with the predicted probability of the positive class.
y_pred = logreg.predict_proba(X_te_sp)[:, 1]
print(roc_auc_score(y_test, y_pred))

In [None]:
##### CP_Machine #####

cp_rank = 30
num_features = len(s_features)
# One core per feature, filled in by the initialisation loop.
w_cores = [None for _ in range(num_features)]
#### where each feature begins in the big d vector
begin_feature = [0, *np.cumsum(s_features)]

# Weights and bias of the fitted logistic regression, used to initialise the cores.
coef, intercept = logreg.coef_[0], logreg.intercept_[0]

In [None]:
# Initialise one (local_dim + 1) x cp_rank core per feature from the logistic regression.
for i in range(num_features):
    local_dim = s_features[i]
    offset = begin_feature[i]

    tmp = np.zeros((local_dim + 1, cp_rank))
    # Row 0: ones in the first num_features columns, except column i,
    # which carries an equal share of the intercept.
    tmp[0, :num_features] = 1
    tmp[0, i] = intercept / num_features
    # Rows 1..local_dim of column i take this feature's slice of the coefficients.
    tmp[1:local_dim + 1, i] = coef[offset:offset + local_dim]

    w_cores[i] = tmp.astype(np.float32)

In [None]:
# Per-epoch statistics, appended to by the training loop.
epoch_hist = []

#### init_std=0.001, rank=30, reg=0.00005, expreg=3.6,  auc=0.7863

model = CP_Machine(
    rank=cp_rank,
    s_features=s_features,
    init_std=0.001,
    reg=0.00005,
    exp_reg=3.6,
)
# Start from the logistic-regression-based cores, then build the graph and open the session.
model.init_from_cores(w_cores)
model.build_graph()
model.initialize_session()

In [None]:
# Train the CP machine for 20 epochs, evaluating test AUC after every epoch.
# `start` / `end` bracket the whole loop (training plus evaluation).
start=time.time()
for epoch in range(20):
    # train phase (256 is presumably the mini-batch size - confirm against simple_batcher)
    loss_hist = []
    penalty_hist = []
    for x, y in simple_batcher(x_train, y_train, 256):
        # labels are remapped from {0, 1} to {-1, +1} before being fed to the model
        fd = {model.X: x, model.Y: 2*y-1}
        run_ops = [model.trainer, model.outputs, model.loss, model.penalty, model.G]

        _, outs, batch_loss, penalty,weights = model.session.run(run_ops, fd)

        loss_hist.append(batch_loss)
        penalty_hist.append(penalty)
    epoch_train_loss = np.mean(loss_hist)
    epoch_train_pen = np.mean(penalty_hist)

    epoch_stats = {'train_MSE': float(epoch_train_loss)}

    # test phase
    fd = {model.X: x_test, model.Y: 2*y_test-1}
    run_ops = [model.outputs, model.loss, model.penalty, model.penalized_loss]

    outs, raw_loss, raw_penalty, loss = model.session.run(run_ops, fd)
    epoch_test_loss = roc_auc_score(y_test, outs)
    # Store a plain float: a 1-tuple here would become a non-numeric column
    # when epoch_hist is turned into a DataFrame for plotting.
    epoch_stats['test_auc'] = float(epoch_test_loss)
    print('{}: te_auc: {:.4f}'.format(epoch, epoch_test_loss))

    epoch_hist.append(epoch_stats)
end=time.time()

In [None]:
# Plot the per-epoch statistics collected during training, then report the wall-clock time.
history_frame = pd.DataFrame(epoch_hist)
history_frame.plot(figsize=(8, 5))
plt.grid(True)
plt.show()

elapsed = end - start
print('Training Time:{}'.format(elapsed))

# Apply Exponential Machines model

In [None]:
from TFExpMachine import TFExpMachine, simple_batcher

## Estimate the W tensor cores

In [None]:
# Build the initial tensor-train cores from the fitted logistic regression.
# Core shapes are (n_factors + 1, left_rank, right_rank): the first core has
# left_rank 1, the last core has right_rank 1, all others are rank x rank.
# Only the leading 2x2 (or 1x2 / 2x1) corner of each slice is filled in.
target_rank = 10

num_features = len(s_features)
w_cores = [None] * num_features

coef = logreg.coef_[0]
intercept = logreg.intercept_[0]

# see paper for details about initialization
begin_feature = [0] + list(np.cumsum(s_features))
for i in range(num_features):
    n_factors = s_features[i]
    # Positions of this feature's values inside `coef`. The slice must end at the
    # start of the NEXT feature (begin_feature[i + 1]); ending it at s_features[i]
    # leaves the range empty or truncated for every feature after the first.
    # The leading -1 is a placeholder for local index 0, which never reads `coef`.
    global_positions = [-1] + list(range(begin_feature[i], begin_feature[i + 1]))

    if i == 0:
        tmp = np.zeros((n_factors+1, 1, target_rank))
        for local_j, global_j in enumerate(global_positions):
            if local_j==0:
                tmp[local_j,:1,:2] = [1, 0]
            else:
                tmp[local_j,:1,:2] = [0, coef[global_j]]
        w_cores[i] = tmp.astype(np.float32)

    elif i == num_features-1:
        tmp = np.zeros((n_factors+1, target_rank, 1))
        for local_j, global_j in enumerate(global_positions):
            if local_j==0:
                tmp[local_j,:2,:1] = np.array([[intercept], [1]])
            else:
                tmp[local_j,:2,:1] = [[coef[global_j]], [0]]
        w_cores[i] = tmp.astype(np.float32)

    else:
        tmp = np.zeros((n_factors+1, target_rank, target_rank))
        for local_j, global_j in enumerate(global_positions):
            if local_j==0:
                tmp[local_j,:2,:2] = np.eye(2)
            else:
                tmp[local_j,:2,:2] = [[0, coef[global_j]], [0,0]]
        w_cores[i] = tmp.astype(np.float32)

## initialize model

In [None]:
# Dispose of the previous model first (presumably releases its graph/session -
# confirm against the model class), then build the Exponential Machine.
model.destroy()
model = TFExpMachine(
    rank=target_rank,
    s_features=s_features,
    init_std=0.001,
    reg=0.012,
    exp_reg=1.8,
)
# Start from the tensor-train cores estimated above.
model.init_from_cores(w_cores)
model.build_graph()
model.initialize_session()

## Learning  

In [None]:
# Train the Exponential Machine for 50 epochs; test AUC is evaluated on every
# second epoch (skipping epoch 0) and recorded together with the penalty.
epoch_hist = []
for epoch in range(50):
    # train phase
    loss_hist = []
    penalty_hist = []
    for x, y in simple_batcher(x_train, y_train, 256):
        # labels are remapped from {0, 1} to {-1, +1} before being fed to the model
        fd = {model.X: x, model.Y: 2*y-1}
        run_ops = [model.trainer, model.outputs, model.loss, model.penalty]
        _, outs, batch_loss, penalty = model.session.run(run_ops, fd)
        loss_hist.append(batch_loss)
        penalty_hist.append(penalty)
    epoch_train_loss = np.mean(loss_hist)
    epoch_train_pen = np.mean(penalty_hist)

    epoch_stats = {
        'epoch': epoch,
        'train_logloss': float(epoch_train_loss)
    }

    # test phase
    if epoch%2==0 and epoch>0:
        fd = {model.X: x_test, model.Y: 2*y_test-1}
        run_ops = [model.outputs, model.loss, model.penalty, model.penalized_loss]
        outs, raw_loss, raw_penalty, loss = model.session.run(run_ops, fd)
        epoch_test_loss = roc_auc_score(y_test, outs)
        # Store a plain float rather than a 1-tuple so the history stays numeric.
        epoch_stats['test_auc'] = float(epoch_test_loss)
        epoch_stats['penalty'] = float(raw_penalty)
        print('{}: te_auc: {:.4f}'.format(epoch, epoch_test_loss))
    epoch_hist.append(epoch_stats)

In [None]:
# Plot test AUC against epoch for the epochs on which it was evaluated.
# Uses the explicit matplotlib API instead of re-running `%pylab inline`,
# which star-imports numpy/pylab names into the notebook namespace.
evaluated = [stats for stats in epoch_hist if 'test_auc' in stats]

fig, ax = plt.subplots()
ax.plot(
    [stats['epoch'] for stats in evaluated],
    [stats['test_auc'] for stats in evaluated],
)
ax.grid(True)
ax.set_ylim(0.775, 0.785)
ax.set_xlabel('epoch')
ax.set_ylabel('test auc')
plt.show()