As part of a slide deck presenting my recommendation system, I presented a "Hipster" variation of my model. 
This notebook exists as documentation of that variation, in case reproducibility is/was required. 


In [84]:
!nvidia-smi

Mon Sep  4 18:02:20 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.69                 Driver Version: 384.69                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:04:00.0 Off |                  N/A |
| 20%   46C    P8    19W / 250W |  10691MiB / 11170MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage    

In [85]:
import numpy as np
from keras import backend as K
from keras.layers import Input, Embedding, merge
import keras.layers
from keras.regularizers import l2, l1
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.models import Sequential, Model
from keras.optimizers import SGD, RMSprop, Adam
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
#import pydotplus as pydot 
#import graphviz
from keras.utils import plot_model
from sklearn.metrics import mean_absolute_error

In [86]:
import pandas as pd

In [87]:
ratings_score_train = pd.read_csv('mal_scores_train_nonzero_v2.csv')# 
ratings_score_test = pd.read_csv('mal_scores_test_nonzero_v2.csv')# the v2 version of this data has no "test" users who aren't present in the training data. 
ratings_no_score_train = pd.read_csv('mal_scores_train_zero.csv') 

Now we need to re-order and dataprep.


In [88]:
ratings_score_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,animeid,animescore,index,name,status,unkey,userid,score,useravg,animeavg,score_anime_scaled,score_usr_scaled,user_rev_count,anime_rev_count
0,0,0,13202,13202,2001,10,Lancerevomr,Lancerevomr,COMPLETED,2001Lancerevomr,Lancerevomr,10,10.0,8.681996,1.318004,0.0,1,36141
1,1,1,21990,21990,10719,10,kuelpanda,kuelpanda,COMPLETED,10719kuelpanda,kuelpanda,10,9.5,7.535472,2.464528,0.5,2,19734
2,2,2,21991,21991,10155,9,kuelpanda,kuelpanda,COMPLETED,10155kuelpanda,kuelpanda,9,9.5,7.105895,1.894105,-0.5,2,4495
3,3,3,22409,22409,15051,10,vxxyman,vxxyman,COMPLETED,15051vxxyman,vxxyman,10,10.0,7.654129,2.345871,0.0,1,8694
4,4,4,23792,23792,1,10,pinheiro,pinheiro,COMPLETED,1pinheiro,pinheiro,10,10.0,8.804023,1.195977,0.0,1,29136


In [89]:
#userid mapping
users1 = ratings_score_train.userid.unique()
users2 = ratings_score_test.userid.unique()
users3 = ratings_no_score_train.userid.unique()
#animeid mapping
anime1= ratings_score_train.animeid.unique()
anime2=ratings_score_test.animeid.unique()
anime3=ratings_no_score_train.animeid.unique()
#statusid mapping
status = ratings_score_train.status.unique() #only need to do this on one of the dataframes because there are only 6 status, and a status cannot be missing. 
n_status = ratings_score_train.status.nunique()

In [90]:
users = set(users1).union(set(users2)).union(set(users3))

In [91]:
userid2idx=np.load("user.npy").item()
animeid2idx=np.load("anime.npy").item()
#userid2idx = {o:i for i,o in enumerate(users)} # turn the user strings into a numerical value. 

In [92]:
len(userid2idx)

230962

In [93]:
animes = set(anime1).union(set(anime2)).union(set(anime3))

In [94]:
#animeid2idx = {o:i for i,o in enumerate(animes)} # remove missing anime numbers and re-order

In [95]:
np.save("user.npy", userid2idx)
np.save("anime.npy", animeid2idx)


In [96]:
status2idx = {o:i for i,o in enumerate(status)} # turn the watched status into a vector 

In [97]:
n_users = len(users)
n_animes = len(animes)

In [98]:
ratings_score_test['anime_id_emb'] = ratings_score_test.animeid.apply(lambda x: animeid2idx[x])
ratings_score_train['anime_id_emb'] = ratings_score_train.animeid.apply(lambda x: animeid2idx[x])
ratings_no_score_train['anime_id_emb'] = ratings_no_score_train.animeid.apply(lambda x: animeid2idx[x])

In [99]:
ratings_score_test['user_id_emb'] = ratings_score_test.userid.apply(lambda x: userid2idx[x])
ratings_score_train['user_id_emb'] = ratings_score_train.userid.apply(lambda x: userid2idx[x])
ratings_no_score_train['user_id_emb'] = ratings_no_score_train.userid.apply(lambda x: userid2idx[x])

In [100]:
ratings_score_test['status_emb'] = ratings_score_test.status.apply(lambda x: status2idx[x])
ratings_score_train['status_emb'] = ratings_score_train.status.apply(lambda x: status2idx[x])
ratings_no_score_train['status_emb'] = ratings_no_score_train.status.apply(lambda x: status2idx[x])

In [101]:
n_factors = 28 #changing this number changes how many hidden factors each user and each anime is transformed into. 

In [102]:
ratings_score_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,animeid,animescore,index,name,status,unkey,...,score,useravg,animeavg,score_anime_scaled,score_usr_scaled,user_rev_count,anime_rev_count,anime_id_emb,user_id_emb,status_emb
0,0,0,13202,13202,2001,10,Lancerevomr,Lancerevomr,COMPLETED,2001Lancerevomr,...,10,10.0,8.681996,1.318004,0.0,1,36141,1822,220189,0
1,1,1,21990,21990,10719,10,kuelpanda,kuelpanda,COMPLETED,10719kuelpanda,...,10,9.5,7.535472,2.464528,0.5,2,19734,6344,53804,0
2,2,2,21991,21991,10155,9,kuelpanda,kuelpanda,COMPLETED,10155kuelpanda,...,9,9.5,7.105895,1.894105,-0.5,2,4495,6092,53804,0
3,3,3,22409,22409,15051,10,vxxyman,vxxyman,COMPLETED,15051vxxyman,...,10,10.0,7.654129,2.345871,0.0,1,8694,7213,96692,0
4,4,4,23792,23792,1,10,pinheiro,pinheiro,COMPLETED,1pinheiro,...,10,10.0,8.804023,1.195977,0.0,1,29136,0,218194,0


In [103]:
ratings_score_train=ratings_score_train[(ratings_score_train['user_rev_count']<1800) & (ratings_score_train['user_rev_count']>30)]

In [104]:
ratings_score_train=ratings_score_train[ratings_score_train['score_usr_scaled']!=0]

In [105]:
ratings_score_train = ratings_score_train[ratings_score_train['anime_rev_count']<8000]

In [106]:
# these are the embedding functions that will be used for all models. For code modularity (the ability to cut paste a model to another notebook to test several at the same time), we're going to redfine this each time. 


In [107]:
def embedding_input_anime1(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_Anime_Hidden_Factors')(inp)
def embedding_input_user1(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_User_Hidden_Factors')(inp)

user_in1, u1 = embedding_input_user1('user_id_in', n_users+15, n_factors, 1e-7)
anime_in1, a1 = embedding_input_anime1('anime_id_in', n_animes, n_factors, 1e-8)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [108]:
# nn1, It is given no modifications for status. It is predicting score
x = merge([u1, a1], mode='concat', name='All_Factors_on_one_layer')
x = Flatten()(x)

x = Dense(70, activation='relu',name='Random_HF_Interactions')(x)
x = Dropout(0.55,name='Prevent_overfit2')(x)
x = Dense(16, activation='relu',name='Random_HF_Interactions2')(x)
x = Dropout(0.1, name='Prevent_overfit')(x)
x = Dense(1,name='Final_Interactions')(x)
nn1 = Model([user_in1, anime_in1], x)
nn1.compile(Adam(0.001), loss='mse')

  
  name=name)


In [109]:
nn1.fit([ratings_score_train.user_id_emb, ratings_score_train.anime_id_emb], ratings_score_train.score, batch_size=5120, epochs=5, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5493e80f28>

In [110]:
nn1.fit([ratings_score_train.user_id_emb, ratings_score_train.anime_id_emb], ratings_score_train.score, batch_size=15120, epochs=8, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f54949685c0>

In [111]:
nn1pred= nn1.predict([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb])
nn1targ = ratings_score_test.score.values
mean_absolute_error(nn1targ, nn1pred)

1.1747645709476917

In [112]:
nn1.save_weights('nn_score_weights_hip.h5')

In [113]:
#plot_model(nn1, to_file='score.png')
#plot_model(nn12, to_file='user_score.png')
#plot_model(nn13, to_file='anime_score.png')

The NN2 Branch.
This drop all scores for shows that are not  listed as complete.


In [114]:
nn1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_id_in (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
anime_id_in (InputLayer)         (None, 1)             0                                            
____________________________________________________________________________________________________
Embed_User_Hidden_Factors (Embed (None, 1, 28)         6433588     user_id_in[0][0]                 
____________________________________________________________________________________________________
Embed_Anime_Hidden_Factors (Embe (None, 1, 28)         359128      anime_id_in[0][0]                
___________________________________________________________________________________________

In [115]:
ratings_score_train_complete= ratings_score_train

In [116]:
def embedding_input_anime22(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_Anime_Hidden_Factors')(inp)
def embedding_input_user22(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_User_Hidden_Factors')(inp)

user_in22, u22 = embedding_input_user22('user_id_in', n_users+15, n_factors, 1e-7)
anime_in22, a22 = embedding_input_anime22('anime_id_in', n_animes, n_factors, 1e-8)
# nn22, It is only given complete. Trying to predict usr scaled score. 
x = merge([u22, a22], mode='concat', name='All_Factors_on_one_layer')
x = Flatten()(x)
x = Dense(70, activation='relu',name='Random_HF_Interactions')(x)
x = Dropout(0.55,name='Prevent_overfit2')(x)
x = Dense(16, activation='relu',name='Random_HF_Interactions2')(x)
x = Dropout(0.1, name='Prevent_overfit')(x)
x = Dense(1,name='Final_Interactions')(x)
nn22 = Model([user_in22, anime_in22], x)
nn22.compile(Adam(0.001), loss='mse')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  # This is added back by InteractiveShellApp.init_path()
  name=name)


In [117]:
nn22.fit([ratings_score_train_complete.user_id_emb, ratings_score_train_complete.anime_id_emb], ratings_score_train_complete.score_usr_scaled, batch_size=5120, epochs=5, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score_usr_scaled))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5490be3be0>

In [118]:
nn22.fit([ratings_score_train_complete.user_id_emb, ratings_score_train_complete.anime_id_emb], ratings_score_train_complete.score_usr_scaled, batch_size=15120, epochs=8, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score_usr_scaled))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f547ddc6fd0>

In [119]:
nn22.save_weights('nn_score_usr_weights_hip.h5')

In [120]:
nn22pred= nn22.predict([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb])
nn22targ = ratings_score_test.score_usr_scaled.values
mean_absolute_error(nn22targ, nn22pred)

1.1105441995939027

In [121]:
def embedding_input_anime23(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_Anime_Hidden_Factors')(inp)
def embedding_input_user23(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg),name='Embed_User_Hidden_Factors')(inp)

user_in23, u23 = embedding_input_user23('user_id_in', n_users+15, n_factors, 1e-7)
anime_in23, a23 = embedding_input_anime23('anime_id_in', n_animes, n_factors, 1e-8)
# nn23, It is only given complete. Trying to predict anime scaled score. 
x = merge([u23, a23], mode='concat', name='All_Factors_on_one_layer')
x = Flatten()(x)
x = Dense(70, activation='relu',name='Random_HF_Interactions')(x)
x = Dropout(0.55,name='Prevent_overfit2')(x)
x = Dense(16, activation='relu',name='Random_HF_Interactions2')(x)
x = Dropout(0.1, name='Prevent_overfit')(x)
x = Dense(1,name='Final_Interactions')(x)
nn23 = Model([user_in23, anime_in23], x)
nn23.compile(Adam(0.001), loss='mse')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  # This is added back by InteractiveShellApp.init_path()
  name=name)


In [122]:
nn23.fit([ratings_score_train_complete.user_id_emb, ratings_score_train_complete.anime_id_emb], ratings_score_train_complete.score_anime_scaled, batch_size=5120, epochs=5, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score_anime_scaled))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f54769b4dd8>

In [123]:
nn23.fit([ratings_score_train_complete.user_id_emb, ratings_score_train_complete.anime_id_emb], ratings_score_train_complete.score_anime_scaled, batch_size=15120, epochs=8, 
          validation_data=([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb], ratings_score_test.score_anime_scaled))

Train on 5059414 samples, validate on 1210873 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f547b140a90>

In [124]:
nn23.save_weights('nn_score_anime_weights_hip.h5')

In [125]:
nn23pred= nn23.predict([ratings_score_test.user_id_emb, ratings_score_test.anime_id_emb])
nn23targ = ratings_score_test.score_anime_scaled.values
mean_absolute_error(nn23targ, nn23pred)

1.0111344778400806