# Purpose:
This notebook contains the code that is used to generate weights for a "Status predictor" classifier neural net. 
It treats the status "Completed" as the positive target (1), and all other statuses ("Dropped", "On Hold", "Watching", "Backlog") as 0. 



In [3]:
#!pip install keras # Because this has been run on several different docker images, need to make sure keras is installed. 



In [72]:
!nvidia-smi # Need to check to confirm that the GPU memory is not in use from another process. 

Thu Sep  7 18:12:55 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.69                 Driver Version: 384.69                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:04:00.0 Off |                  N/A |
| 20%   37C    P8    18W / 250W |  10691MiB / 11170MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage    

In [73]:
import numpy as np
from keras import backend as K
from keras.layers import Input, Embedding, merge
import keras.layers
from keras.regularizers import l2, l1
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.models import Sequential, Model
from keras.optimizers import SGD, RMSprop, Adam
from IPython.display import SVG
#from keras.utils.vis_utils import model_to_dot
#import pydotplus as pydot 
#import graphviz
#from keras.utils import plot_model


In [4]:
import pandas as pd

In [5]:
# Load the four rating exports: the "score" frames come from the *_nonzero_v2
# files and the "no_score" frames from the *_zero files, each with a train and
# a test split.
# The v2 version of the scored data has no "test" users who aren't present in
# the training data.
ratings_score_train, ratings_score_test = (
    pd.read_csv(csv_path)
    for csv_path in ('mal_scores_train_nonzero_v2.csv', 'mal_scores_test_nonzero_v2.csv')
)
ratings_no_score_train, ratings_no_score_test = (
    pd.read_csv(csv_path)
    for csv_path in ('mal_scores_train_zero.csv', 'mal_scores_test_zero.csv')
)

In [7]:
# (rows, columns) of the scored training frame.
ratings_score_train.shape

(10914898, 18)

In [9]:
# (rows, columns) of the scored test frame.
ratings_score_test.shape

(1210873, 18)

In [10]:
# (rows, columns) of the unscored test frame.
ratings_no_score_test.shape

(1071354, 11)

In [11]:
# (rows, columns) of the unscored training frame.
ratings_no_score_train.shape

(9632967, 15)

In [None]:
# Eyeball the first rows to check that the data loaded correctly.
ratings_score_train.head()

In [None]:
# Keep only the three columns the status model uses from each training source.
scores = ratings_score_train.loc[:, ['animeid', 'userid', 'status']]
noscore = ratings_no_score_train.loc[:, ['animeid', 'userid', 'status']]

In [None]:
# Only anime, user and status matter here, so stack the rows taken from the two
# training sources into a single frame with a freshly numbered index.
concatlist = [scores, noscore]
data_train = pd.concat(objs=concatlist, ignore_index=True)

In [None]:
# Build the test set the classifier is judged against: same three columns,
# scored and unscored rows stacked with a freshly numbered index.
test_scores = ratings_score_test.loc[:, ['animeid', 'userid', 'status']]
test_noscores = ratings_no_score_test.loc[:, ['animeid', 'userid', 'status']]
concatlist2 = [test_scores, test_noscores]
data_test = pd.concat(objs=concatlist2, ignore_index=True)

In [None]:
# Binary label: 1 where the status is "COMPLETED", 0 for everything else.
data_train['target'] = data_train['status'].eq("COMPLETED").astype(int)

In [None]:
# Same binary label for the test set: 1 for "COMPLETED", 0 otherwise.
data_test['target'] = data_test['status'].eq("COMPLETED").astype(int)

Next, we map the raw user and anime ids to embedding indices and prepare the data for modelling.


In [None]:
# Distinct user ids found in each source frame.
# NOTE(review): ratings_no_score_test is not consulted here (nor for anime ids
# below) -- confirm that is intentional.
users1 = ratings_score_train['userid'].unique()
users2 = ratings_score_test['userid'].unique()
users3 = ratings_no_score_train['userid'].unique()

# Distinct anime ids found in the same three frames.
anime1 = ratings_score_train['animeid'].unique()
anime2 = ratings_score_test['animeid'].unique()
anime3 = ratings_no_score_train['animeid'].unique()

# Status values are read from a single frame only: per the original author
# there are only 6 statuses and a status cannot be missing.
status = ratings_score_train['status'].unique()
n_status = ratings_score_train['status'].nunique()

In [None]:
# Union of every user id / anime id collected from the three frames above.
users = set(users1) | set(users2) | set(users3)
animes = set(anime1) | set(anime2) | set(anime3)


In [None]:
# From prior parts of the model, the user and anime embedding ids are stored in
# these dictionaries, so that an anime keeps the same embedding index when the
# weights are loaded into the recommendation engine.
#
# The .npy files hold a pickled dict wrapped in a 0-d object array (hence
# .item()). NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is
# passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself.
userid2idx = np.load("user.npy", allow_pickle=True).item()
animeid2idx = np.load("anime.npy", allow_pickle=True).item()

In [None]:
# Number of entries in each id dictionary; used to size the model's embedding
# tables later on.
n_users, n_animes = len(userid2idx), len(animeid2idx)

In [None]:
# Translate raw anime ids into embedding indices via the saved dictionary.
# An id that is missing from the dictionary raises KeyError.
data_test['anime_id_emb'] = data_test['animeid'].apply(animeid2idx.__getitem__)
data_train['anime_id_emb'] = data_train['animeid'].apply(animeid2idx.__getitem__)


In [None]:
# Translate raw user ids into embedding indices via the saved dictionary.
# An id that is missing from the dictionary raises KeyError.
data_test['user_id_emb'] = data_test['userid'].apply(userid2idx.__getitem__)
data_train['user_id_emb'] = data_train['userid'].apply(userid2idx.__getitem__)


In [None]:
# Number of hidden (latent) factors each user and each anime is embedded into.
n_factors = 36

In [None]:
# Final look at the prepared training frame before modelling.
data_train.head()

In [None]:
# Class balance of the target. The share of the most frequent class is the
# baseline accuracy the model has to beat.
data_train['target'].value_counts()

In [None]:
# (rows, columns) of the combined training frame.
data_train.shape

In [None]:
def embedding_input_anime1(name, n_in, n_out, reg):
    """Create the anime id input and its embedding.

    Args:
        name: Name given to the Keras ``Input`` layer.
        n_in: Number of rows in the embedding table.
        n_out: Width of each embedding vector.
        reg: L2 regularisation factor applied to the embedding weights.

    Returns:
        Tuple ``(inp, emb)``: the input tensor and the output of the embedding
        layer named 'Embed_Anime_Hidden_Factors' applied to it.
    """
    inp = Input(shape=(1,), dtype='int64', name=name)
    # Keras 2 spells this keyword ``embeddings_regularizer``; ``W_regularizer``
    # is the deprecated Keras 1 name.
    emb = Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg),
                    name='Embed_Anime_Hidden_Factors')(inp)
    return inp, emb
def embedding_input_user1(name, n_in, n_out, reg):
    """Create the user id input and its embedding.

    Args:
        name: Name given to the Keras ``Input`` layer.
        n_in: Number of rows in the embedding table.
        n_out: Width of each embedding vector.
        reg: L2 regularisation factor applied to the embedding weights.

    Returns:
        Tuple ``(inp, emb)``: the input tensor and the output of the embedding
        layer named 'Embed_User_Hidden_Factors' applied to it.
    """
    inp = Input(shape=(1,), dtype='int64', name=name)
    # Keras 2 spells this keyword ``embeddings_regularizer``; ``W_regularizer``
    # is the deprecated Keras 1 name.
    emb = Embedding(n_in, n_out, input_length=1, embeddings_regularizer=l2(reg),
                    name='Embed_User_Hidden_Factors')(inp)
    return inp, emb

# The extra 15 user rows are headroom for unused embeddings in the
# recommendation stage.
user_in1, u1 = embedding_input_user1(
    name='user_id_in', n_in=n_users + 15, n_out=n_factors, reg=1e-6)
anime_in1, a1 = embedding_input_anime1(
    name='anime_id_in', n_in=n_animes, n_out=n_factors, reg=1e-6)

In [None]:
# nn1: status-prediction network.
# Join the user and anime embeddings along the last axis.
# keras.layers.concatenate is the Keras 2 functional replacement for the legacy
# merge(..., mode='concat') call.
x = keras.layers.concatenate([u1, a1], name='All_Factors_on_one_layer')
x = Flatten()(x)
# A Dropout(0.55) placed directly after the flatten was tried and made the
# model worse, so it is left out.
x = Dense(70, activation='relu', name='Random_HF_Interactions')(x)
x = Dropout(0.35, name='Prevent_overfit2')(x)
x = Dense(16, activation='relu', name='Random_HF_Interactions2')(x)
x = Dropout(0.1, name='Prevent_overfit')(x)
# Single sigmoid unit, trained with binary cross-entropy against the 0/1 target.
x = Dense(1, name='Final_Interactions', activation='sigmoid')(x)

nn1 = Model([user_in1, anime_in1], x)
nn1.compile(Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# First training pass: 5 epochs, evaluated against the test frame after each
# epoch.
nn1.fit(
    x=[data_train.user_id_emb, data_train.anime_id_emb],
    y=data_train.target,
    batch_size=5120,
    epochs=5,
    validation_data=([data_test.user_id_emb, data_test.anime_id_emb], data_test.target),
)

In [None]:
 # Halve the learning rate for the following epochs.
 # Assigning `nn1.lr` would only create an unused attribute on the Model; the
 # rate Adam actually uses is the backend variable `nn1.optimizer.lr`, which
 # has to be updated through the backend.
 K.set_value(nn1.optimizer.lr, 0.0005)

In [None]:
# Continue training the same model for 9 more epochs at the same batch size.
nn1.fit(
    x=[data_train.user_id_emb, data_train.anime_id_emb],
    y=data_train.target,
    batch_size=5120,
    epochs=9,
    validation_data=([data_test.user_id_emb, data_test.anime_id_emb], data_test.target),
)

In [None]:
# Final 3 epochs with a larger batch.
# NOTE(review): 10120 is not a multiple of the 5120 used earlier -- confirm the
# value is intentional (10240?).
nn1.fit(
    x=[data_train.user_id_emb, data_train.anime_id_emb],
    y=data_train.target,
    batch_size=10120,
    epochs=3,
    validation_data=([data_test.user_id_emb, data_test.anime_id_emb], data_test.target),
)

In [None]:
# Persist the trained weights; with this save the model is complete and the
# weights are ready for the recommender.
# NOTE(review): the file name says "score" although this notebook trains the
# status predictor -- confirm the recommender expects this name.
nn1.save_weights('nn_score_weights_pred.h5')