# IMDB

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

import keras

from keras import backend as K
from keras import regularizers
from keras.utils import np_utils
from keras.models import Model, Sequential
from keras.optimizers import SGD, Adam
from keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization, Masking, Activation

import matplotlib.pyplot as plt

In [None]:
def custom_loss(y_true, y_pred):
    """Squared-error loss that down-weights entries whose target is near zero.

    Each target is turned into a weight by clipping it to [0, 0.01] and
    scaling by 100: targets <= 0 get weight 0, targets >= 0.01 get weight 1,
    and values in between are weighted linearly. Presumably a target of 0
    marks an unrated movie, so those entries do not contribute to the error
    (verify against the data preparation).

    The mean is taken over the whole last axis, so zero-weight entries still
    count in the denominator.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced to the mean weighted squared error.
    """
    rated_weight = K.clip(y_true, 0, 0.01) * 100
    weighted_error = rated_weight * (y_pred - y_true)
    return K.mean(K.square(weighted_error), axis=-1)


## Load the data

In [None]:
# Load the prepared train/test arrays and the per-user list of rated movies.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code; only use it on files from a trusted source.
X_train, y_train, X_test, y_test, ratedlist = (
    np.load(path, allow_pickle=True)
    for path in (
        "../data/X_train.npy",
        "../data/y_train.npy",
        "../data/X_test.npy",
        "../data/y_test.npy",
        "../data/ratedlist.npy",
    )
)

In [None]:
# Dense autoencoder: one input unit per column of X_train, one output unit
# per column of y_train.
input_img = Input(shape=(X_train.shape[1],))  # All movies

# NOTE(review): Masking is applied to a flat (non-sequence) input here;
# confirm it has the intended effect on zero entries.
encoded = Masking(mask_value=0)(input_img)

# Encoder: 1000 -> 500 -> 120 units.
for units in (1000, 500, 120):
    encoded = Dense(units, activation='relu')(encoded)

# Decoder: mirrors the encoder back up to the output width.
decoded = encoded
for units in (500, 1000):
    decoded = Dense(units, activation='relu')(decoded)

# Sigmoid keeps every output in (0, 1).
decoded = Dense(y_train.shape[1], activation='sigmoid')(decoded)  # All movies

autoencoder = Model(input_img, decoded)
autoencoder.summary()

In [None]:
# Train with Adam against the weighted squared-error loss defined above.
autoencoder.compile(optimizer='adam', loss=custom_loss)

In [None]:
# Row index at 80% of the training set; only referenced by the
# commented-out fit() variant further down.
split = int(len(X_train) * 0.8)

In [None]:
epochs = 15
batchsize = 128

# Fit on the full training set. The test set is passed as validation data,
# so the per-epoch validation loss is computed on X_test / y_test.
autoencoder.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batchsize,
    validation_data=(X_test, y_test),
)
# Disabled alternative, kept for reference:
# autoencoder.fit(X_train[:split], y_train[:split], epochs=epochs, batch_size=batchsize, validation_data=(X_test[split:], y_test[split:]))
# NOTE(review): `split` is derived from X_train, so slicing X_test / y_test
# with it looks unintended -- confirm before re-enabling.

In [None]:
# Run the trained autoencoder over every row of the test set.
y_predict = autoencoder.predict(x=X_test)

In [None]:
# Accumulate, over all users, the mean squared error of each user's rated
# movies. Values are multiplied by 5 before comparison -- presumably to map
# normalized ratings back to a 5-point scale (confirm against the data prep).
SE = 0  # Sum of the per-user mean squared errors

for r, rated in enumerate(ratedlist):
    actual = y_test[r, rated] * 5
    predicted = y_predict[r, rated] * 5

    # Divide by the number of ratings actually compared. The previous
    # np.max(np.where(...)) expression returned the largest position rather
    # than a count, which mis-scaled the error and divided by zero for users
    # with a single rating.
    amount_of_ratings = actual.size
    if amount_of_ratings == 0:
        # No rated movies for this user: nothing to compare, adds nothing to SE.
        continue

    diff = np.sum((actual - predicted) ** 2) / amount_of_ratings
    SE += diff

In [None]:
# Root of the mean, over all users in ratedlist, of the values summed in SE.
user_count = ratedlist.shape[0]
RMSE = np.sqrt(SE / user_count)
print(RMSE)

In [None]:
# Side-by-side look at the first test user: targets vs. predictions for the
# movies listed in ratedlist[0], both multiplied by 5.
first_user_rated = ratedlist[0]

print("Actual ratings")
print(y_test[0, first_user_rated] * 5)
print("\nPredicted ratings")
print(y_predict[0, first_user_rated] * 5)

In [None]:
import os

In [None]:
# Create the output directory for saved models. exist_ok=True keeps the
# notebook re-runnable: os.mkdir raised FileExistsError on every run after
# the first.
os.makedirs('../snapshots', exist_ok=True)

In [None]:
# Persist the trained model as a single HDF5 file.
# NOTE: the model was compiled with custom_loss, so reloading it with
# keras.models.load_model needs custom_objects={'custom_loss': custom_loss}
# (or compile=False).
model_path = "../snapshots/imdb-model.h5"
autoencoder.save(model_path)

## Upload this model to the cloud

## Make sure this file is executable in one run