# IMDB

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

import keras

from keras import backend as K
from keras import regularizers
from keras.utils import np_utils
from keras.models import Model, Sequential
from keras.optimizers import SGD, Adam
from keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization, Masking, Activation

import matplotlib.pyplot as plt

In [24]:
def custom_loss(y_true,y_pred):
    y_mask=keras.backend.clip(y_true, 0, 0.01)*100
    return K.mean(K.square(y_mask*(y_pred - y_true)), axis=-1)


## Download the data

In [25]:
X_train = np.load("../data/preprocessed_data/X_train.npy", allow_pickle=True)
y_train = np.load("../data/preprocessed_data/y_train.npy", allow_pickle=True)
X_test = np.load("../data/preprocessed_data/X_test.npy", allow_pickle=True)
y_test = np.load("../data/preprocessed_data/y_test.npy", allow_pickle=True)
ratedlist = np.load("../data/preprocessed_data/ratedlist.npy", allow_pickle=True)

print(X_train)

[[0.032 0.    0.032 ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.   ]
 ...
 [0.02  0.016 0.016 ... 0.    0.    0.   ]
 [0.024 0.    0.    ... 0.    0.    0.   ]
 [0.04  0.    0.    ... 0.    0.    0.   ]]


In [26]:
input_img = Input(shape=(X_train.shape[1],)) # All movies

encoded = Masking(mask_value=0)(input_img)

encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(50, activation='relu')(encoded)
encoded = Dense(12, activation='relu')(encoded)

decoded = Dense(50, activation='relu')(encoded)
decoded = Dense(12, activation='relu')(decoded)
decoded = Dense(y_train.shape[1], activation='sigmoid')(decoded) # All movies

autoencoder = Model(input_img, decoded)
autoencoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 193609)]          0         
_________________________________________________________________
masking_1 (Masking)          (None, 193609)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                12391040  
_________________________________________________________________
dense_7 (Dense)              (None, 50)                3250      
_________________________________________________________________
dense_8 (Dense)              (None, 12)                612       
_________________________________________________________________
dense_9 (Dense)              (None, 50)                650       
_________________________________________________________________
dense_10 (Dense)             (None, 12)                612 

In [27]:
autoencoder.compile(loss=custom_loss, optimizer='SGD')

In [28]:
split = int(X_train.shape[0] * 0.8)

In [29]:
epochs = 15
batchsize = 128

autoencoder.fit(X_train, y_train, epochs=epochs, batch_size=batchsize, validation_data=(X_test, y_test))
# autoencoder.fit(X_train[:split], y_train[:split], epochs=epochs, batch_size=batchsize, validation_data=(X_test[split:], y_test[split:]))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x26f3c4f5040>

In [30]:
y_predict = autoencoder.predict(X_test)
print(y_predict)

[[0.49999982 0.49999988 0.50000006 ... 0.49999988 0.5000002  0.4999997 ]
 [0.49999976 0.5        0.5        ... 0.5        0.5000001  0.49999964]
 [0.49999976 0.5        0.49999994 ... 0.5        0.5000002  0.49999982]
 ...
 [0.49999976 0.49999988 0.5        ... 0.5        0.50000024 0.4999997 ]
 [0.49999988 0.5        0.50000006 ... 0.49999988 0.50000006 0.49999976]
 [0.49999988 0.49999976 0.5000001  ... 0.49999982 0.5000002  0.49999964]]


In [32]:
SE = 0 # SquaredError

for r in range(ratedlist.shape[0]):
    
    amount_of_ratings = np.max(np.where(ratedlist[r] > 0)[0])
    
    diff = np.sum(
        (y_test[r, ratedlist[r]] * 5 - y_predict[r, ratedlist[r]] * 5) ** 2
    ) / amount_of_ratings
    SE += diff
print(SE)

3443.311072179884


In [33]:
RMSE = np.sqrt(SE / ratedlist.shape[0])
print(RMSE)

2.375872944724785


In [34]:
print("Actual ratings")
print(y_test[0, ratedlist[0]] * 5)
print("\nPredicted ratings")
print(y_predict[0, ratedlist[0]] * 5 )

Actual ratings
[0.16 0.16 0.16 0.2  0.2  0.12 0.2  0.16 0.2  0.2  0.2  0.2  0.12 0.2
 0.16 0.2  0.12 0.12 0.2  0.16 0.16 0.2  0.16 0.12 0.16 0.2  0.16 0.12
 0.2  0.16 0.16 0.2  0.16 0.16 0.16 0.2  0.2  0.12 0.2  0.12 0.16 0.12
 0.12 0.16 0.2  0.2  0.2  0.16 0.2  0.12 0.2  0.2  0.2  0.2  0.12 0.2
 0.2  0.16 0.2  0.16 0.2  0.2  0.2  0.16 0.2  0.2  0.16 0.2  0.2  0.2
 0.2  0.2  0.16 0.2  0.2  0.16 0.08 0.2  0.2  0.2  0.2  0.2  0.2  0.12
 0.16 0.2  0.2  0.2  0.2  0.2  0.2  0.16 0.12 0.12 0.12 0.12 0.16 0.16
 0.2  0.16 0.2  0.12 0.2  0.2  0.16 0.2  0.12 0.12 0.2  0.16 0.16 0.2
 0.16 0.16 0.2  0.2  0.16 0.16 0.2  0.16 0.2  0.16 0.2  0.16 0.2  0.16
 0.2  0.2  0.2  0.12 0.2  0.16 0.16 0.16 0.2  0.2  0.2  0.2  0.2  0.16
 0.2  0.16 0.16 0.08 0.16 0.16 0.2  0.2  0.08 0.2  0.16 0.2  0.08 0.2
 0.16 0.12 0.2  0.16 0.2  0.2  0.16 0.16 0.2  0.12 0.2  0.2  0.2  0.2
 0.2  0.16 0.08 0.16 0.16 0.2  0.16 0.16 0.2  0.12 0.2  0.2  0.2  0.2
 0.16 0.16 0.2  0.2  0.2  0.16 0.2  0.2  0.2  0.2  0.2  0.16 0.2  0.2

In [15]:
import os

In [16]:
os.mkdir('../snapshots')

In [17]:
autoencoder.save("../snapshots/imdb-model.h5")

## Upload this model to the cloud

## Make sure this file is executable in one run