# Imports

In [23]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as Layer
import numpy as np
import tensorflow.keras.layers.experimental.preprocessing as preprocessing
from DatasetFormatting import *
import statistics


# Preparing Training Dataset

In [25]:
np.random.seed(1)
tf.random.set_seed(1)

movie_df = df
movie_df.drop(['vote_count'], axis = 1)

data_count = movie_df.shape[0]
training_data_count = int(data_count * 0.75) + 1
test_data_count = data_count - training_data_count

y_train = np.array(movie_df['vote_average'].head(training_data_count))
x_train = np.array(movie_df['training_col'].head(training_data_count))

y_test = np.array(movie_df['vote_average'].tail(test_data_count))
x_test = np.array(movie_df[['training_col']].tail(test_data_count))

std = statistics.stdev(movie_df['vote_average'].to_list())
mean = statistics.mean(movie_df['vote_average'].to_list())
variance = statistics.variance(movie_df['vote_average'].to_list())
mode = statistics.mode(movie_df['vote_average'].to_list())

print(f"STD: {std}")
print(f"mean: {mean}")
print(f"variance: {variance}")
print(f"mode: {mode}")

# print(x_train)
print(f"x_train shape: {x_train.shape}")

STD: 0.9976277124059775
mean: 6.165617412034789
variance: 0.9952610525603838
mode: 6.5
x_train shape: (16989,)


### Text Vectorization For Actors & Genres

In [26]:
VOCAB_SIZE = 201277
encoder = preprocessing.TextVectorization(max_tokens=VOCAB_SIZE, output_sequence_length=6)
features = np.array(df['training_col'])
encoder.adapt(features)

# this line of code doesnt work on windows for some reason
# print(np.array(encoder.get_vocabulary()))

2021-11-11 14:20:44.166799: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


### Text Vectorization Sanity Check

In [27]:
print(movie_df['training_col'].to_numpy()[0])
x = encoder(movie_df['training_col'].to_numpy()[0]).numpy()
x

Horror DakotaFanning JosePabloCantillo


array([   5,  325, 3016,    0,    0,    0])

<br><br><br><br><br><br><br><br><br><br>
# Feed Forward Neural Network

### Building The FFNN

In [56]:
model_FFNN = tf.keras.Sequential([
encoder,
    Layer.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=500),
    Layer.Flatten(input_shape=(16989, 1)),
    Layer.Dense(50),
    Layer.Dense(100),
    Layer.Dense(100),
    Layer.Dense(100),
    Layer.Dense(1)
])

model_FFNN.summary()

model_FFNN.compile(loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(1e-4))




Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_2 (TextVe (None, 6)                 0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 6, 500)            20632500  
_________________________________________________________________
flatten_4 (Flatten)          (None, 3000)              0         
_________________________________________________________________
dense_29 (Dense)             (None, 50)                150050    
_________________________________________________________________
dense_30 (Dense)             (None, 100)               5100      
_________________________________________________________________
dense_31 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_32 (Dense)             (None, 100)             

### Training The FFNN

In [55]:
model_FFNN.fit(
    x_train,
    y_train, 
    batch_size=512,
    epochs=5,
    verbose=10)

score = model_FFNN.evaluate(x_test, y_test)
print('Test loss:', score)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 1.6835546493530273


### Validating FFNN On **Train** Data

In [59]:
def print_train_prediction_ffnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_train[index]}")
    print(f"actual output ---> {y_train[index]}")
    print(f"predicted output ---> {model_FFNN.predict(x_test[index])}\n")

print_train_prediction_ffnn(0)
print_train_prediction_ffnn(1)
print_train_prediction_ffnn(2)
print_train_prediction_ffnn(3)
print_train_prediction_ffnn(4)

Prediction for index: 0
input ---> Horror DakotaFanning JosePabloCantillo
actual output ---> 7.2
predicted output ---> [[-0.04434437]]

Prediction for index: 1
input ---> Action AdamRayner ElizaDushku IanOgilvy JamesRemar RogerMoore
actual output ---> 5.8
predicted output ---> [[0.05607103]]

Prediction for index: 2
input ---> Action SteveBoyle NicRhind RobertHobbs ChrisWilliamMartin TylerJohnston
actual output ---> 7.3
predicted output ---> [[0.05618138]]

Prediction for index: 3
input ---> Action SigourneyWeaver EugeneKhumbanyiwa RobertHobbs CarlyPope BrandonAuret
actual output ---> 7.4
predicted output ---> [[-0.05545556]]

Prediction for index: 4
input ---> Action AdriannePalicki MichaelJaiWhite TyOlsson PascaleHutton SamJaeger
actual output ---> 5.2
predicted output ---> [[0.01173999]]



2021-11-11 18:28:28.970278: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


### Validating FFNN On **Test** Data

In [None]:
def print_test_prediction_ffnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_test[index]}")
    print(f"actual output ---> {y_test[index]}")
    print(f"predicted output ---> {model_FFNN.predict(x_test[index])}\n")
        
print_test_prediction_ffnn(0)
print_test_prediction_ffnn(1)
print_test_prediction_ffnn(2)
print_test_prediction_ffnn(3)
print_test_prediction_ffnn(4)

<br><br><br><br><br><br><br><br><br><br>

# Recurrent Neural Network

### Building The RNN

In [60]:
model_RNN = tf.keras.Sequential([
encoder,
    Layer.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=500),
    Layer.SimpleRNN(100),
    Layer.Dense(100),
    Layer.Dense(1)
])

model_RNN.summary()

model_RNN.compile(loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(1e-4))

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_2 (TextVe (None, 6)                 0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 6, 500)            20632500  
_________________________________________________________________
simple_rnn_13 (SimpleRNN)    (None, 100)               60100     
_________________________________________________________________
dense_38 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 101       
Total params: 20,702,801
Trainable params: 20,702,801
Non-trainable params: 0
_________________________________________________________________


### Training The RNN

In [61]:
model_RNN.fit(
    x_train,
    y_train, 
    batch_size=256,
    epochs=5,
    verbose=5)

score = model_RNN.evaluate(x_test, y_test)
print('Test loss:', score)

Epoch 1/5


2021-11-11 18:29:21.485776: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 12/177 [=>............................] - ETA: 1s - loss: 0.8012

2021-11-11 18:29:38.039624: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Test loss: 0.903617799282074


### Validating RNN On **Train** Data

In [62]:
def print_train_prediction_rnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_train[index]}")
    print(f"actual output ---> {y_train[index]}")
    print(f"predicted output ---> {model_RNN.predict(x_test[index])}\n")

print_train_prediction_rnn(0)
print_train_prediction_rnn(1)
print_train_prediction_rnn(2)
print_train_prediction_rnn(3)
print_train_prediction_rnn(4)


Prediction for index: 0
input ---> Horror DakotaFanning JosePabloCantillo
actual output ---> 7.2
predicted output ---> [[6.132244]]

Prediction for index: 1
input ---> Action AdamRayner ElizaDushku IanOgilvy JamesRemar RogerMoore
actual output ---> 5.8
predicted output ---> [[5.89359]]

Prediction for index: 2
input ---> Action SteveBoyle NicRhind RobertHobbs ChrisWilliamMartin TylerJohnston
actual output ---> 7.3
predicted output ---> [[6.7131147]]

Prediction for index: 3
input ---> Action SigourneyWeaver EugeneKhumbanyiwa RobertHobbs CarlyPope BrandonAuret
actual output ---> 7.4
predicted output ---> [[5.921199]]

Prediction for index: 4
input ---> Action AdriannePalicki MichaelJaiWhite TyOlsson PascaleHutton SamJaeger
actual output ---> 5.2
predicted output ---> [[5.2969275]]



2021-11-11 18:29:39.730935: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


### Validating RNN On **Test** Data

In [63]:
def print_test_prediction_rnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_test[index]}")
    print(f"actual output ---> {y_test[index]}")
    print(f"predicted output ---> {model_RNN.predict(x_test[index])}\n")
        
print_test_prediction_rnn(0)
print_test_prediction_rnn(1)
print_test_prediction_rnn(2)
print_test_prediction_rnn(3)
print_test_prediction_rnn(4)

Prediction for index: 0
input ---> ['Drama DannyGlover RonPerlman LindaHamilton ZoeWeizenbaum DavidStrathairn']
actual output ---> 6.1
predicted output ---> [[6.132244]]

Prediction for index: 1
input ---> ['Action Ky-ManiMarley SpraggaBenz PaulCampbell LouieRankin WyclefJean']
actual output ---> 7.4
predicted output ---> [[5.89359]]

Prediction for index: 2
input ---> ['Comedy MatthewMcConaughey JeffreyNordling WillieNelson WoodyHarrelson ZacharyKnighton']
actual output ---> 4.7
predicted output ---> [[6.7131147]]

Prediction for index: 3
input ---> ['Drama PaulWalker PiperPerabo LambertWilson LindaCardellini ShawnHatosy']
actual output ---> 6.0
predicted output ---> [[5.921199]]

Prediction for index: 4
input ---> ['Comedy MollyShannon WillFerrell ElaineHendrix HarlandWilliams TomGreen']
actual output ---> 5.0
predicted output ---> [[5.2969275]]



In [None]:
# print(df.loc[df['title'] == 'Sully']['vote_average'])

# sully_input = 'TomHanks AaronEckhart LauraLinney AnnaGunn AutumnReeser Drama'
# validation_prediction = model_FFNN.predict(np.array([sully_input]))
# print(validation_prediction[0])