# Imports

In [23]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as Layer
import numpy as np
import tensorflow.keras.layers.experimental.preprocessing as preprocessing
from DatasetFormatting import *
import statistics


# Preparing Training Dataset

In [25]:
np.random.seed(1)
tf.random.set_seed(1)

movie_df = df
movie_df.drop(['vote_count'], axis = 1)

data_count = movie_df.shape[0]
training_data_count = int(data_count * 0.75) + 1
test_data_count = data_count - training_data_count

y_train = np.array(movie_df['vote_average'].head(training_data_count))
x_train = np.array(movie_df['training_col'].head(training_data_count))

y_test = np.array(movie_df['vote_average'].tail(test_data_count))
x_test = np.array(movie_df[['training_col']].tail(test_data_count))

std = statistics.stdev(movie_df['vote_average'].to_list())
mean = statistics.mean(movie_df['vote_average'].to_list())
variance = statistics.variance(movie_df['vote_average'].to_list())
mode = statistics.mode(movie_df['vote_average'].to_list())

print(f"STD: {std}")
print(f"mean: {mean}")
print(f"variance: {variance}")
print(f"mode: {mode}")

# print(x_train)
print(f"x_train shape: {x_train.shape}")

STD: 0.9976277124059775
mean: 6.165617412034789
variance: 0.9952610525603838
mode: 6.5
x_train shape: (16989,)


## Text Vectorization For Actors & Genres

In [26]:
VOCAB_SIZE = 201277
encoder = preprocessing.TextVectorization(max_tokens=VOCAB_SIZE, output_sequence_length=6)
features = np.array(df['training_col'])
encoder.adapt(features)

# this line of code doesnt work on windows for some reason
# print(np.array(encoder.get_vocabulary()))

2021-11-11 14:20:44.166799: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


## Text Vectorization Sanity Check

In [27]:
print(movie_df['training_col'].to_numpy()[0])
x = encoder(movie_df['training_col'].to_numpy()[0]).numpy()
x

Horror DakotaFanning JosePabloCantillo


array([   5,  325, 3016,    0,    0,    0])

# Feed Forward Neural Network

In [35]:
print(x_train.shape)

model_FFNN = tf.keras.Sequential([
encoder,
    Layer.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=200),
    Layer.Flatten(input_shape=(16989, 1)),
    Layer.Dense(100),
    Layer.Dense(1)
])

model_FFNN.compile(loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(1e-4),
            metrics=['mse'])


(16989,)


In [38]:
model_FFNN.fit(
    x_train,
    y_train, 
    batch_size=256,
    epochs=10,
    verbose=4)

score = model_FFNN.evaluate(x_test, y_test)
print('Test loss:', score[0])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.447055459022522


In [None]:
print(df.loc[df['title'] == 'Sully']['vote_average'])

sully_input = 'TomHanks AaronEckhart LauraLinney AnnaGunn AutumnReeser Drama'
validation_prediction = model_FFNN.predict(np.array([sully_input]))
print(validation_prediction[0])

In [39]:
def print_test_prediction_ffnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_test[index]}")
    print(f"actual output ---> {y_test[index]}")
    print(f"predicted output ---> {model_FFNN.predict(x_test[index])}\n")
        
print_test_prediction_ffnn(0)
print_test_prediction_ffnn(2)
print_test_prediction_ffnn(3)
print_test_prediction_ffnn(4)
print_test_prediction_ffnn(5)
print_test_prediction_ffnn(6)

Prediction for index: 0
input ---> ['Drama DannyGlover RonPerlman LindaHamilton ZoeWeizenbaum DavidStrathairn']
actual output ---> 6.1
predicted output ---> [[6.439771]]

Prediction for index: 2
input ---> ['Comedy MatthewMcConaughey JeffreyNordling WillieNelson WoodyHarrelson ZacharyKnighton']
actual output ---> 4.7
predicted output ---> [[6.232257]]

Prediction for index: 3
input ---> ['Drama PaulWalker PiperPerabo LambertWilson LindaCardellini ShawnHatosy']
actual output ---> 6.0
predicted output ---> [[6.5984507]]

Prediction for index: 4
input ---> ['Comedy MollyShannon WillFerrell ElaineHendrix HarlandWilliams TomGreen']
actual output ---> 5.0
predicted output ---> [[4.65206]]

Prediction for index: 5
input ---> ['Drama StephenDorff ArminMueller-Stahl AloisMoyo MorganFreeman IanRoberts']
actual output ---> 7.6
predicted output ---> [[5.719355]]

Prediction for index: 6
input ---> ['Comedy HarryDeanStanton EmilioEstevez TraceyWalter OliviaBarash SyRichardson']
actual output ---> 6

# Recurrent Neural Network

In [29]:
model_RNN = tf.keras.Sequential([
encoder,
    Layer.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=200),
    Layer.SimpleRNN(100),
    Layer.Dense(100),
    Layer.Dense(1)
])

model_RNN.compile(loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(1e-4),
            metrics=['mse'])

## RNN Training & Validation

In [30]:
model_RNN.fit(
    x_train,
    y_train, 
    batch_size=256,
    epochs=10,
    verbose=4)

score = model_RNN.evaluate(x_test, y_test)
print('Test loss:', score[0])

Epoch 1/10


2021-11-11 14:20:44.889729: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 13/177 [=>............................] - ETA: 1s - loss: 0.8483 - mse: 0.8483

2021-11-11 14:21:09.764115: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Test loss: 1.014055609703064


### Validating Training Data

In [31]:
print(df.loc[df['title'] == 'Sully']['vote_average'])

sully_input = 'TomHanks AaronEckhart LauraLinney AnnaGunn AutumnReeser Drama'
validation_prediction = model_RNN.predict(np.array([sully_input]))
print(validation_prediction[0])

1839    7.0
Name: vote_average, dtype: float64
[4.884875]


2021-11-11 14:21:11.462852: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


### Validating Test Data

In [32]:
def print_test_prediction_rnn(index):
    print(f"Prediction for index: {index}")
    
    print(f"input ---> {x_test[index]}")
    print(f"actual output ---> {y_test[index]}")
    print(f"predicted output ---> {model_RNN.predict(x_test[index])}\n")
        
print_test_prediction_rnn(0)
print_test_prediction_rnn(2)
print_test_prediction_rnn(3)
print_test_prediction_rnn(4)
print_test_prediction_rnn(5)
print_test_prediction_rnn(6)

Prediction for index: 0
input ---> ['Drama DannyGlover RonPerlman LindaHamilton ZoeWeizenbaum DavidStrathairn']
actual output ---> 6.1
predicted output ---> [[6.00353]]

Prediction for index: 2
input ---> ['Comedy MatthewMcConaughey JeffreyNordling WillieNelson WoodyHarrelson ZacharyKnighton']
actual output ---> 4.7
predicted output ---> [[6.2993007]]

Prediction for index: 3
input ---> ['Drama PaulWalker PiperPerabo LambertWilson LindaCardellini ShawnHatosy']
actual output ---> 6.0
predicted output ---> [[6.8082037]]

Prediction for index: 4
input ---> ['Comedy MollyShannon WillFerrell ElaineHendrix HarlandWilliams TomGreen']
actual output ---> 5.0
predicted output ---> [[5.0957355]]

Prediction for index: 5
input ---> ['Drama StephenDorff ArminMueller-Stahl AloisMoyo MorganFreeman IanRoberts']
actual output ---> 7.6
predicted output ---> [[5.811267]]

Prediction for index: 6
input ---> ['Comedy HarryDeanStanton EmilioEstevez TraceyWalter OliviaBarash SyRichardson']
actual output --->