# Imports

In [84]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as Layer
import numpy as np
# import keras_tuner as kt
import tensorflow.keras.layers.experimental.preprocessing as preprocessing
# from DatasetFormatting import *
import statistics


In [104]:
import re
import pandas as pd
from collections import defaultdict


movieFields = ['id', 'title', 'vote_average', 'vote_count', 'genres']
castFields = ['id', 'cast']

movieData = pd.read_csv('archive/movies_metadata.csv', skipinitialspace=True, usecols=movieFields)
castData = pd.read_csv('archive/credits.csv', skipinitialspace=True, usecols=castFields)

dfMovies = pd.DataFrame(movieData)
dfCast = pd.DataFrame(castData)

df = pd.merge(dfMovies, dfCast, how='inner')


# ---------------------------------------------------------------------------- #
# The following section formats the cast and creates a clean list of actor names.

unformattedNames = df['cast']

# main list of cast grouped by movies
cast_master_copy = []

# list of overall cast by individual names
actor_name_list = []

# reads through the cast string for each movie
for unformattedNameString in unformattedNames:

    # creates new list for current movie
    grouped = ""

    # pulls actors' names out of the paragraph
    line_list = re.findall("(?<=\'name\': )(.*?)(?=,)", unformattedNameString)
    
    # converts list of cast for one movie into a string to manipulate further 
    line_string = str(line_list)

    # separates cast into a unique list per movie
    line_split = line_string.split(", ")

    actor_count = 0
    # reads through each actor per movie
    for name in line_split:
        if (actor_count == 5):
            break
     
        # removes extraneous symbols from the actors' names
        name = name.replace("\"", "")
        name = name.replace("\'", "")
        name = name.replace("[", "")
        name = name.replace("]", "")
        name = name.replace(" ", "")

        # adds actors to two working lists
        # grouped: actors are in lists by movies
        # actor_name_list: puts every actors in one list
        grouped += name + ' '
        actor_name_list.append(name)
        
        actor_count += 1

    # adds formatted cast members to list
    grouped = grouped[:-1]
    cast_master_copy.append(grouped)

# assigns formatted cast to 'cast' column of DataFrame
df['cast'] = cast_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section formats the genre and adds it to the data frame

unformatted_genres = df['genres']

# main list of genres grouped by movies
genre_master_copy = []

# list of overall cast by individual names
genre_name_list = []

# reads through the genre string for each movie
for unformatted_genre_string in unformatted_genres:

    # creates new list for current movie
    genre_grouped = []

    # pulls genres out of the paragraph
    genre_line_list = re.findall("(?<=\'name\': )(.*?)(?=})", unformatted_genre_string)

    # converts list of genres for one movie into a string to manipulate further 
    genre_line_string = str(genre_line_list)

    # separates genre into a unique list per movie
    genre_line_split = genre_line_string.split(", ")

    # reads through each genre per movie & grabs the first one
    for genre in genre_line_split:
     
        # removes extraneous symbols from the genres
        genre = genre.replace("\"", "")
        genre = genre.replace("\'", "")
        genre = genre.replace("[", "")
        genre = genre.replace("]", "")

        genre_str = str(genre)
        break

    # adds formatted genre to list
    genre_master_copy.append(genre_str)

# assigns formatted cast to 'cast' column of DataFrame
df['genres'] = genre_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section appends the genre to the cast string. This preparation is 
# needed for text vectorization / embedding.

final_format_col = []
cast = df['cast'].to_list()
genres = df['genres'].to_list()

i = 0
count = len(cast)
for i in range(count):
    final_string = cast[i] + " " + genres[i]
    final_format_col.append(final_string)

df.insert(loc=0, column='training_col', value=final_format_col)
df.training_col = final_format_col
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section filters out data points that will be harmful for
# our model's accuracy and drops unnecessary columns.

# gets rid of all movies with 0 votes
# Number of movies before operation: 43019
# Number of movies after operation: 22651
df = df.drop(df[df.vote_count < 10].index)
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section handles the creation of the .csv files that will be the
# primary data used to train the models. 

# writes formatted DataFrame values to a new csv
pd.DataFrame.to_csv(df, "formatted_movies.csv", index="false")

# creates a csv of the actor dictionary
# w = csv.writer(open("actor_dict.csv", "w"))
# for key, val in actor_dict.items():
#     w.writerow([key, val])
# ---------------------------------------------------------------------------- #

# Preparing Training Dataset

In [154]:
np.random.seed(1)
tf.random.set_seed(1)

movie_df = df
movie_df.drop(['vote_count'], axis = 1)

data_count = movie_df.shape[0]
training_data_count = int(data_count * 0.75) + 1
test_data_count = data_count - training_data_count

y_train = np.array(movie_df['vote_average'].head(training_data_count))
x_train = np.array(movie_df['training_col'].head(training_data_count))

y_test = np.array(movie_df['vote_average'].tail(test_data_count))
x_test = np.array(movie_df[['training_col']].tail(test_data_count))

std = statistics.stdev(movie_df['vote_average'].to_list())
mean = statistics.mean(movie_df['vote_average'].to_list())
variance = statistics.variance(movie_df['vote_average'].to_list())
mode = statistics.mode(movie_df['vote_average'].to_list())

print(f"STD: {std}")
print(f"mean: {mean}")
print(f"variance: {variance}")
print(f"mode: {mode}")

# print(x_train)
print(f"x_train shape: {x_train.shape}")

STD: 0.9976277124059775
mean: 6.165617412034789
variance: 0.9952610525603838
mode: 6.5
x_train shape: (16989,)


## Text Vectorization For Actors & Genres

In [103]:
VOCAB_SIZE = 201277
encoder = preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
features = np.array(df['training_col'])
encoder.adapt(features)

print(np.array(encoder.get_vocabulary()))

2021-11-06 01:04:43.343430: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


['' '[UNK]' 'drama' ... 'aamnashariff' 'aamirbashir' 'aadukalamnaren']


## Text Vectorization Sanity Check

In [88]:
print(movie_df['training_col'].to_numpy()[0])
print(encoder(movie_df['training_col'].to_numpy()[0]).numpy())

DakotaFanning JosePabloCantillo Horror
[ 325 3016    5]


# Recurrent Neural Network

In [89]:
model_RNN = tf.keras.Sequential([
 encoder,
    Layer.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=100,
        mask_zero=True),
    Layer.SimpleRNN(100),
    Layer.Dense(200, activation='relu'),
    Layer.Dense(100, activation='relu'),
    Layer.Dense(1)
])

model_RNN.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

model_RNN.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_4 (TextVe (None, None)              0         
_________________________________________________________________
embedding_12 (Embedding)     (None, None, 100)         4126500   
_________________________________________________________________
simple_rnn_12 (SimpleRNN)    (None, 100)               20100     
_________________________________________________________________
dense_26 (Dense)             (None, 200)               20200     
_________________________________________________________________
dense_27 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 101       
Total params: 4,187,001
Trainable params: 4,187,001
Non-trainable params: 0
___________________________________________

## RNN Training & Validation

In [90]:
model_RNN.fit(
    x_train,
    y_train, 
    batch_size=512,
    epochs=5,
    verbose=4)

score = model_RNN.evaluate(x_train, y_train)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Epoch 1/5


2021-11-06 00:51:49.796307: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  5/531 [..............................] - ETA: 7s - loss: 5.1549 - accuracy: 0.0000e+00  

2021-11-06 00:52:01.476930: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Test loss: 1.8976799249649048
Test accuracy: 0.0


In [150]:
print(df.loc[df['title'] == 'Sully']['vote_average'])

sully_input = 'TomHanks AaronEckhart LauraLinney AnnaGunn AutumnReeser Drama'
validation_prediction = model_RNN.predict(np.array([sully_input]))
print(validation_prediction[0])

1839    7.0
Name: vote_average, dtype: float64
[6.103124]
