# Imports

In [4]:
import re
import pandas as pd
from collections import defaultdict
import csv
from sklearn.linear_model import LinearRegression
# %load_ext tensorboard
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as Layer
import tensorboard
import matplotlib.pyplot as plt
import numpy as np
# import keras_tuner as kt
import time
import tensorflow.keras.layers.experimental.preprocessing as preprocessing

# Data Formatting & Manipulation

In [5]:

movieFields = ['id', 'title', 'vote_average', 'vote_count', 'genres']
castFields = ['id', 'cast']

movieData = pd.read_csv('archive/movies_metadata.csv', skipinitialspace=True, usecols=movieFields)
castData = pd.read_csv('archive/credits.csv', skipinitialspace=True, usecols=castFields)

dfMovies = pd.DataFrame(movieData)
dfCast = pd.DataFrame(castData)

df = pd.merge(dfMovies, dfCast, how='inner')


# ---------------------------------------------------------------------------- #
# The following section formats the cast and creates a clean list of actor names.

unformattedNames = df['cast']

# main list of cast grouped by movies
cast_master_copy = []

# list of overall cast by individual names
actor_name_list = []

# reads through the cast string for each movie
for unformattedNameString in unformattedNames:

    # creates new list for current movie
    grouped = []

    # pulls actors' names out of the paragraph
    line_list = re.findall("(?<=\'name\': )(.*?)(?=,)", unformattedNameString)
    
    # converts list of cast for one movie into a string to manipulate further 
    line_string = str(line_list)

    # separates cast into a unique list per movie
    line_split = line_string.split(", ")

    actor_count = 0
    # reads through each actor per movie
    for name in line_split:
        if (actor_count == 5):
            break
     
        # removes extraneous symbols from the actors' names
        name = name.replace("\"", "")
        name = name.replace("\'", "")
        name = name.replace("[", "")
        name = name.replace("]", "")

        # adds actors to two working lists
        # grouped: actors are in lists by movies
        # actor_name_list: puts every actors in one list
        grouped.append(name)
        actor_name_list.append(name)
        
        actor_count += 1

    # adds formatted cast members to list
    cast_master_copy.append(grouped)

# assigns formatted cast to 'cast' column of DataFrame
df['cast'] = cast_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section turns the list of actor names into a list of unique numbers
# assigned to those actors. Then, creates a dictionary with the actor names and those 
# keys.

# initializes default dictionary
key_assignment = defaultdict(lambda: len(key_assignment))

# list of unique keys for each actor in dictionary
keys = [key_assignment[key] for key in actor_name_list]

# assigns keys to actors to form a hash table...nums are keys
actor_dict = dict(zip(keys, actor_name_list))

# inverts dictionary...names are keys
actor_dict_inv = {v: k for k, v in actor_dict.items()}
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section uses the inverted actor dictionary to look up the actor names
# and put their respective dictionary values in a list for each movie.

cast_names_by_movie = df['cast']
cast_ids_by_movie_master = []

for cast in cast_names_by_movie:

    temp_actor_list = []

    for actor in cast:
        actor = actor.replace("[", "")
        actor = actor.replace("]", "")
        actor = actor.replace("\'", "")

        actor_key = actor_dict_inv.get(actor)
        temp_actor_list.append(actor_key)
    
    if (len(temp_actor_list) < 5):
            diff = 5 - len(temp_actor_list)
            
            for i in range(diff):
                temp_actor_list.append(1)

    cast_ids_by_movie_master.append(temp_actor_list)

df['cast_ids'] = cast_ids_by_movie_master
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section formats the genre and adds it to the data frame

unformatted_genres = df['genres']

# main list of genres grouped by movies
genre_master_copy = []

# list of overall cast by individual names
genre_name_list = []

# reads through the genre string for each movie
for unformatted_genre_string in unformatted_genres:

    # creates new list for current movie
    genre_grouped = []

    # pulls genres out of the paragraph
    genre_line_list = re.findall("(?<=\'name\': )(.*?)(?=})", unformatted_genre_string)

    # converts list of genres for one movie into a string to manipulate further 
    genre_line_string = str(genre_line_list)

    # separates genre into a unique list per movie
    genre_line_split = genre_line_string.split(", ")

    # reads through each genre per movie & grabs the first one
    for genre in genre_line_split:
     
        # removes extraneous symbols from the genres
        genre = genre.replace("\"", "")
        genre = genre.replace("\'", "")
        genre = genre.replace("[", "")
        genre = genre.replace("]", "")

        genre_str = str(genre)
        break

    # adds formatted genre to list
    genre_master_copy.append(genre_str)

# assigns formatted cast to 'cast' column of DataFrame
df['genres'] = genre_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section filters out data points that will be harmful for
# our model's accuracy.

# gets rid of all movies with 0 votes
# Number of movies before operation: 43020
# Number of movies after operation: 40739
df = df.drop(df[df.vote_count < 10].index)
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section handles the creation of the .csv files that will be the
# primary data used to train the models. 

# writes formatted DataFrame values to a new csv
# pd.DataFrame.to_csv(df, "formatted_movies.csv", index="false")

# creates a csv of the actor dictionary
# w = csv.writer(open("actor_dict.csv", "w"))
# for key, val in actor_dict.items():
#     w.writerow([key, val])
# ---------------------------------------------------------------------------- #

# print(df)


# Preparing Training Dataset

In [6]:
np.random.seed(1)
tf.random.set_seed(1)

movie_df = df
movie_df.drop(['vote_count'], axis = 1)

# saves genres keys/values if needed for lookup later
keys, genres = pd.factorize(movie_df['genres'])

# print(genres.shape)
# movie_df['genres'] = pd.factorize(movie_df['genres'])[0]

data_count = movie_df.shape[0]
training_data_count = int(data_count * 0.75) + 1
test_data_count = data_count - training_data_count

y_train = np.array(movie_df['vote_average'].head(training_data_count))
x_train = np.array(movie_df[['genres']].head(training_data_count))

y_test = np.array(movie_df['vote_average'].tail(test_data_count))
x_test = np.array(movie_df[['genres']].tail(test_data_count))

print(x_train)
print(x_train.shape)

# print(x_test)
# print(x_test.shape)

[['Horror']
 ['Action']
 ['Action']
 ...
 ['Crime']
 ['Drama']
 ['Adventure']]
(16989, 1)


# Text Vectorization (Genres)

In [7]:
VOCAB_SIZE = 20
x_test_list = x_test.tolist()
encoder = preprocessing.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(x_test_list)

vocab = np.array(encoder.get_vocabulary())
vocab

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2021-11-05 18:26:57.700127: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-05 18:26:57.700273: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-11-05 18:26:57.765731: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-05 18:26:57.765895: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-11-05 18:26:57.813211: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


array(['', '[UNK]', 'drama', 'comedy', 'action', 'adventure', 'horror',
       'crime', 'thriller', 'fantasy', 'animation', 'romance', 'mystery',
       'documentary', 'science', 'fiction', 'family', 'music', 'western',
       'war'], dtype='<U11')

## Text Vectorization Sanity Check (Genres)

In [8]:
encoded_example = encoder(movie_df.genres.to_numpy()[0]).numpy()
print(movie_df.genres.to_numpy()[0])
encoded_example

Horror


array([6])

In [9]:
x_train = np.reshape(x_train, (-1, 1))
y_train = np.reshape(y_train, (-1, 1))



print(x_train.shape)

print(y_train.shape)


x = np.array(encoder.get_vocabulary())

x.shape


(16989, 1)
(16989, 1)


(20,)

# Feed Forward Neural Network

In [20]:

model_RNN = tf.keras.Sequential([
 encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])




# # Sequential Model
# feed_forward_model = keras.models.Sequential([
    
#     encoder,
#     tf.keras.layers.Embedding(
#         input_dim=len(encoder.get_vocabulary()),
#         output_dim=64,
#         # Use masking to handle the variable sequence lengths
#         mask_zero=True),
    
#     # keras.layers.Dense(30, activation='relu'),
#     # keras.layers.Dense(50, activation='relu'),
#     keras.layers.Dense(1, activation='relu')
# ])
# # feed_forward_model.build(input_shape=(16989, 1))

model_RNN.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])


# feed_forward_model.compile(
#     optimizer='adam',
#     loss='mean_squared_error',
#     metrics=['accuracy'])

model_RNN.summary()


# # Define the Keras TensorBoard callback.
# logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# # Train the model.
model_RNN.fit(
    x_train,
    y_train, 
    batch_size=512,
    epochs=25)




# # Evaluate
score = model_RNN.evaluate(x_train, y_train)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, None)              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, None, 64)          1280      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 64)                8256      
_________________________________________________________________
dense_18 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 65        
Total params: 13,761
Trainable params: 13,761
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25


2021-11-05 18:43:19.698653: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
 16/531 [..............................] - ETA: 3s - loss: 1.1778 - accuracy: 0.0000e+00

2021-11-05 18:43:39.708981: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Test loss: 1.1304889917373657
Test accuracy: 0.0
