# Data Formatting & Manipulation

In [1]:
import re
import pandas as pd
from collections import defaultdict
import csv
from sklearn.linear_model import LinearRegression

In [3]:

movieFields = ['id', 'title', 'vote_average', 'vote_count', 'budget', 'genres']
castFields = ['id', 'cast']

movieData = pd.read_csv('archive/movies_metadata.csv', skipinitialspace=True, usecols=movieFields)
castData = pd.read_csv('archive/credits.csv', skipinitialspace=True, usecols=castFields)

dfMovies = pd.DataFrame(movieData)
dfCast = pd.DataFrame(castData)

df = pd.merge(dfMovies, dfCast, how='inner')


# ---------------------------------------------------------------------------- #
# The following section formats the cast and creates a clean list of actor names.

unformattedNames = df['cast']

# main list of cast grouped by movies
cast_master_copy = []

# list of overall cast by individual names
actor_name_list = []

# reads through the cast string for each movie
for unformattedNameString in unformattedNames:

    # creates new list for current movie
    grouped = []

    # pulls actors' names out of the paragraph
    line_list = re.findall("(?<=\'name\': )(.*?)(?=,)", unformattedNameString)
    

    # converts list of cast for one movie into a string to manipulate further 
    line_string = str(line_list)

    # separates cast into a unique list per movie
    line_split = line_string.split(", ")

    # reads through each actor per movie
    for name in line_split:
     
        # removes extraneous symbols from the actors' names
        name = name.replace("\"", "")
        name = name.replace("\'", "")
        name = name.replace("[", "")
        name = name.replace("]", "")

        # adds actors to two working lists
        # grouped: actors are in lists by movies
        # actor_name_list: puts every actors in one list
        grouped.append(name)
        actor_name_list.append(name)

    # adds formatted cast members to list
    cast_master_copy.append(grouped)

# assigns formatted cast to 'cast' column of DataFrame
df['cast'] = cast_master_copy
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section turns the list of actor names into a list of unique numbers
# assigned to those actors. Then, creates a dictionary with the actor names and those 
# keys.

# initializes default dictionary
key_assignment = defaultdict(lambda: len(key_assignment))

# list of unique keys for each actor in dictionary
keys = [key_assignment[key] for key in actor_name_list]

# assigns keys to actors to form a hash table...nums are keys
actor_dict = dict(zip(keys, actor_name_list))

# inverts dictionary...names are keys
actor_dict_inv = {v: k for k, v in actor_dict.items()}
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section uses the inverted actor dictionary to look up the actor names
# and put their respective dictionary values in a list for each movie.

cast_names_by_movie = df['cast']
cast_ids_by_movie_master = []

for cast in cast_names_by_movie:

    temp_actor_list = []

    for actor in cast:
        actor = actor.replace("[", "")
        actor = actor.replace("]", "")
        actor = actor.replace("\'", "")

        actor_key = actor_dict_inv.get(actor)
        temp_actor_list.append(actor_key)

    cast_ids_by_movie_master.append(temp_actor_list)

df['cast_ids'] = cast_ids_by_movie_master
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section formats the genre and adds it to the data frame

unformatted_genres = df['genres']

# main list of genres grouped by movies
genre_master_copy = []

# list of overall cast by individual names
genre_name_list = []

# reads through the genre string for each movie
for unformatted_genre_string in unformatted_genres:

    # creates new list for current movie
    genre_grouped = []

    # pulls genres out of the paragraph
    genre_line_list = re.findall("(?<=\'name\': )(.*?)(?=})", unformatted_genre_string)

    # converts list of genres for one movie into a string to manipulate further 
    genre_line_string = str(genre_line_list)

    # separates genre into a unique list per movie
    genre_line_split = genre_line_string.split(", ")

    # reads through each genre per movie
    for genre in genre_line_split:
     
        # removes extraneous symbols from the genres
        genre = genre.replace("\"", "")
        genre = genre.replace("\'", "")
        genre = genre.replace("[", "")
        genre = genre.replace("]", "")

        # genres are in lists by movies
        genre_grouped.append(genre)

    # adds formatted genres to list
    genre_master_copy.append(genre_grouped)

# assigns formatted cast to 'cast' column of DataFrame
df['genres'] = genre_master_copy
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section filters out data points that will be harmful for
# our model's accuracy.

# gets rid of all movies with 0 votes
# Number of movies before operation: 43020
# Number of movies after operation: 40739
df = df.drop(df[df.vote_count < 10].index)


df = df.drop(df[df.budget < 1000].index)
# ---------------------------------------------------------------------------- #



# ---------------------------------------------------------------------------- #
# The following section handles the creation of the .csv files that will be the
# primary data used to train the models. 

# writes formatted DataFrame values to a new csv
pd.DataFrame.to_csv(df, "formatted_movies.csv", index="false")

# creates a csv of the actor dictionary
# w = csv.writer(open("actor_dict.csv", "w"))
# for key, val in actor_dict.items():
#     w.writerow([key, val])
# ---------------------------------------------------------------------------- #



# print(df)





# Feed Forward Neural Network

In [5]:
# Load the TensorBoard notebook extension.
# %load_ext tensorboard
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as Layer
import tensorboard
import matplotlib.pyplot as plt
import numpy as np
import keras_tuner as kt
import time

In [28]:
np.random.seed(1)
tf.random.set_seed(1)

# columns for DataFrame
movieFields = ['id', 'title', 'vote_average', 'cast', 'cast_ids', 'budget']

# reads csv
movie_dataset = pd.read_csv('formatted_movies.csv', skipinitialspace=True, usecols=movieFields)

# creates DataFrame from movie_dataset
df = pd.DataFrame(movie_dataset)

arr = df.to_numpy()

ax = arr[:, [0]]
ay = arr[:, [3]]

# y_train = np.array(df['vote_average'])
# x_train = np.array(df[['budget']])



print(ax.shape)

(7413, 1)


In [29]:

# # Sequential Model
feed_forward_model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(7413, 1)),
    keras.layers.Dense(10, activation='relu',),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(1, activation='relu')
])

feed_forward_model.compile(
    optimizer='adam',
    loss='loss_mean_squared_error',
    metrics=['accuracy'])

feed_forward_model.summary()


# # Define the Keras TensorBoard callback.
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# # Train the model.
feed_forward_model.fit(
    ax,
    ay, 
    batch_size=512,
    epochs=25,
    callbacks=[tensorboard_callback])

# # Evaluate
score = feed_forward_model.evaluate(ax, ay)
print('Test loss:', score[0])
print('Test accuracy:', score[1])









Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 7413)              0         
_________________________________________________________________
dense_24 (Dense)             (None, 10)                74140     
_________________________________________________________________
dense_25 (Dense)             (None, 20)                220       
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 21        
Total params: 74,381
Trainable params: 74,381
Non-trainable params: 0
_________________________________________________________________


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).