# Imports

In [1]:
! pip install keras-tuner
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as Layer
import numpy as np
import keras_tuner as kt
import tensorflow.keras.layers.experimental.preprocessing as preprocessing
import statistics


In [2]:
import re
import pandas as pd

# Columns read from each source file; 'id' appears in both lists.
movieFields = ['id', 'title', 'vote_average', 'vote_count', 'genres']
castFields = ['id', 'cast']

# Parser options shared by both CSV reads.
csv_options = dict(skipinitialspace=True, engine='python')

movieData = pd.read_csv('movies_metadata.csv', usecols=movieFields, **csv_options)
castData = pd.read_csv('credits.csv', usecols=castFields, **csv_options)

dfMovies = pd.DataFrame(movieData)
dfCast = pd.DataFrame(castData)

# No join key is named, so pandas joins on the column names the two frames share.
df = dfMovies.merge(dfCast, how='inner')


# ---------------------------------------------------------------------------- #
# The following section formats the cast and creates a clean list of actor names.

unformattedNames = df['cast']

# one space-separated string of cleaned actor names per movie
cast_master_copy = []

# every cleaned actor name across all movies, in the order encountered
actor_name_list = []

# characters stripped from each extracted name: both quote styles, brackets, spaces
_name_noise = str.maketrans("", "", "\"\'[] ")

for raw_cast in unformattedNames:

    # grabs the text between each 'name': key and the next comma
    found = re.findall("(?<=\'name\': )(.*?)(?=,)", raw_cast)

    # the match list is rendered as text and split back apart on ", "
    pieces = str(found).split(", ")

    # only the first five entries of a movie are kept; spaces inside a name are
    # removed, so each actor ends up as a single token
    cleaned = [piece.translate(_name_noise) for piece in pieces[:5]]

    actor_name_list.extend(cleaned)
    cast_master_copy.append(" ".join(cleaned))

# assigns formatted cast to 'cast' column of DataFrame
df['cast'] = cast_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section formats the genre and adds it to the data frame

unformatted_genres = df['genres']

# first cleaned genre of each movie, in row order
genre_master_copy = []

# declared next to the master list; nothing in this section fills it
genre_name_list = []

# characters stripped from the extracted genre: both quote styles and brackets
_genre_noise = str.maketrans("", "", "\"\'[]")

for raw_genres in unformatted_genres:

    # grabs the text between each 'name': key and the closing brace
    found = re.findall("(?<=\'name\': )(.*?)(?=})", raw_genres)

    # the match list is rendered as text; only the piece before the first ", " is used
    leading = str(found).split(", ")[0]

    genre_master_copy.append(leading.translate(_genre_noise))

# assigns formatted genre to 'genres' column of DataFrame
df['genres'] = genre_master_copy
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section appends the genre to the cast string. This preparation is 
# needed for text vectorization / embedding.

cast = df['cast'].to_list()
genres = df['genres'].to_list()

# one "<cast> <genre>" string per movie, paired by row position
final_format_col = [names + " " + genre for names, genre in zip(cast, genres)]

# places the combined text as the first column of the DataFrame
df.insert(loc=0, column='training_col', value=final_format_col)
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section filters out data points that will be harmful for
# our model's accuracy and drops unnecessary columns.

# removes every movie whose vote_count is below 10
# Number of movies before operation: 43019
# Number of movies after operation: 22651
too_few_votes = df['vote_count'] < 10
df = df.loc[~too_few_votes]
# ---------------------------------------------------------------------------- #


# ---------------------------------------------------------------------------- #
# The following section handles the creation of the .csv files that will be the
# primary data used to train the models. 

# writes formatted DataFrame values to a new csv, leaving out the row-index column
# (index has to be the boolean False; the string "false" is truthy, so the
# index column was still being written)
df.to_csv("formatted_movies.csv", index=False)

# creates a csv of the actor dictionary
# w = csv.writer(open("actor_dict.csv", "w"))
# for key, val in actor_dict.items():
#     w.writerow([key, val])
# ---------------------------------------------------------------------------- #

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers.
np.random.seed(1)
tf.random.set_seed(1)

# DataFrame.drop returns a new frame; the result must be kept for the
# vote_count column to actually be removed.
movie_df = df.drop(['vote_count'], axis=1)

# The first 75% (+1 row) of the rows is the training set, the rest is the test
# set. Rows are taken in their existing order (head/tail); nothing is shuffled.
data_count = movie_df.shape[0]
training_data_count = int(data_count * 0.75) + 1
test_data_count = data_count - training_data_count

y_train = np.array(movie_df['vote_average'].head(training_data_count))
x_train = np.array(movie_df['training_col'].head(training_data_count))

# Single-bracket selection keeps x_test one-dimensional, the same layout as x_train.
y_test = np.array(movie_df['vote_average'].tail(test_data_count))
x_test = np.array(movie_df['training_col'].tail(test_data_count))

# Summary statistics of the target; the rating list is built once and reused.
ratings = movie_df['vote_average'].to_list()
std = statistics.stdev(ratings)
mean = statistics.mean(ratings)
variance = statistics.variance(ratings)
mode = statistics.mode(ratings)


# Text encoder: vocabulary capped at VOCAB_SIZE tokens, every input padded or
# truncated to 6 tokens. The vocabulary is learned from the full training_col
# column (training and test rows alike).
VOCAB_SIZE = 201277
encoder = preprocessing.TextVectorization(max_tokens=VOCAB_SIZE, output_sequence_length=6)
features = np.array(df['training_col'])
encoder.adapt(features)

# Feed Forward Network

In [6]:
!rm -rf untitled_project

def build_model(hp):
  """Build and compile the feed-forward regression model for one tuner trial.

  Args:
    hp: Keras Tuner hyperparameter container. It supplies the number of
      hidden layers (1-4) and the width of each hidden layer.

  Returns:
    A compiled tf.keras.Sequential model: text encoder -> embedding ->
    flatten -> hidden ReLU dense layers -> single-unit output layer.
  """
  model = tf.keras.Sequential()
  model.add(encoder)

  # Embedding width is fixed at 500 here rather than tuned.
  model.add(Layer.Embedding(
      input_dim=len(encoder.get_vocabulary()),
      output_dim=500))

  # Collapse the (sequence, embedding) axes into one vector per sample. The
  # input shape is inferred from the embedding output, so none is passed.
  model.add(Layer.Flatten())

  for i in range(hp.Int('num_hidden_layers', 1, 4)):
    # The range starts at 20 so a trial can never request a zero-unit layer.
    model.add(Layer.Dense(
        hp.Int(f'dense{i}_neurons', min_value=20, max_value=100, step=20),
        activation='relu'))

  model.add(Layer.Dense(1, name='output'))

  # RMSprop with an explicit, fixed learning rate (not tuned in this search).
  optimizer = tf.keras.optimizers.RMSprop(learning_rate=.001)

  model.compile(loss=tf.keras.losses.MeanSquaredError(),
                optimizer=optimizer,
                metrics=['mse'])

  return model


# Random-search configuration: the objective is validation loss, up to 300
# model variations are sampled, and each variation is trained once.
search_settings = dict(
    objective='val_loss',
    max_trials=300,
    executions_per_trial=1,
)
tuner = kt.RandomSearch(build_model, **search_settings)

tuner.search_space_summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 6)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 6, 450)            18569250  
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 200)               130200    
_________________________________________________________________
dense (Dense)                (None, 60)                12060     
_________________________________________________________________
output (Dense)               (None, 1)                 61        
Total params: 18,711,571
Trainable params: 18,711,571
Non-trainable params: 0
_________________________________________________________________
Search space summary
Default search space size: 5
embedding_neurons (Int)
{'default': None, 'conditions': [], 

## FFNN Training & Validation

In [None]:
# Training options passed along with the data for every trial; the test
# split doubles as the validation set.
fit_settings = dict(
    verbose=2,
    epochs=1,
    batch_size=512,
    callbacks=[],
    validation_data=(x_test, y_test),
)
tuner.search(x=x_train, y=y_train, **fit_settings)

tuner.results_summary()

### Keras-Tuner Hyperparameter Optimization
Switch to editing mode to view.

# Keras Tuner Findings

# Test 1
batch size 512
General test over a wide range of values to see how the model reacts to different numbers of neurons.

### Search Space

Search space summary
Default search space size: 4
embedding_neurons (Int)
{'default': None, 'conditions': [], 'min_value': 450, 'max_value': 550, 'step': 50, 'sampling': None}
dense1_neurons (Int)
{'default': None, 'conditions': [], 'min_value': 0, 'max_value': 400, 'step': 50, 'sampling': None}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'RMSprop'], 'ordered': False}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


### Results

Trial 74 Complete [00h 00m 06s]
val_loss: 0.8427369594573975

Best val_loss So Far: 0.8339388370513916
Total elapsed time: 00h 11m 06s
INFO:tensorflow:Oracle triggered exit
Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name='val_loss', direction='min')

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 250
optimizer: RMSprop
learning_rate: 0.001
Score: 0.8339388370513916

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 250
optimizer: RMSprop
learning_rate: 0.01
Score: 0.8427369594573975

Trial summary
Hyperparameters:
embedding_neurons: 550
dense1_neurons: 300
optimizer: RMSprop
learning_rate: 0.01
Score: 0.844895601272583

Trial summary
Hyperparameters:
embedding_neurons: 550
dense1_neurons: 250
optimizer: RMSprop
learning_rate: 0.001
Score: 0.8483849763870239

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 250
optimizer: RMSprop
learning_rate: 0.0001
Score: 0.8536829948425293

Trial summary
Hyperparameters:
embedding_neurons: 450
dense1_neurons: 300
optimizer: RMSprop
learning_rate: 0.01
Score: 0.8559091091156006

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 300
optimizer: RMSprop
learning_rate: 0.0001
Score: 0.8602085113525391

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 300
optimizer: RMSprop
learning_rate: 0.01
Score: 0.8614881634712219

Trial summary
Hyperparameters:
embedding_neurons: 550
dense1_neurons: 350
optimizer: RMSprop
learning_rate: 0.01
Score: 0.8623285889625549

Trial summary
Hyperparameters:
embedding_neurons: 500
dense1_neurons: 200
optimizer: RMSprop
learning_rate: 0.01
Score: 0.863603413105011



# Test 2 
batch size 512
In this test I keep the number of embedding neurons constant and focus on the number of hidden layers and hidden neurons.

### Search Space
Search space summary
Default search space size: 5
embedding_neurons (Int)
{'default': None, 'conditions': [], 'min_value': 500, 'max_value': 500, 'step': 1, 'sampling': None}
num_hidden_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': None}
dense0_neurons (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 210, 'step': 50, 'sampling': None}
optimizer (Choice)
{'default': 'adam', 'conditions': [], 'values': ['adam', 'RMSprop'], 'ordered': False}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}
### Results

Trial 132 Complete [00h 00m 21s]
val_loss: 0.7553905248641968

Best val_loss So Far: 0.7398236989974976
Total elapsed time: 01h 28m 48s

Search: Running Trial #133

Hyperparameter    |Value             |Best Value So Far 
embedding_neurons |500               |500               
num_hidden_layers |5                 |5                 
dense0_neurons    |110               |60                
optimizer         |RMSprop           |RMSprop           
learning_rate     |0.0001            |0.0001            
dense1_neurons    |160               |160               
dense2_neurons    |160               |160               
dense3_neurons    |160               |210               
dense4_neurons    |160               |160  

# Test 3
Only changing the number of embedding neurons to find the best value, searching broadly from 100 to 1000.

batch size 512
### Search Space
Search space summary
Default search space size: 1
embedding_neurons (Int) {'default': None, 'conditions': [], 'min_value': 100, 'max_value': 1000, 'step': 100, 'sampling': None}
### Results

Trial 8 Complete [00h 00m 02s]
val_loss: 1.7155439853668213

Best val_loss So Far: 0.8410091996192932
Total elapsed time: 00h 00m 58s
Showing 10 best trials
Objective(name='val_loss', direction='min')

Trial summary
Hyperparameters:
embedding_neurons: 500
Score: 0.8410091996192932

Trial summary
Hyperparameters:
embedding_neurons: 400
Score: 0.8689964413642883

Trial summary
Hyperparameters:
embedding_neurons: 700
Score: 0.8894386887550354

Trial summary
Hyperparameters:
embedding_neurons: 300
Score: 0.91794753074646

Trial summary
Hyperparameters:
embedding_neurons: 800
Score: 0.926380455493927

Trial summary
Hyperparameters:
embedding_neurons: 200
Score: 1.0575706958770752

Trial summary
Hyperparameters:
embedding_neurons: 900
Score: 1.0936203002929688

Trial summary
Hyperparameters:
embedding_neurons: 100
learning_rate: 0.001
Score: 1.7155439853668213

# Test 4
Narrowing down on embedding neurons: testing 400-700 with a step size of 50.

batch size 512
### Search Space
Search space summary
Default search space size: 1
embedding_neurons (Int) {'default': None, 'conditions': [], 'min_value': 400, 'max_value': 700, 'step': 50, 'sampling': None}
### Results
Trial 5 Complete [00h 00m 11s]
val_loss: 0.8536071181297302

Best val_loss So Far: 0.8405100107192993
Total elapsed time: 00h 00m 40s
INFO:tensorflow:Oracle triggered exit
Results summary
Hyperparameters:

embedding_neurons: 600
Score: 0.8405100107192993

Trial summary
Hyperparameters:
embedding_neurons: 400
Score: 0.8456131815910339

Trial summary
Hyperparameters:
embedding_neurons: 650
Score: 0.8536071181297302

Trial summary
Hyperparameters:
embedding_neurons: 450
Score: 0.8710063695907593

Trial summary
Hyperparameters:
embedding_neurons: 550
Score: 0.8793086409568787


# Test 5
Narrowing down on embedding neurons: testing 500-800 with a step size of 20.

batch size 512
### Search Space
Search space summary
Default search space size: 1
embedding_neurons (Int) {'default': None, 'conditions': [], 'min_value': 500, 'max_value': 800, 'step': 20, 'sampling': None}
### Results
Trial 15 Complete [00h 00m 08s]
val_loss: 0.8540633320808411

Best val_loss So Far: 0.8480988144874573
Total elapsed time: 00h 02m 21s
INFO:tensorflow:Oracle triggered exit
Results summary

Trial summary
Hyperparameters:
embedding_neurons: 500
Score: 0.8480988144874573

Trial summary
Hyperparameters:
embedding_neurons: 640
Score: 0.8524157404899597

Trial summary
Hyperparameters:
embedding_neurons: 600
Score: 0.8538790345191956

Trial summary
Hyperparameters:
embedding_neurons: 680
Score: 0.8540633320808411

Trial summary
Hyperparameters:
embedding_neurons: 520
Score: 0.856784999370575

Trial summary
Hyperparameters:
embedding_neurons: 540
Score: 0.8585261702537537

Trial summary
Hyperparameters:
embedding_neurons: 560
Score: 0.8655727505683899

Trial summary
Hyperparameters:
embedding_neurons: 700
Score: 0.8688831329345703

Trial summary
Hyperparameters:
embedding_neurons: 660
Score: 0.8715007305145264

Trial summary
Hyperparameters:
embedding_neurons: 620
Score: 0.8724114894866943



# Test 6
Focusing on the number of hidden layers and the number of neurons per layer.

batch size 512
### Search Space
Search space summary
Default search space size: 2
num_hidden_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': None}
dense0_neurons (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 310, 'step': 100, 'sampling': None}

### Results

Trial 122 Complete [00h 00m 13s]
val_loss: 1.141672968864441

Best val_loss So Far: 0.7372345924377441
Total elapsed time: 00h 36m 48s

Search: Running Trial #123

Hyperparameter    |Value             |Best Value So Far 
num_hidden_layers |4                 |3                 
dense0_neurons    |110               |110               
dense1_neurons    |210               |110               
dense2_neurons    |110               |110               
dense3_neurons    |110               |310 