In [20]:
import os
import numpy as np 
import pandas as pd
import random
import keras
import torch
import tensorflow as tf
import optuna
from optuna import Trial
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from sklearn.metrics import log_loss
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
from sklearn import metrics
SEED = 99
def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    tf.random.set_seed(SEED)
random_seed(SEED)
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [38]:
# Assuming your data is in a CSV file named 'movie_data.csv'
data = pd.read_csv('movies_initial.csv')

data = data[['genre','fullplot']]

data = data.dropna(subset=['genre', 'fullplot'])

data_s=data
data = data_s.sample(frac=0.01, random_state=42)

print(data)

                           genre  \
33876             Drama, Romance   
20454                     Comedy   
25431              Action, Drama   
3451                     Western   
36880                   Thriller   
...                          ...   
42556  Biography, Drama, History   
29156                Documentary   
7459     Biography, Drama, Music   
20628       Drama, Comedy, Crime   
20593                   Thriller   

                                                fullplot  
33876  Ronnie's (Miley Cyrus) and her younger brother...  
20454  A middle-aged man's conservative life is distu...  
25431  In Los Angeles, an ex-con takes the undergroun...  
3451   After Confederate officer Blayde Hollister's h...  
36880  An engaged couple's backpacking trip in the Ca...  
...                                                  ...  
42556  The movie depicts the political crisis that le...  
29156  We open with stories of one man's brilliant id...  
7459   One of the greatest masterpie

In [39]:
print("Empty values in the DataFrame:")
print(data.isnull().values.any())

Empty values in the DataFrame:
False


In [49]:

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=111)

# Convert genre labels into a binary matrix for multi-label classification
y_train = train_data['genre'].apply(lambda x: x.split(', '))
y_test = test_data['genre'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

sentences = data['fullplot'].tolist()

print(mlb.classes_)

['Action' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Film-Noir' 'History' 'Horror'
 'Music' 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Short' 'Sport' 'Thriller'
 'War' 'Western']


In [54]:
model = SentenceTransformer('bert-base-uncased')
model.max_seq_length = 512
print("Max Sequence Length:", model.max_seq_length)
sentence_embeddings = model.encode(sentences)
# Assuming 'genre' is a list of genres for each movie
# Convert the 'genre' column to a DataFrame with separate columns for each tag
genres_df = pd.DataFrame(data['genre'].tolist(), columns=['Action' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Film-Noir' 'History' 'Horror'
 'Music' 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Short' 'Sport' 'Thriller'
 'War' 'Western'])

# Display the resulting DataFrame
print(genres_df)

No sentence-transformers model found with name C:\Users\nchha/.cache\torch\sentence_transformers\bert-base-uncased. Creating a new one with MEAN pooling.


Max Sequence Length: 512
    ActionAdventureAnimationBiographyComedyCrimeDocumentaryDramaFamilyFantasyFilm-NoirHistoryHorrorMusicMusicalMysteryRomanceSci-FiShortSportThrillerWarWestern
0                                       Drama, Romance                                                                                                         
1                                               Comedy                                                                                                         
2                                        Action, Drama                                                                                                         
3                                              Western                                                                                                         
4                                             Thriller                                                                                                         
..             

In [50]:
train1_x, test_x, train1_y, test_y = train_test_split(sentence_embeddings, 
                                                      mlb.classes_, 
                                                      train_size=0.8, 
                                                      test_size=0.2, 
                                                      random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [412, 23]

In [45]:
def objective(trial):
    keras.backend.clear_session()
    train_x, valid_x, train_y, valid_y = train_test_split(train1_x, train1_y, train_size=0.8, test_size=0.2,
                                                                random_state=42)
    #optimum number of hidden layers
    n_layers = trial.suggest_int('n_layers', 1, 3)
    model = keras.Sequential()
    for i in range(n_layers):
        #optimum number of hidden nodes
        num_hidden = trial.suggest_int(f'n_units_l{i}', 48, len(sentence_embeddings[0]), log=True)
        #optimum activation function
        model.add(keras.layers.Dense(num_hidden, input_shape=(len(sentence_embeddings[0]),),
                               activation=trial.suggest_categorical(f'activation{i}', ['relu', 'linear','swish'])))
        #optimum dropout value
        model.add(keras.layers.Dropout(rate = trial.suggest_float(f'dropout{i}', 0.0, 0.6))) 
    model.add(keras.layers.Dense(5,activation=tf.keras.activations.sigmoid)) #output Layer
    val_ds = (valid_x,valid_y)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,patience=1,min_lr=1e-05,verbose=0)
    early_stoping = EarlyStopping(monitor="val_loss",min_delta=0,patience=5,verbose=0,mode="auto", baseline=None,restore_best_weights=True)
    model.compile(loss='binary_crossentropy',metrics='categorical_crossentropy', optimizer='Adam')
    #optimum batch size
    histroy = model.fit(train_x,train_y, validation_data=val_ds,epochs=200,callbacks=[reduce_lr,early_stoping],verbose=0,
                       batch_size=trial.suggest_int('size', 8, 128))
    return min(histroy.history['val_loss'])

In [46]:
if __name__ == "__main__":
  study = optuna.create_study(direction="minimize")
  study.optimize(objective, n_trials=50, timeout=1200)
  print("Number of finished trials: {}".format(len(study.trials)))
  print("Best trial:")
  trial = study.best_trial
  print("  Value: {}".format(trial.value))

[I 2023-11-23 16:27:53,812] A new study created in memory with name: no-name-a43fda87-a8e7-47a5-bbc6-19574754dc95





[W 2023-11-23 16:27:53,886] Trial 0 failed with parameters: {} because of the following error: NameError("name 'train1_x' is not defined").
Traceback (most recent call last):
  File "C:\Users\nchha\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\nchha\AppData\Local\Temp\ipykernel_31088\1409428129.py", line 3, in objective
    train_x, valid_x, train_y, valid_y = train_test_split(train1_x, train1_y, train_size=0.8, test_size=0.2,
                                                          ^^^^^^^^
NameError: name 'train1_x' is not defined
[W 2023-11-23 16:27:53,887] Trial 0 failed with value None.


NameError: name 'train1_x' is not defined