In [1]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical

2023-11-13 12:38:53.733342: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
# Load the dataset
data = pd.read_csv('data/train.csv')

# Preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

data = data[['movie_name', 'synopsis', 'genre']]
data['synopsis'] = data['synopsis'].apply(preprocess_text)

# Preprocess the genres
data['genre'] = data['genre'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genre'])

[nltk_data] Downloading package stopwords to /Users/brad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/brad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Load the pre-trained GloVe embeddings
embeddings_index = {}
path = '../glove.6B/glove.6B.100d.txt'
with open(path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Tokenize and pad the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['synopsis'])
sequences = tokenizer.texts_to_sequences(data['synopsis'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)

# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
def compile_model(layers=256, dropout=0.2, lr=0.001):# Define the LSTM model
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=300, trainable=False))
    model.add(LSTM(layers, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(len(mlb.classes_), activation='sigmoid'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=lr), metrics=['accuracy'])
    return model


In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, genres_encoded, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [7]:

def train_model(model, X_train, y_train, X_val, y_val):
    checkpoint_path = "models/LSTM_checkpoint.h5"
    model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True, monitor='val_accuracy', mode='max', verbose=1)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max', verbose=1)

    history = model.fit(X_train, y_train, batch_size=64, epochs=15, validation_data=(X_val, y_val), callbacks=[model_checkpoint, early_stopping])
    model.load_weights(checkpoint_path)

model = compile_model()
train_model(model, X_train, y_train, X_val, y_val)

Epoch 1/15


KeyboardInterrupt: 

In [7]:
model = compile_model()
model.load_weights('models/LSTM_checkpoint.h5')

In [None]:
def hyperparameters(X, Y, k=5):
    dropouts = [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5]
    hidden = [32, 64, 128, 256, 512]
    learning_rates = [0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
    high_score = 0
    best_params = {}

    for d in dropouts:
        for h in hidden:
            for lr in learning_rates:
                model = compile_model(h, d, lr)

                kf = KFold(n_splits=k, shuffle=True, random_state=42)
                for train_index, test_index in kf.split(X):
                    X_train, X_val = X[train_index], X[test_index]
                    y_train, y_val = Y[train_index], Y[test_index]

                    checkpoint_path = "models/LSTM_checkpoint.h5"
                    model_checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True, monitor='val_accuracy', mode='max', verbose=1)
                    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max', verbose=1)

                    history = model.fit(X_train, y_train, batch_size=64, epochs=15, validation_data=(X_val, y_val), callbacks=[model_checkpoint, early_stopping])
                    best_val = max(history.history['val_accuracy'])
                    if best_val > high_score:
                        high_score = best_val
                        best_params['dropout'] = d
                        best_params['hidden layers'] = h
                        best_params['learning rate'] = lr
    print('Accuracy:', high_score)
    print(best_params)

In [8]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_onehot = to_categorical(y_pred_classes, num_classes=10)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_onehot)
precision = precision_score(y_test, y_pred_onehot, average='macro')
recall = recall_score(y_test, y_pred_onehot, average='macro')
f1 = f1_score(y_test, y_pred_onehot, average='macro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.37407407407407406
Precision: 0.3556678496445766
Recall: 0.37177634037448415
F1-score: 0.3521443048703147


In [15]:
# Identify misclassified instances
misclassified = []
for i in range(5400):
    if (np.any(y_test[i] != y_pred_onehot[i])):
        misclassified.append(i)
misclassified = np.where(np.any(y_test != y_pred_onehot, axis=1))[0]

# Analyze misclassified instances and suggest improvements
for idx in misclassified[:10]:
    print(f"Movie: {data['movie_name'][idx]}")
    print(f"Actual genres: {', '.join(mlb.classes_[y_pred_onehot[idx] == 1])}")
    print(f"Predicted genres: {', '.join(mlb.classes_[y_pred_onehot[idx] == 1])}")
    print()

Movie: Super Me
Actual genres: fantasy
Predicted genres: mystery

Movie: Behavioral Family Therapy for Serious Psychiatric Disorders
Actual genres: family
Predicted genres: adventure

Movie: Blood Glacier
Actual genres: scifi
Predicted genres: horror

Movie: Apat na anino
Actual genres: action
Predicted genres: romance

Movie: Le démon dans l'île
Actual genres: horror
Predicted genres: horror

Movie: Hired
Actual genres: crime
Predicted genres: romance

Movie: Until You See Me
Actual genres: mystery
Predicted genres: scifi

Movie: Shamus
Actual genres: mystery
Predicted genres: mystery

Movie: Crushed
Actual genres: horror
Predicted genres: thriller

Movie: Vampires
Actual genres: horror
Predicted genres: fantasy

