In [14]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
# Load the dataset
data = pd.read_csv('data/train.csv')

# Preprocess the text
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize, remove stopwords, and lemmatize
    tokens = nltk.word_tokenize(text.lower())
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalnum()]
    return ' '.join(filtered_tokens)

data = data[['movie_name', 'synopsis', 'genre']]
data['synopsis'] = data['synopsis'].apply(preprocess_text)

# Preprocess the genres
data['genre'] = data['genre'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genre'])

[nltk_data] Downloading package stopwords to /Users/brad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/brad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Load the pre-trained GloVe embeddings
embeddings_index = {}
path = '../glove.6B/glove.6B.100d.txt'
with open(path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Tokenize and pad the text sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['synopsis'])
sequences = tokenizer.texts_to_sequences(data['synopsis'])
word_index = tokenizer.word_index
padded_sequences = pad_sequences(sequences, maxlen=300)

# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=300, trainable=False))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(mlb.classes_), activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, genres_encoded, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, batch_size=64, epochs=12, validation_data=(X_test, y_test))

Epoch 1/12
 44/675 [>.............................] - ETA: 10:00 - loss: 0.3586 - accuracy: 0.1229

KeyboardInterrupt: 

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_onehot = to_categorical(y_pred_classes, num_classes=10)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_onehot)
precision = precision_score(y_test, y_pred_onehot, average='micro')
recall = recall_score(y_test, y_pred_onehot, average='micro')
f1 = f1_score(y_test, y_pred_onehot, average='micro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.3610185185185185
Precision: 0.3610185185185185
Recall: 0.3610185185185185
F1-score: 0.3610185185185185


In [25]:
# Identify misclassified instances
misclassified = np.where(np.any(y_test != y_pred_onehot, axis=1))[0]

# Analyze misclassified instances and suggest improvements
for idx in misclassified[:10]:
    print(f"Movie: {data['movie_name'][idx]}")
    print(f"Actual genres: {', '.join(data['genre'][idx])}")
    print(f"Predicted genres: {', '.join(mlb.classes_[y_pred_onehot[idx] == 1])}")
    print()

Movie: Super Me
Actual genres: fantasy
Predicted genres: mystery

Movie: Behavioral Family Therapy for Serious Psychiatric Disorders
Actual genres: family
Predicted genres: adventure

Movie: Blood Glacier
Actual genres: scifi
Predicted genres: horror

Movie: Apat na anino
Actual genres: action
Predicted genres: romance

Movie: Le démon dans l'île
Actual genres: horror
Predicted genres: horror

Movie: Candid
Actual genres: horror
Predicted genres: mystery

Movie: Hired
Actual genres: crime
Predicted genres: romance

Movie: Miruthan
Actual genres: adventure
Predicted genres: adventure

Movie: Until You See Me
Actual genres: mystery
Predicted genres: scifi

Movie: Shamus
Actual genres: mystery
Predicted genres: mystery

