In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
!pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model



In [3]:
import os

# File paths
train_path = r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\train_data.txt'
test_path = r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\test_data.txt'
desc_path = r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\description.txt'

# Check if files exist
if not all(map(os.path.exists, [train_path, test_path, desc_path])):
    raise FileNotFoundError("One or more files were not found. Please check the file paths.")

try:
    with open(train_path, 'r', encoding='utf-8') as train_file:
        train_lines_new = train_file.readlines()
        train_data = [line.strip().split(' ::: ') for line in train_lines_new]

    with open(test_path, 'r', encoding='utf-8') as test_file:
        test_lines_new = test_file.readlines()
        test_data = [line.strip().split(' ::: ') for line in test_lines_new]

    with open(desc_path, 'r', encoding='utf-8') as desc_file:
        descriptions = desc_file.readlines()

except FileNotFoundError as e:
    print(f"File not found: {e}")
except IOError as e:
    print(f"IO error occurred: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [4]:
all_descriptions = [item[3] for item in train_data] + [item[2] for item in test_data] + descriptions

In [5]:
labels = [item[2] for item in train_data] + [None for _ in test_data]

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(all_descriptions)

In [8]:
sequences = tokenizer.texts_to_sequences(all_descriptions)

In [9]:
# Pading sequences to a fixed length
max_sequence_length = 100  # adjusting this based on our data
sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [10]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [11]:
train_sequences = sequences[:len(train_data)]
test_sequences = sequences[len(train_data):len(train_data) + len(test_data)]

In [12]:
X_train, X_val, y_train, y_val = train_test_split(train_sequences, encoded_labels[:len(train_data)], test_size=0.2, random_state=42)

In [13]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))



In [14]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [46]:
from sklearn.preprocessing import LabelBinarizer
from keras.callbacks import  EarlyStopping
from keras.optimizers import Adam

# Sample data (you should replace this with your actual dataset)
texts = [
    "This movie was great and thrilling", 
    "The film was boring and slow", 
    "An excellent horror movie with a gripping story", 
    "A terrible film with poor acting", 
    "A masterpiece of modern cinema"
]
genres = ["Action", "Drama", "Horror", "Drama", "Action"]

# Preprocessing
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)

# One-hot encode the genre labels
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(genres)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the path to save the best model
model_save_path = r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\movie_genre_classifier.keras'

# Create a ModelCheckpoint callback
checkpoint = ModelCheckpoint(
    filepath=model_save_path,
    monitor='val_accuracy',    # Monitor validation accuracy
    save_best_only=True,       # Save only the best model
    mode='max',                # Maximize the monitored quantity
    verbose=1                  # Verbosity mode (0 or 1)
)

# Create an EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',        # Monitor validation loss
    patience=3,                # Number of epochs with no improvement before stopping
    verbose=1,                 # Verbosity mode (0 or 1)
    restore_best_weights=True  # Restore model weights from the best epoch
)

# List of callbacks to be used during training
callbacks_list = [checkpoint, early_stopping]

# Model architecture
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),  # Embedding layer
    Bidirectional(LSTM(64, return_sequences=True)),  # Bidirectional LSTM
    Dropout(0.5),                                    # Dropout for regularization
    LSTM(32),                                        # LSTM layer
    Dropout(0.5),                                    # Dropout for regularization
    Dense(y.shape[1], activation='softmax')          # Output layer for multi-class classification
])

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),             # Adam optimizer
    loss='categorical_crossentropy',                 # Loss function for multi-class classification
    metrics=['accuracy']                             # Metrics to evaluate during training
)

# Print model summary
model.summary()

# Training the model
history = model.fit(
    X_train, 
    y_train, 
    validation_data=(X_val, y_val),                  # Validation data
    epochs=20,                                       # Number of training epochs
    batch_size=32,                                   # Batch size
    callbacks=callbacks_list                         # Use the ModelCheckpoint and EarlyStopping callbacks
)

# After training, the best model is saved at 'movie_genre_classifier.keras'
print(f"Best model saved at: {model_save_path}")




Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.2500 - loss: 1.1002
Epoch 1: val_accuracy improved from -inf to 0.00000, saving model to C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\movie_genre_classifier.keras
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step - accuracy: 0.2500 - loss: 1.1002 - val_accuracy: 0.0000e+00 - val_loss: 1.1331
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.2500 - loss: 1.1116
Epoch 2: val_accuracy did not improve from 0.00000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.2500 - loss: 1.1116 - val_accuracy: 0.0000e+00 - val_loss: 1.1233
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.5000 - loss: 1.0892
Epoch 3: val_accuracy did not improve from 0.00000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - a

In [26]:
batch_size = 32
epochs = 10
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs, callbacks=callbacks_list)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.5000 - loss: 1.0878
Epoch 1: val_accuracy did not improve from 0.00000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step - accuracy: 0.5000 - loss: 1.0878 - val_accuracy: 0.0000e+00 - val_loss: 1.1399
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 0.5000 - loss: 1.0616
Epoch 2: val_accuracy did not improve from 0.00000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.5000 - loss: 1.0616 - val_accuracy: 0.0000e+00 - val_loss: 1.1684
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.5000 - loss: 1.0611
Epoch 3: val_accuracy did not improve from 0.00000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step - accuracy: 0.5000 - loss: 1.0611 - val_accuracy: 0.0000e+00 - val_loss: 1.1990
Epoch 3: early stopping
Restoring 

In [30]:
from tensorflow.keras.models import load_model

model.save(r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\moviegenere.h5')



In [36]:
loaded_model = load_model(r'C:\Users\cp756\OneDrive\Desktop\archive\Genre Classification Dataset\moviegenere.h5')



In [38]:
# Test the model with a sample movie summary
movie_summary = [" In The Secret Garden, a young girl named Mary, who is orphaned and lonely, discovers a hidden, neglected garden on her uncles estate. As she works to revive the garden, she also begins to heal emotionally and make new friends. This heartwarming story explores themes of growth, friendship, and the transformative power of nature."]
movie_summary_sequence = tokenizer.texts_to_sequences(movie_summary)
movie_summary_padded = pad_sequences(movie_summary_sequence, maxlen=max_sequence_length)
predicted_label = loaded_model.predict(movie_summary_padded)

predicted_genre = label_encoder.inverse_transform([np.argmax(predicted_label)])
print(f"Predicted Genre: {predicted_genre[0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Genre: action
