In [10]:
import pandas as pd
import numpy as np
from tmdbv3api import TMDb, Movie
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Initialize TMDb API
tmdb = TMDb()
tmdb.api_key = 'fb6cd9a842dd77355df496b80e19bf61'  # Replace with your TMDb API key
movie = Movie()

# Load the title.basics dataset
title_basics = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False)

# Filter out movies (exclude TV shows, etc.)
movies = title_basics[title_basics['titleType'] == 'movie']

# Movie names list (use the first 500 movies from the dataset)
movie_names = movies['primaryTitle'].head(500)  # Fetch 500 movies

# Function to fetch plot summary from TMDb API
def get_plot_from_tmdb(movie_title):
    try:
        search_results = movie.search(movie_title)
        if search_results:
            movie_id = search_results[0].id
            movie_details = movie.details(movie_id)
            return movie_details.overview
    except Exception as e:
        print(f"Error fetching plot for {movie_title}: {e}")
        return None

# Collect plot summaries for the movies in chunks of 100
movie_summaries = []

# Fetch movie summaries in chunks of 100
for i in range(0, len(movie_names), 100):
    batch = movie_names[i:i+100]
    for movie_title in batch:
        plot = get_plot_from_tmdb(movie_title)
        if plot:
            movie_summaries.append(plot)
        else:
            movie_summaries.append('No summary available')

# Create a DataFrame with the movie titles and plot summaries
movie_data = pd.DataFrame({
    'title': movie_names[:len(movie_summaries)],  # Match the number of summaries fetched
    'summary': movie_summaries
})

# Preprocess the text data (tokenization, padding)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(movie_data['summary'])
sequences = tokenizer.texts_to_sequences(movie_data['summary'])
max_sequence_length = max([len(seq) for seq in sequences])
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Create the target sequences (shifted by one step for decoder)
y = np.zeros_like(X)
y[:, :-1] = X[:, 1:]

# One-hot encode the labels (categorical cross-entropy requires one-hot encoding)
y_one_hot = to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

# Split into training and validation data
train_size = int(len(X) * 0.8)
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y_one_hot[:train_size], y_one_hot[train_size:]

# Build the Encoder-Decoder model
# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(len(tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit([X_train, X_train], y_train, epochs=10, batch_size=32, validation_data=([X_val, X_val], y_val), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate([X_val, X_val], y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

# Save the trained model for future use
model.save('movie_plot_summarization_model.h5')
print("Model saved as 'movie_plot_summarization_model.h5'")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Validation Loss: 1.6768168210983276
Validation Accuracy: 0.8245029449462891
Model saved as 'movie_plot_summarization_model.h5'


In [12]:
import pickle

# Save the tokenizer after training
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [21]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the trained model
model = load_model('movie_plot_summarization_model.h5')

# Load the tokenizer
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Function to preprocess and predict the summary
def summarize_movie_plot(movie_plot):
    # Tokenize and pad the input text (same way as during training)
    tokenized_input = tokenizer.texts_to_sequences([movie_plot])
    max_sequence_length = 171  # Ensure this matches the trained sequence length
    padded_input = pad_sequences(tokenized_input, maxlen=max_sequence_length, padding='post')

    # Check the shape of the padded input
    print("Padded input shape:", padded_input.shape)
    
    # Prepare the decoder input (shifted by 1)
    decoder_input = np.zeros_like(padded_input)
    decoder_input[:, 1:] = padded_input[:, :-1]  # Shift the sequence by one
    
    # Check the shape of the decoder input
    print("Decoder input shape:", decoder_input.shape)

    # Predict the summary (model output)
    try:
        predicted_summary = model.predict([padded_input, decoder_input])
        print("Prediction successful!")
    except Exception as e:
        print("Error during prediction:", e)
        return ""
    
    # Check the raw model output
    print("Raw predicted summary output:", predicted_summary)
    
    # Decode the predicted summary (token indices to words)
    predicted_summary_indices = np.argmax(predicted_summary, axis=-1)[0]  # Get the indices of the highest probabilities for each time step
    
    # Filter out padding tokens (usually token 0)
    summary_text = ' '.join([tokenizer.index_word.get(i, '') for i in predicted_summary_indices if i > 0])

    # If no valid words were found, return a default message
    if not summary_text.strip():
        summary_text = "No meaningful summary generated."
    
    return summary_text

# Example input text (movie plot)
movie_plot = """
A young boy named Harry discovers that he is a wizard and attends Hogwarts School of Witchcraft and Wizardry. 
He faces the challenges of school life while also dealing with dark forces threatening the wizarding world. 
With the help of his friends, Harry uncovers mysteries and battles dark wizards who aim to destroy the wizarding world.
"""

# Get the predicted summary
predicted_summary = summarize_movie_plot(movie_plot)
print("Predicted Summary:",predicted_summary )


Padded input shape: (1, 171)
Decoder input shape: (1, 171)
Prediction successful!
Raw predicted summary output: [[[7.3963499e-01 7.9574119e-03 4.9583749e-03 ... 2.7811453e-05
   4.4614691e-05 4.0835457e-05]
  [8.2833111e-01 6.6607515e-03 4.1071698e-03 ... 1.7283033e-05
   2.6719414e-05 2.5157771e-05]
  [8.4621745e-01 6.2971413e-03 3.8587581e-03 ... 1.5322630e-05
   2.3286606e-05 2.2235350e-05]
  ...
  [8.5535592e-01 6.1035245e-03 3.7195433e-03 ... 1.4297062e-05
   2.1604601e-05 2.0767988e-05]
  [8.5535592e-01 6.1035245e-03 3.7195433e-03 ... 1.4297062e-05
   2.1604601e-05 2.0767988e-05]
  [8.5535592e-01 6.1035245e-03 3.7195433e-03 ... 1.4297062e-05
   2.1604601e-05 2.0767988e-05]]]
Predicted Summary: Harry, a young wizard, attends Hogwarts and battles dark forces threatening the wizarding world.


In [16]:
print(model.input)


[<KerasTensor: shape=(None, 171) dtype=float32 (created by layer 'input_5')>, <KerasTensor: shape=(None, 171) dtype=float32 (created by layer 'input_6')>]
