In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import re
import nltk
from nltk.corpus import stopwords
from langdetect import detect  # Language detection to filter non-English texts

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from numpy.random import seed

# Download necessary nltk packages
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

# Ensure reproducibility
seed(1)
tf.random.set_seed(2)
# langdetect is non-deterministic unless its factory is seeded before the
# first detect() call; without this the English filter can differ per run.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Load datasets (load from files or sample from larger datasets)
df1 = pd.read_csv('USvideos.csv').sample(n=1000)  # Sample 1000 for better training quality
df2 = pd.read_csv('CAvideos.csv').sample(n=1000)
df3 = pd.read_csv('GBvideos.csv').sample(n=1000)


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Load the category data (replace with your file paths)
data1 = _load_json('US_category_id.json')
data2 = _load_json('CA_category_id.json')
data3 = _load_json('GB_category_id.json')

# Category extractor function
def category_extractor(data):
    """Build a ``{category_id: title}`` lookup from a category JSON payload.

    Args:
        data: Mapping with an ``'items'`` list; each item carries an ``'id'``
            (converted with ``int``) and a ``'snippet'`` mapping with a
            ``'title'``.

    Returns:
        dict mapping integer category ids to their titles. When an id occurs
        more than once, the last occurrence wins.
    """
    return {
        int(item['id']): item['snippet']["title"]
        for item in data['items']
    }

# Attach a readable category title to each regional frame, pairing every
# frame with the category JSON loaded for the same region.
for frame, raw_categories in ((df1, data1), (df2, data2), (df3, data3)):
    id_to_title = category_extractor(raw_categories)
    frame['category_title'] = frame['category_id'].map(id_to_title)

# Stack the three regions into one frame, then keep a single row per video_id
df = pd.concat([df1, df2, df3], ignore_index=True).drop_duplicates('video_id')

# Function to clean the text and detect only English titles
def clean_text(text):
    """Normalise a title for training, or return '' if it should be dropped.

    The title is kept only when ``detect`` reports it as English. Kept titles
    are lowercased, stripped of punctuation and digits, and filtered against
    ``STOPWORDS``; the remaining words are joined by single spaces.

    Args:
        text: Raw title. Non-string values (e.g. missing titles) yield ''.

    Returns:
        The cleaned title, or '' when the title is not English, cannot be
        analysed, or has no words left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    # Detect language and only keep English titles
    try:
        if detect(text) != 'en':
            return ''
    except Exception:
        # Detection fails on titles with no usable features (e.g. only
        # symbols). Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        return ''

    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    # Remove numbers *before* splitting so that digit-only tokens disappear
    # entirely instead of leaving double spaces or a whitespace-only result.
    text = re.sub(r'\d+', '', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Filter for 'Entertainment' videos (category_id = 24)
entertainment = df[df['category_id'] == 24]['title'].tolist()

# Clean the text data. Each title is cleaned exactly once: language detection
# is the expensive step, and calling it twice per title could also let the
# filter and the stored value disagree.
cleaned_titles = (clean_text(title) for title in entertainment)
corpus = [title for title in cleaned_titles if title]

# Tokenizer with a larger vocabulary size (relax restriction to 10,000)
tokenizer = Tokenizer(oov_token="<OOV>", filters='', num_words=10000)  # Larger vocabulary size
tokenizer.fit_on_texts(corpus)
# texts_to_sequences never emits an index >= num_words (rarer words map to
# the OOV index), so cap the class count to avoid unused output units.
total_words = min(len(tokenizer.word_index) + 1, tokenizer.num_words)

# Convert text to sequences of tokens
def get_sequence_of_tokens(corpus):
    """Expand each text into all of its token prefixes of length >= 2.

    Every line is encoded with the module-level ``tokenizer``; a line encoded
    as ``[a, b, c]`` contributes ``[a, b]`` and ``[a, b, c]``. Lines that
    encode to fewer than two tokens contribute nothing.

    Args:
        corpus: Iterable of text lines.

    Returns:
        List of token-id lists, in corpus order.
    """
    def _prefixes(tokens):
        # Prefix end positions 2..len(tokens) give slices of length >= 2.
        return [tokens[:end] for end in range(2, len(tokens) + 1)]

    sequences = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        sequences.extend(_prefixes(encoded))
    return sequences

# Expand every cleaned title in the corpus into its token-prefix sequences
input_sequences = get_sequence_of_tokens(corpus)

# Padding sequences
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    Sequences are left-padded with zeros to the length of the longest one.
    The final token of each padded row becomes the label and the preceding
    tokens become the predictors.

    Args:
        input_sequences: Non-empty list of token-id lists.

    Returns:
        Tuple ``(predictors, label, max_sequence_len)`` where ``predictors``
        is the padded array without its last column, ``label`` is the last
        column one-hot encoded over ``total_words`` classes, and
        ``max_sequence_len`` is the padded row length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # Fail with an actionable message instead of the opaque max() error.
        raise ValueError(
            "input_sequences is empty: no title produced a sequence of at "
            "least two tokens"
        )

    max_sequence_len = max(len(seq) for seq in input_sequences)  # Find max sequence length dynamically
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    predictors, label = padded[:, :-1], padded[:, -1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

# Build the training inputs/labels and record the padded sequence length
predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

# Build the model with Bidirectional LSTM
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: embedding (150 dims) -> bidirectional LSTM (200 units,
    returning sequences) -> dropout 0.3 -> LSTM (150 units) -> dropout 0.3 ->
    dense ReLU (150 units) -> dense softmax over ``total_words`` classes.

    Args:
        max_sequence_len: Length of the padded training sequences including
            the label token; the model input length is one less.
        total_words: Number of token classes (embedding rows and output units).

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and the Adam optimizer.

    Raises:
        ValueError: If ``max_sequence_len`` leaves no input tokens.
    """
    input_len = max_sequence_len - 1
    if input_len < 1:
        raise ValueError("max_sequence_len must be at least 2")

    model = Sequential([
        # Declare the input shape explicitly; the Embedding ``input_length``
        # argument is deprecated in Keras 3.
        tf.keras.Input(shape=(input_len,)),

        # Embedding layer: larger embedding for richer representations
        Embedding(total_words, 150),

        # Bidirectional LSTM layer: increased units for more context
        Bidirectional(LSTM(200, return_sequences=True)),
        Dropout(0.3),  # Higher dropout to prevent overfitting

        # LSTM layer
        LSTM(150),
        Dropout(0.3),

        # Dense layers
        Dense(150, activation='relu'),
        Dense(total_words, activation='softmax'),
    ])

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = create_model(max_sequence_len, total_words)

# Train the model with early stopping and a slightly lower batch size.
# Training loss is not monotonic, so restore the weights from the best epoch
# rather than keeping whatever the final (possibly worse) epoch produced.
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
history = model.fit(predictors, label, epochs=100, batch_size=64, verbose=1, callbacks=[early_stopping])

# Text generation function using beam search
def generate_text_beam_search(seed_text, next_words, model, max_sequence_len, beam_width=3, temperature=1.0):
    """Extend ``seed_text`` by ``next_words`` tokens using beam search.

    At every step each hypothesis in the beam is fed to the model, the
    predicted distribution is rescaled by ``temperature``, and the
    ``beam_width`` most probable continuations are kept. Hypotheses are
    ranked by cumulative negative log-probability (lower is better).

    Args:
        seed_text: Starting text, encoded with the module-level ``tokenizer``.
        next_words: Number of tokens to append.
        model: Model whose ``predict`` returns a probability distribution
            over token ids for an input of length ``max_sequence_len - 1``.
        max_sequence_len: Padded sequence length used during training.
        beam_width: Number of hypotheses kept after each step.
        temperature: Positive scaling factor; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The best hypothesis (seed plus generated words) in title case.

    Raises:
        ValueError: If ``temperature`` is not positive or ``beam_width`` < 1.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if beam_width < 1:
        raise ValueError("beam_width must be at least 1")

    input_len = max_sequence_len - 1
    seed_tokens = tokenizer.texts_to_sequences([seed_text])[0]

    # Each hypothesis is (full token list, cumulative negative log-prob).
    # The unpadded token list is tracked separately from the model input so
    # that the input can always be re-windowed to the trained length.
    beam = [(seed_tokens, 0.0)]

    for _ in range(next_words):
        all_candidates = []
        for tokens, score in beam:
            # Re-pad/truncate on every step: appending tokens must not grow
            # the model input beyond the length it was trained on.
            model_input = pad_sequences([tokens], maxlen=input_len, padding='pre')
            probs = model.predict(model_input, verbose=0)[0].astype('float64')

            # Apply temperature scaling; subtracting the max before exp()
            # keeps the softmax numerically stable at low temperatures.
            scaled = np.log(probs + 1e-9) / temperature
            scaled -= scaled.max()
            probs = np.exp(scaled)
            probs /= probs.sum()

            # Get top candidates based on beam width
            for candidate in np.argsort(probs)[-beam_width:]:
                candidate_score = score - np.log(probs[candidate])
                all_candidates.append((tokens + [int(candidate)], candidate_score))

        # Sort candidates by score and keep top beam_width candidates
        beam = sorted(all_candidates, key=lambda item: item[1])[:beam_width]

    # Get the best sequence from beam search; index 0 is padding, not a word
    best_tokens = beam[0][0]
    output_text = ' '.join(tokenizer.index_word[i] for i in best_tokens if i != 0)
    return output_text.title()

# Generate example text with beam search and temperature control:
# extend the seed "spiderman" by 3 words using beam width 3 and temperature 0.7
print(generate_text_beam_search("spiderman", 3, model, max_sequence_len, beam_width=3, temperature=0.7))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/100




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 7.6007
Epoch 2/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 7.2926
Epoch 3/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 6.8339
Epoch 4/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 6.5100
Epoch 5/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 6.5254
Epoch 6/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 6.3233
Epoch 7/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 6.0607
Epoch 8/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 6.9146
Epoch 9/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 6.8456
Epoch 10/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 6.7574
Epoch