In [8]:
import pandas as pd
import string
import numpy as np
import json

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

import tensorflow as tf
tf.random.set_seed(2)
from numpy.random import seed
seed(1)

In [9]:
# Load the per-region video CSVs (US, CA, GB) into df1, df2 and df3
video_csv_paths = (
    '/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/USvideos.csv',
    '/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/CAvideos.csv',
    '/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/GBvideos.csv',
)
df1, df2, df3 = (pd.read_csv(csv_path) for csv_path in video_csv_paths)

In [10]:
# Load the datasets containing the category names
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as parsing finishes rather than being left open.
    """
    with open(path) as json_file:
        return json.load(json_file)


data1 = _load_json('/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/US_category_id.json')
data2 = _load_json('/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/CA_category_id.json')
data3 = _load_json('/content/drive/MyDrive/Dataset/Trending YouTube Video Statistics/GB_category_id.json')

In [11]:
# Function to extract category information
def category_extractor(data):
    """Build a ``{category_id: category_title}`` lookup from a category payload.

    Args:
        data: Mapping with an ``'items'`` sequence. Every item must provide an
            ``'id'`` that ``int()`` accepts and a ``'snippet'`` mapping that
            contains a ``'title'``.

    Returns:
        dict: integer category id -> title. If an id appears more than once,
        the title of its last occurrence is kept.

    Raises:
        KeyError: if ``'items'``, ``'id'``, ``'snippet'`` or ``'title'`` is missing.
        ValueError: if an id cannot be converted to ``int``.
    """
    # One pass over the items; ids are converted to int as each pair is built.
    return {int(item['id']): item['snippet']['title'] for item in data['items']}

In [12]:
# Add a 'category_title' column to each regional frame, using that region's own id -> title lookup
for videos, category_json in ((df1, data1), (df2, data2), (df3, data3)):
    videos['category_title'] = videos['category_id'].map(category_extractor(category_json))

In [13]:
# Stack the three regional frames into one; ignore_index renumbers the rows from 0
regional_frames = [df1, df2, df3]
df = pd.concat(regional_frames, ignore_index=True)

In [14]:
# Keep a single row per video_id
df = df.drop_duplicates(subset='video_id')

In [15]:
# Collect the titles of all 'Entertainment' videos as a plain Python list
is_entertainment = df['category_title'] == 'Entertainment'
entertainment = df.loc[is_entertainment, 'title'].tolist()

In [16]:
# Function to clean text
def clean_text(text):
    """Normalise a title for tokenization.

    ASCII punctuation (``string.punctuation``) is removed, the remainder is
    lowercased, and finally every non-ASCII character is discarded.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    lowered = text.translate(punctuation_stripper).lower()
    # Round-trip through UTF-8 bytes; 'ignore' drops whatever is not ASCII.
    return lowered.encode('utf8').decode('ascii', 'ignore')

In [17]:
# Run every entertainment title through clean_text
corpus = list(map(clean_text, entertainment))

In [18]:
# Tokenization and sequence generation
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` and expand each line into token prefixes.

    A line whose token ids are ``[a, b, c]`` contributes ``[a, b]`` and
    ``[a, b, c]``; lines with fewer than two tokens contribute nothing.

    Returns:
        tuple: ``(prefixes, vocab_size)`` where ``prefixes`` is a list of
        token-id lists and ``vocab_size`` is ``len(tokenizer.word_index) + 1``.
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1
    prefixes = [
        token_ids[:end]
        for token_ids in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_ids) + 1)
    ]
    return prefixes, vocab_size


inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [19]:
# Padding sequences
def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to a common length and split off the training target.

    Every sequence is padded at the front (``padding='pre'``) to the length of
    the longest one. The final column becomes the label, one-hot encoded with
    ``num_classes`` taken from the module-level ``total_words``; the remaining
    columns are the predictors.

    Returns:
        tuple: ``(predictors, label, max_sequence_len)``.
    """
    longest = max(map(len, input_sequences))
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))
    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest


predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [20]:
# Create LSTM model
def create_model(max_sequence_len, total_words):
    """Assemble and compile the next-word prediction network.

    Layers, in order: a 10-dimensional ``Embedding`` over ``total_words`` ids
    with input length ``max_sequence_len - 1``, an ``LSTM`` with 100 units,
    ``Dropout(0.1)`` and a softmax ``Dense`` layer with ``total_words`` outputs.
    The model is compiled with categorical cross-entropy and the Adam optimizer.
    """
    context_len = max_sequence_len - 1
    network = Sequential([
        Embedding(total_words, 10, input_length=context_len),
        LSTM(100),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network


model = create_model(max_sequence_len, total_words)

In [21]:
# Train the model
# verbose=2 logs one line per epoch (Keras documents only 0, 1 and 2 for this argument).
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7cf1d3077a30>

In [26]:
# Function to generate titles
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's top-scoring word.

    Args:
        seed_text: Starting text; converted to token ids with the module-level
            ``tokenizer``.
        next_words: Maximum number of words to append.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row; the arg-max of the first row is used.
        max_sequence_len: The model input is passed through ``pad_sequences``
            with ``maxlen=max_sequence_len - 1`` and ``padding='pre'``.

    Returns:
        str: the seed text followed by the generated words, title-cased with
        ``str.title()``. Fewer than ``next_words`` words are appended when the
        predicted id has no word in the tokenizer's vocabulary.
    """
    input_len = max_sequence_len - 1
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_ids], maxlen=input_len, padding='pre')
        predicted_index = int(np.argmax(model.predict(padded, verbose=0)[0]))
        # index_word is the tokenizer's id -> word dict, so this is a direct
        # lookup rather than a scan over every entry of word_index.
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            # No word for this id (e.g. the padding id 0). seed_text would not
            # gain a token, so every remaining step would see the same input;
            # stop here instead of appending blanks.
            break
        seed_text += " " + output_word
    return seed_text.title()

In [34]:
# Generate a sample title: up to five predicted words after the seed "Spiderman"
generated_title = generate_text(
    seed_text="Spiderman",
    next_words=5,
    model=model,
    max_sequence_len=max_sequence_len,
)
print(generated_title)

Spiderman 2 Soldado Trailer 2018 Clip
