In [None]:
import pandas as pd
import string
import numpy as np
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import keras.utils as ku
import tensorflow as tf

# Seed TensorFlow's global generator (2) and NumPy's global generator (1)
# so that repeated runs start from the same random state.
tf.random.set_seed(2)
from numpy.random import seed
seed(1)

# Function to load CSV and JSON files
def load_files():
    """Prompt for CSV and JSON file paths and load every file named.

    Each prompt accepts a comma-separated list of paths. Whitespace around a
    path is stripped, and blank entries (for example from a trailing comma)
    are skipped instead of being passed to the readers as an empty path.

    Returns:
        A ``(dataframes, json_data)`` tuple: the DataFrames read from the CSV
        paths and the parsed JSON documents, each in the order entered.

    Raises:
        Whatever ``pd.read_csv``, ``open`` or ``json.load`` raise for a
        missing or malformed file.
    """
    csv_entries = input("Enter CSV file paths separated by commas: ").split(',')
    json_entries = input("Enter JSON file paths separated by commas: ").split(',')

    csv_paths = [entry.strip() for entry in csv_entries if entry.strip()]
    json_paths = [entry.strip() for entry in json_entries if entry.strip()]

    dataframes = [pd.read_csv(path) for path in csv_paths]

    json_data = []
    for path in json_paths:
        # The context manager closes each handle as soon as it is parsed;
        # JSON text is read as UTF-8 rather than the platform default.
        with open(path, encoding="utf-8") as handle:
            json_data.append(json.load(handle))

    return dataframes, json_data

# Function to extract category information from JSON files
def category_extractor(data):
    """Build an ``{id: title}`` lookup from a category JSON document.

    Args:
        data: Mapping with an ``'items'`` sequence; every item provides an
            ``'id'`` (converted with ``int``) and a ``'snippet'`` mapping
            that holds the ``'title'``.

    Returns:
        Dict mapping each integer category id to its title. When an id
        occurs more than once, the last occurrence wins.
    """
    entries = data['items']
    # Ids are converted first, then titles are collected, then paired up.
    category_ids = [int(entry['id']) for entry in entries]
    category_names = [entry['snippet']["title"] for entry in entries]
    return dict(zip(category_ids, category_names))

# Function to preprocess the data
def preprocess_data(dataframes, json_data):
    """Label every frame with category titles and return Entertainment titles.

    Args:
        dataframes: DataFrames that each contain ``category_id``,
            ``video_id`` and ``title`` columns.
        json_data: Category documents; ``json_data[i]`` is used to label
            ``dataframes[i]``.

    Returns:
        List of ``title`` values for rows whose category title is
        ``'Entertainment'``, after concatenating the frames and keeping only
        the first row for each ``video_id``.

    Note:
        A ``category_title`` column is added to each input frame in place.
    """
    for position, frame in enumerate(dataframes):
        id_to_title = category_extractor(json_data[position])
        frame['category_title'] = frame['category_id'].map(id_to_title)

    combined = pd.concat(dataframes, ignore_index=True)
    unique_videos = combined.drop_duplicates('video_id')

    is_entertainment = unique_videos['category_title'] == 'Entertainment'
    return unique_videos[is_entertainment]['title'].tolist()

# Function to clean text
def clean_text(text):
    """Normalise a title: strip punctuation, lowercase, drop non-ASCII.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character of ``string.punctuation`` removed,
        lowercased, and with all non-ASCII characters discarded.
    """
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    lowered = ''.join(kept_chars).lower()
    # Round-trip through UTF-8 bytes; decoding as ASCII with 'ignore'
    # silently drops every byte outside the ASCII range.
    utf8_bytes = lowered.encode('utf8')
    return utf8_bytes.decode('ascii', 'ignore')

# Function to tokenize the corpus
def get_sequence_of_tokens(corpus):
    """Fit a tokenizer on the corpus and expand it into n-gram prefixes.

    Args:
        corpus: The texts to tokenize.

    Returns:
        A ``(input_sequences, total_words, tokenizer)`` tuple.
        ``input_sequences`` holds, for every text, each leading slice of its
        token list with length two or more. ``total_words`` is the size of
        the tokenizer's word index plus one. ``tokenizer`` is the fitted
        ``Tokenizer``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Encode the whole corpus in one call, then emit every prefix of length
    # >= 2; texts that yield fewer than two tokens contribute nothing.
    encoded_lines = tokenizer.texts_to_sequences(corpus)
    input_sequences = [
        tokens[:prefix_len]
        for tokens in encoded_lines
        for prefix_len in range(2, len(tokens) + 1)
    ]

    return input_sequences, total_words, tokenizer

# Function to generate padded sequences
def generate_padded_sequences(input_sequences, total_words):
    """Pad the n-gram sequences and split them into predictors and labels.

    Args:
        input_sequences: Token-id sequences of varying length.
        total_words: Number of classes used for the one-hot labels.

    Returns:
        A ``(predictors, label, max_sequence_len)`` tuple. ``predictors`` is
        every column but the last of the pre-padded array, ``label`` is the
        last column one-hot encoded over ``total_words`` classes, and
        ``max_sequence_len`` is the length of the longest input sequence.
    """
    max_sequence_len = max(map(len, input_sequences))
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # The final token of each row is the prediction target; the tokens
    # before it form the model input.
    predictors = padded[:, :-1]
    label = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

# Function to create the model
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction model.

    The network is an embedding of width 10, an LSTM with 100 units, a
    dropout of 0.1, and a softmax layer over ``total_words`` classes,
    compiled with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            token; the model input is one token shorter.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_len = max_sequence_len - 1
    model = Sequential()
    # Declare the input shape with an explicit Input object rather than
    # Embedding's `input_length` argument, which is deprecated in Keras 3.
    model.add(tf.keras.Input(shape=(input_len,), dtype="int32"))
    model.add(Embedding(total_words, 10))
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the most likely next word.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.
        model: Model whose ``predict`` returns class scores for a padded
            batch of token ids.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse lookup.
        max_sequence_len: Sequence length used at training time; the input
            is padded to one token fewer.

    Returns:
        The extended text in title case.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        scores = model.predict(token_list, verbose=0)
        # Reduce the (1,)-shaped argmax result to a plain int so it can be
        # used directly as a dictionary key.
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # Direct reverse lookup instead of scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # No word maps to this index (e.g. the padding index 0). The
            # text would stay unchanged, so later steps would presumably
            # repeat the same prediction; stop instead of appending blanks.
            break

        seed_text += " " + output_word
    return seed_text.title()

# Main flow: load data, build the training set, train, then generate text.
# Runs only when the file is executed directly.
if __name__ == "__main__":
    # Prompt for the CSV/JSON paths, then reduce the data to the list of
    # Entertainment video titles.
    dataframes, json_data = load_files()
    entertainment_titles = preprocess_data(dataframes, json_data)

    # Clean the corpus
    corpus = [clean_text(title) for title in entertainment_titles]

    # Tokenize the corpus into n-gram prefix sequences
    inp_sequences, total_words, tokenizer = get_sequence_of_tokens(corpus)

    # Generate padded sequences and split them into predictors and labels
    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)

    # Create the model and train it for 20 epochs; no validation data or
    # callbacks are passed to fit().
    model = create_model(max_sequence_len, total_words)
    model.fit(predictors, label, epochs=20, verbose=1)

    # Take user input for text generation; a non-integer word count makes
    # int() raise ValueError.
    seed_text = input("Enter the seed text: ")
    next_words = int(input("Enter the number of words to generate: "))
    
    generated_text = generate_text(seed_text, next_words, model, tokenizer, max_sequence_len)
    print("Generated Text: ", generated_text)



Enter CSV file paths separated by commas:  USvideos.csv, CAvideos.csv, GBvideos.csv
Enter JSON file paths separated by commas:  US_category_id.json, CA_category_id.json, GB_category_id.json




Epoch 1/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 22ms/step - loss: 8.2121
Epoch 2/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - loss: 7.1377
Epoch 3/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 25ms/step - loss: 6.6387
Epoch 4/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 22ms/step - loss: 6.2566
Epoch 5/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - loss: 5.9094
Epoch 6/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 23ms/step - loss: 5.6036
Epoch 7/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 22ms/step - loss: 5.3280
Epoch 8/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - loss: 5.0817
Epoch 9/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 23ms/step - loss: 4.8385
Epoch 10/20
[1m2202/2202[0m [32m━━━━━━━━━━━━━━━━━━━━