In [1]:
import os
import cupy as cp
import numpy as np
import torch
import pandas as pd
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import logging

# Set TensorFlow logging level to suppress warnings (1: INFO, 2: WARNING, 3: ERROR)
tf.get_logger().setLevel(logging.ERROR)

2023-09-17 15:57:40.730334: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
def list_directories(directory):
    directories = []
    for root, dirs, files in os.walk(directory):
        for dir in dirs:
            directories.append(os.path.join(root, dir))
    return directories

In [7]:
def load_comments():
    comments = []
    dirs = list_directories("../data")
    for dir in dirs:
        with open(dir + "/video_comments.json", 'r') as f:
            video_comments = json.load(f)
        with open(dir + "/video_details.json", 'r') as f:
            details = json.load(f)
            for i in range(len(video_comments)):
                video_comments[i]['video_id'] = details["id"]
                video_comments[i]['video_title'] = details["title"]
                video_comments[i]['video_desc'] = details["description"]
                video_comments[i]['video_views'] = details["views"]
                video_comments[i]['video_likes'] = details["likes"]
                video_comments[i]['video_comments'] = details["comments"]
            
        comments += (video_comments)
    
    return pd.DataFrame(comments)

In [8]:
def preprocess_dataframe(df, max_seq_length):
    # Extract the relevant columns
    input_features = df[['video_title', 'video_views', 'video_likes']]
    output_text = df['text']

    # Tokenize the output text (target)
    target_tokenizer = Tokenizer()
    target_tokenizer.fit_on_texts(output_text)
    num_output_tokens = len(target_tokenizer.word_index) + 1  # +1 for the padding token
    output_sequences = target_tokenizer.texts_to_sequences(output_text)
    padded_output_sequences = pad_sequences(output_sequences, maxlen=max_seq_length, padding='post')

    # Tokenize the input features (video_title, video_views, video_likes)
    input_tokenizer = Tokenizer()
    input_features_text = input_features.apply(lambda x: ' '.join(map(str, x)), axis=1)
    input_tokenizer.fit_on_texts(input_features_text)
    num_input_tokens = len(input_tokenizer.word_index) + 1  # +1 for the padding token
    input_sequences = input_tokenizer.texts_to_sequences(input_features_text)
    padded_input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')

    return padded_input_sequences, padded_output_sequences, num_input_tokens, num_output_tokens

In [9]:
def create_seq2seq_model(num_input_tokens, num_output_tokens, max_seq_length, latent_dim=128):
    # Encoder
    encoder_inputs = Input(shape=(max_seq_length,))
    encoder_embedding = Embedding(input_dim=num_input_tokens, output_dim=latent_dim)(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True)
    _, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(max_seq_length,))
    decoder_embedding = Embedding(input_dim=num_output_tokens, output_dim=latent_dim)(decoder_inputs)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(num_output_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

In [10]:
max_seq_length = 4096
BATCH_SIZE = 64
EPOCHS = 10

In [11]:
#Load the data from the files
df = load_comments()
df = df[df['likes'] > 10][df['text'].str.len() > 100]

#Tokenize our data
padded_input_sequences, padded_output_sequences, num_input_tokens, num_output_tokens = preprocess_dataframe(df, max_seq_length)

# Create the seq2seq model
seq2seq_model = create_seq2seq_model(num_input_tokens, num_output_tokens, max_seq_length)

# Compile the model
seq2seq_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
seq2seq_model.summary()

  df = df[df['likes'] > 10][df['text'].str.len() > 100]
2023-09-17 15:26:39.350906: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-17 15:26:39.353157: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-17 15:26:39.354232: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 4096, 128)    134656      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 4096, 128)    1613824     ['input_2[0][0]']                
                                                                                              

In [10]:
# Train the model
seq2seq_model.fit(
    [padded_input_sequences, padded_output_sequences], 
    padded_output_sequences,print("GPU Available:", tf.config.list_physical_devices('GPU'))
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

Epoch 1/10


2023-09-17 15:24:55.906589: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-09-17 15:24:55.908646: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-09-17 15:24:55.910024: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

: 

: 

In [3]:
print("GPU Available:", tf.config.list_physical_devices('GPU'))
cp.random.random(10)

import torch
torch.cuda.is_available()

GPU Available: []
