In [None]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf
from keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Dropout,
    GlobalMaxPooling1D,
    Concatenate,
    BatchNormalization,
    Conv1D,
    ReLU,
    Input,
    GRU,
)
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import regularizers
from keras.layers import Embedding, Flatten, Dense
import tensorflow_datasets as tfds
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import os
from keras.preprocessing.sequence import TimeseriesGenerator
from datetime import datetime
from datasets import load_dataset
import json
from nltk.tokenize import word_tokenize
from sklearn.calibration import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the test and train splits from the local COQA-Conversation parquet files.
# Both paths are relative, so they resolve against the kernel's working directory.
test_df = pd.read_parquet("datasets/COQA-Conversation/test.parquet")
train_df = pd.read_parquet("datasets/COQA-Conversation/train_stanfordnlp.parquet")

In [None]:
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The value returned by ``word_tokenize(text)``.
    """
    tokens = word_tokenize(text)
    return tokens


In [None]:
# Keep NLTK resources in a project-local folder under the current working
# directory and make sure NLTK searches it.
# The path lives in `nltk_data_dir` rather than `dir`, so the builtin `dir()`
# stays usable for the rest of the kernel session.
nltk_data_dir = os.path.join(os.getcwd(), "nltk_datasets")
if nltk_data_dir not in nltk.data.path:
    # Guard so that re-running this cell does not append duplicate entries.
    nltk.data.path.append(nltk_data_dir)

# Fetch every NLTK resource the notebook downloads, into the same folder.
for resource in (
    "stopwords",
    "punkt",
    "maxent_ne_chunker",
    "words",
    "tagsets",
    "averaged_perceptron_tagger",
):
    nltk.download(resource, download_dir=nltk_data_dir)

In [None]:
# Stop-word sets keyed by language, filled on first use so the corpus is read
# once per kernel session instead of once per call (this function is applied
# to individual DataFrame cells, i.e. called many times).
_STOPWORD_CACHE = {}


def _cached_stopwords(language):
    """Return NLTK's stop-word list for ``language`` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; a token is removed when its
    lower-cased form appears in NLTK's English stop-word list. The remaining
    tokens keep their original casing.

    Args:
        text: The string to filter.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    stop_words = _cached_stopwords("english")
    kept_tokens = [
        token for token in word_tokenize(text) if token.lower() not in stop_words
    ]
    return " ".join(kept_tokens)


def stem_text(text):
    """Replace every token of ``text`` with its Porter stem.

    Args:
        text: The string to process; it is tokenized with ``word_tokenize``.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    return " ".join(porter.stem(token) for token in word_tokenize(text))


def extract_entities(text):
    """Return the named-entity phrases NLTK finds in ``text``.

    The text goes through ``word_tokenize`` -> ``pos_tag`` -> ``ne_chunk``.

    Args:
        text: The raw string to analyse.

    Returns:
        list[str]: One string per entity chunk (its tokens joined by single
        spaces), in order of appearance; empty when no entity is found.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    # binary=True makes the chunker label every entity subtree "NE". In the
    # default multi-class mode the labels are entity types (PERSON, GPE, ...),
    # so the "NE" filter below never matched and the result was always [].
    chunk_tree = ne_chunk(tagged_tokens, binary=True)

    entities = []
    for node in chunk_tree:
        # Entity chunks are subtrees exposing label(); ordinary tokens are
        # plain (word, tag) tuples and are skipped.
        if hasattr(node, "label") and node.label() == "NE":
            entities.append(" ".join(word for word, _tag in node))
    return entities


def create_tfidf_vectorizer(df):
    """Compute TF-IDF features for the ``"sentence"`` column of ``df``.

    A new ``TfidfVectorizer`` (``max_features=10000``, ``use_idf=True``) is
    fitted on the column inside this call. Despite the function's name, the
    vectorizer itself is not returned, only the transformed data.

    Args:
        df: Object indexable by ``"sentence"`` that yields the documents.

    Returns:
        The fitted-and-transformed matrix converted with ``toarray()``.
    """
    sentences = df["sentence"]
    features = TfidfVectorizer(max_features=10000, use_idf=True).fit_transform(sentences)
    return features.toarray()


def tokenize_sentences(df):
    """Convert every cell of ``df`` into a sequence of Keras token indices.

    All cells are flattened into one array, a new ``Tokenizer`` limited to
    ``num_words=10000`` is fitted on that array, and the same array is then
    converted to index sequences. The tokenizer is created and fitted inside
    this call and is not returned, so the word-to-index mapping is specific
    to the frame passed in.

    Args:
        df: Frame whose ``.values`` hold the texts.

    Returns:
        The result of ``Tokenizer.texts_to_sequences`` for the flattened cells.
    """
    flat_cells = df.values.flatten()

    keras_tokenizer = Tokenizer(num_words=10000)
    keras_tokenizer.fit_on_texts(flat_cells)

    return keras_tokenizer.texts_to_sequences(flat_cells)


def encode_emotions(emotions):
    """Encode labels with a freshly created ``LabelEncoder``.

    The encoder is built and fitted inside this call and is not returned, so
    the caller receives only the encoded values, not the mapping.

    Args:
        emotions: The labels to encode.

    Returns:
        The result of ``LabelEncoder().fit_transform(emotions)``.
    """
    return LabelEncoder().fit_transform(emotions)

def pad_sequences_with_zeros(X, maxlen):
    """Pad the sequences in ``X`` to ``maxlen`` using Keras ``pad_sequences``.

    Only ``maxlen`` is forwarded; every other option is left at the
    ``pad_sequences`` default.

    Args:
        X: The sequences to pad.
        maxlen: Target length handed to ``pad_sequences``.

    Returns:
        The value returned by ``pad_sequences``.
    """
    padded = pad_sequences(X, maxlen=maxlen)
    return padded

def extract_and_convert_text(dictionary):
    """Join the strings stored under ``'input_text'`` into one string.

    Args:
        dictionary: Mapping that may contain an ``'input_text'`` entry holding
            an iterable of strings.

    Returns:
        str: The entries joined by single spaces, or ``''`` when the key is
        missing or its value is empty.
    """
    return ' '.join(dictionary.get('input_text', []))

def extract_each_value_from_column(df):
    """Expand each row of ``df`` into one record per question/answer pair.

    For every row, ``row['questions']`` is paired positionally with
    ``row['answers']['input_text']`` using ``zip`` (so surplus items on the
    longer side are dropped), and the row's ``'story'`` is copied into every
    record.

    Args:
        df: DataFrame with ``'story'``, ``'questions'`` and ``'answers'``
            columns, where each ``'answers'`` value supports
            ``['input_text']``.

    Returns:
        list[dict]: Records with the keys ``'answers'``, ``'question'`` and
        ``'story'``, in row order; an empty list for an empty frame.
    """
    new_rows = []
    for _, row in df.iterrows():
        story = row['story']
        answer_texts = row['answers']['input_text']
        for question, answer_text in zip(row['questions'], answer_texts):
            new_rows.append({'answers': answer_text, 'question': question, 'story': story})
    # Return only after every row has been visited; returning from inside the
    # loop stopped after the first row and gave None for an empty frame.
    return new_rows


In [148]:
# Text-cleaning pipeline for the test split: run remove_stopwords and then
# stem_text over the frame via DataFrame.map, overwriting test_df each time.
# NOTE(review): a later cell passes test_df/train_df to
# extract_each_value_from_column, which indexes row['answers']['input_text'];
# confirm which of the two cells is meant to run first.
test_df = test_df.map(remove_stopwords)

test_df = test_df.map(stem_text)

# Disabled entity-extraction step; it refers to `train`, `test` and `val`
# frames that are not defined in the cells shown here.
# train["entities"] = train["sentence"].apply(extract_entities)
# test["entities"] = test["sentence"].apply(extract_entities)
# val["entities"] = val["sentence"].apply(extract_entities)


# Despite the `_tfidf` suffix, this holds the output of tokenize_sentences
# (Keras Tokenizer index sequences), not TF-IDF features.
test_tfidf = tokenize_sentences(test_df)

# Pad the sequences with maxlen=200.
test_padded = pad_sequences_with_zeros(test_tfidf, 200)

# Same pipeline for the train split.
train_df = train_df.map(remove_stopwords)

train_df = train_df.map(stem_text)

# NOTE(review): tokenize_sentences fits a new Tokenizer on whatever frame it is
# given, so the train and test index sequences come from separately fitted
# vocabularies — confirm that this is intended.
train_tfidf = tokenize_sentences(train_df)

train_padded = pad_sequences_with_zeros(train_tfidf, 200)

In [149]:
# Add a trailing axis of size 1, turning each array from (d0, d1) into (d0, d1, 1).
train_padded = train_padded.reshape((train_padded.shape[0], train_padded.shape[1], 1))
test_padded = test_padded.reshape((test_padded.shape[0], test_padded.shape[1], 1))

In [151]:
# Display the shape of the reshaped test array.
test_padded.shape

(36, 200, 1)

In [156]:
# Encoder-decoder LSTM model.
# NOTE(review): both inputs are declared with a last dimension of vocab_size
# (200), whereas the traceback of the fit cell reports data shaped
# (None, 200, 1). Presumably either one-hot inputs or an Embedding layer over
# integer token ids was intended — confirm before training.
seq_length = None  # number of timesteps left unspecified
vocab_size = 200  # last input dimension and width of the softmax output

# Encoder input: shape (seq_length, vocab_size) per sample.
encoder_inputs = Input(shape=(seq_length, vocab_size))

# Encoder LSTM; return_state=True exposes the final hidden and cell states,
# which are collected to initialise the decoder.
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder input, with the same last dimension as the encoder input.
decoder_inputs = Input(shape=(None, vocab_size))  # number of timesteps left unspecified

# Decoder LSTM started from the encoder states; return_sequences=True yields
# one output per timestep. Its own returned states are discarded here.
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Softmax layer of width vocab_size applied to the decoder outputs.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model taking [encoder_inputs, decoder_inputs] and returning the softmax outputs.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile with Adam, categorical cross-entropy and accuracy.
# NOTE(review): presumably the targets must then be one-hot vectors of width
# vocab_size — confirm against the data actually passed to fit.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "model_18"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_48 (InputLayer)       [(None, None, 200)]          0         []                            
                                                                                                  
 input_49 (InputLayer)       [(None, None, 200)]          0         []                            
                                                                                                  
 lstm_43 (LSTM)              [(None, 256),                467968    ['input_48[0][0]']            
                              (None, 256),                                                        
                              (None, 256)]                                                        
                                                                                           

In [157]:
# NOTE(review): this call fails with the ValueError recorded in this cell's
# output: the model expects inputs shaped (None, None, 200) but test_padded is
# (None, 200, 1). The call also passes no target `y` and uses the test split
# as both encoder and decoder input — confirm the intended inputs and targets.
model.fit([test_padded, test_padded], epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10


ValueError: in user code:

    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/ubuntu/ml-learning/venv/lib/python3.10/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model_18' (type Functional).
    
    Input 0 of layer "lstm_43" is incompatible with the layer: expected shape=(None, None, 200), found shape=(None, 200, 1)
    
    Call arguments received by layer 'model_18' (type Functional):
      • inputs=('tf.Tensor(shape=(None, 200, 1), dtype=int32)', 'tf.Tensor(shape=(None, 200, 1), dtype=int32)')
      • training=True
      • mask=None


#### Get vocabulary size

In [None]:
from collections import Counter

# Whitespace-token vocabulary size, computed from test_df only.

# Stack every column of test_df, cast the values to str, and join them into one string
all_text = ' '.join(test_df.stack().astype(str))

# Split the combined text on whitespace
words = all_text.split()

# Count the frequency of each word
word_counts = Counter(words)

# Number of distinct words.
# NOTE(review): this rebinds `vocab_size`, the name used when the model was
# built (there set to 200); a model that already exists is not changed by it.
vocab_size = len(word_counts)

print("Vocabulary Size:", vocab_size)


### Extract each question and answer and save the processed data

In [None]:
# Replace each split with a frame built from the records returned by
# extract_each_value_from_column (dicts keyed 'answers', 'question', 'story').
# NOTE(review): the helper indexes row['answers']['input_text'], while an
# earlier cell reassigns test_df/train_df to the output of
# .map(remove_stopwords) and .map(stem_text) — confirm the intended cell order.
new_rows = extract_each_value_from_column(test_df)
# Create a new DataFrame from the list of new rows
test_df = pd.DataFrame(new_rows)

# `new_rows` is reused for the train split.
new_rows = extract_each_value_from_column(train_df)
# Create a new DataFrame from the list of new rows
train_df = pd.DataFrame(new_rows)


In [None]:
# Write each processed frame to a compressed .npz archive under the key "data".
# NOTE(review): frames holding text presumably convert to object-dtype arrays,
# which NumPy pickles on save and loads only with allow_pickle=True — confirm.
data_array = train_df.to_numpy()
np.savez_compressed('datasets/COQA-Conversation/COQA_train_processed.npz', data=data_array)

# `data_array` is reused for the test split.
data_array = test_df.to_numpy()
np.savez_compressed('datasets/COQA-Conversation/COQA_test_processed.npz', data=data_array)