In [None]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf
from keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout, GlobalMaxPooling1D, \
Concatenate, BatchNormalization, Conv1D, ReLU, Input
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.metrics import confusion_matrix
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import regularizers
from keras.layers import Embedding, Flatten, Dense
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import LabelEncoder

### Load datasets and download NLTK lib init functions

In [None]:
test = pd.read_csv("datasets/test.txt", sep=";")
train = pd.read_csv("datasets/train.txt", sep=";")
val = pd.read_csv("datasets/val.txt", sep=";")

In [None]:
dir = f"{os.getcwd()}/nltk_datasets"
nltk.data.path.append(dir)
nltk.download("stopwords", download_dir=dir)
nltk.download("punkt", download_dir=dir)
nltk.download("maxent_ne_chunker", download_dir=dir)
nltk.download("words", download_dir=dir)
nltk.download("tagsets", download_dir=dir)
nltk.download("averaged_perceptron_tagger", download_dir=dir)

In [None]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower()
                     not in stop_words]
    return " ".join(filtered_text)


def stem_text(text):
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)


def extract_entities(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    ne_chunks = ne_chunk(pos_tags)

    entities = []
    for chunk in ne_chunks:
        if hasattr(chunk, "label") and chunk.label():
            if chunk.label() == "NE":
                entities.append(" ".join([c[0] for c in chunk]))
    return entities


def create_tfidf_vectorizer(df):
    vectorizer = TfidfVectorizer(max_features=10000, use_idf=True)
    # Fit and transform the text data in the DataFrame column
    tfidf_matrix = vectorizer.fit_transform(df["sentence"])
    # Convert the TF-IDF matrix to a DataFrame for visualization
    return tfidf_matrix.toarray()


def tokenize_sentences(df):
    tr_text = df['sentence']
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(tr_text) 
                                
    sequences = tokenizer.texts_to_sequences(tr_text)
    return sequences


def encode_emotions(emotions):
    encoder = LabelEncoder()
    return encoder.fit_transform(emotions)

def pad_sequences_with_zeros(X, maxlen):
    return pad_sequences(X, maxlen=maxlen)

def check_frequency_of_words(df):
    # Combine all sentences into one string
    all_sentences = ' '.join(df['sentence'].tolist())

    # Tokenize the combined text into words
    words = all_sentences.split()

    # Create a Pandas Series to count word frequencies
    word_freq = pd.Series(words).value_counts()

    # Plot the top 20 most frequent words
    plt.figure(figsize=(10, 6))
    word_freq.head(20).plot(kind='bar', color='skyblue')
    plt.title('Top 20 Most Frequent Words in Sentences')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

### pre-process textual data

In [None]:
train["sentence"] = train["sentence"].apply(remove_stopwords)
test["sentence"] = test["sentence"].apply(remove_stopwords)
val["sentence"] = val["sentence"].apply(remove_stopwords)

train["sentence"] = train["sentence"].apply(stem_text)
test["sentence"] = test["sentence"].apply(stem_text)
val["sentence"] = val["sentence"].apply(stem_text)

# train["entities"] = train["sentence"].apply(extract_entities)
# test["entities"] = test["sentence"].apply(extract_entities)
# val["entities"] = val["sentence"].apply(extract_entities)

train["sentence"] = train["sentence"].str.lower()
test["sentence"] = test["sentence"].str.lower()
val["sentence"] = val["sentence"].str.lower()

train_tfidf = tokenize_sentences(train)
test_tfidf = tokenize_sentences(test)
val_tfidf = tokenize_sentences(val)

train_padded = pad_sequences_with_zeros(train_tfidf, 50)
test_padded = pad_sequences_with_zeros(test_tfidf, 50)
val_padded = pad_sequences_with_zeros(val_tfidf, 50)

train['emotion_en'] = encode_emotions(train['emotion'])
test['emotion_en'] = encode_emotions(test['emotion'])
val['emotion_en'] = encode_emotions(val['emotion'])
# lets do lammentization next time

In [None]:
emotion_en_decrypted = train[['emotion','emotion_en']]
emotion_en_decrypted = emotion_en_decrypted.drop_duplicates()

### Save numpy dataset to reload back

In [None]:
np.savez_compressed(f"{os.getcwd()}/datasets/preprocessed_data/nlp_training.npz", train=train_tfidf,test=test_tfidf,val=val_tfidf)

In [None]:
main_arr_npz = np.load(f"{os.getcwd()}/datasets/preprocessed_data/nlp_training.npz")

In [None]:
main_arr_npz['train'] 

In [None]:
y_train = to_categorical(train["emotion_en"])
y_test = to_categorical(test["emotion_en"])
y_val = to_categorical(val["emotion_en"])

## Train model

### our model

In [217]:
model = Sequential([
    Embedding(10000, 64, input_length=50),
    LSTM(64,return_sequences=False),
    Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.01)),
    Dense(32, activation='relu',kernel_regularizer=regularizers.l2(0.01)),
    Dense(6, activation='softmax')
])

model.compile(optimizer='adamax', loss='categorical_crossentropy',
              metrics=['accuracy'])

### siamese model

In [None]:
input_shape_1 = (50,)  # Replace with your input shape
input_shape_2 = (50,)  # Replace with your input shape

# Define branches for each input
input_1 = Input(shape=input_shape_1)
input_2 = Input(shape=input_shape_2)

# Embedding layer for each input
embedding_1 = Embedding(10000, 64, input_length=50)(input_1)
embedding_2 = Embedding(10000, 64, input_length=50)(input_2)

# Flatten or any necessary layers for each branch
flatten_1 = Flatten()(embedding_1)
flatten_2 = Flatten()(embedding_2)

# Concatenate the branches
concatenated = Concatenate()([flatten_1, flatten_2])

# Dense layers and final output layer
dense_layer = Dense(64, activation='relu')(concatenated)
output_layer = Dense(6, activation='softmax')(dense_layer)

# Create the model
model = Model(inputs=[input_1, input_2], outputs=output_layer)

# Compile the model
model.compile(optimizer='adamax', loss='categorical_crossentropy', metrics=['accuracy'])


In [218]:
history = model.fit(
    train_padded, y_train, epochs=20, batch_size=128, validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
model.save("models/nlp_training_model_self.h5")

### load old model

In [221]:
model_old = tf.keras.models.load_model("models/nlp_training_model_overfit_chatgpt.h5")
model_old.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_24 (Embedding)    (None, 50, 64)               640000    ['input_3[0][0]']             
                                                                                                  
 embedding_25 (Embedding)    (None, 50, 64)               640000    ['input_4[0][0]']             
                                                                                            

In [205]:
model.evaluate(test_padded, y_test)



[2.964996814727783, 0.27649998664855957]

In [244]:
train.value_counts("emotion")

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

## Check model accuracy with custom inputs

In [243]:
input_sentence = ["i am space"]
df = pd.DataFrame(input_sentence, columns=['sentence'])

df["sentence"] = df["sentence"].apply(remove_stopwords)
df["sentence"] = df["sentence"].apply(stem_text)

input_ = tokenize_sentences(df)
input_ = pad_sequences_with_zeros(input_, 50)

pred_val = model_old.predict([input_,input_]).argmax()
result = emotion_en_decrypted.loc[emotion_en_decrypted['emotion_en'] == pred_val, 'emotion'].values[0]


pred_val_model = model.predict(input_).argmax()
result_model = emotion_en_decrypted.loc[emotion_en_decrypted['emotion_en'] == pred_val_model, 'emotion'].values[0]
"old:",pred_val,result,"|new:",pred_val_model,result_model





('old:', 2, 'joy', '|new:', 2, 'joy')

In [None]:
check_frequency_of_words(val)