In [15]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import tensorflow as tf
from keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.metrics import confusion_matrix
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import regularizers
from keras.layers import Embedding, Flatten, Dense
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, pos_tag, ne_chunk
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
test = pd.read_csv("datasets/test.txt", sep=";")
train = pd.read_csv("datasets/train.txt", sep=";")
val = pd.read_csv("datasets/val.txt", sep=";")

In [None]:
dir = f"{os.getcwd()}/nltk_datasets"
nltk.data.path.append(dir)
nltk.download("stopwords", download_dir=dir)
nltk.download("punkt", download_dir=dir)
nltk.download("maxent_ne_chunker", download_dir=dir)
nltk.download("words", download_dir=dir)
nltk.download("tagsets", download_dir=dir)
nltk.download("averaged_perceptron_tagger", download_dir=dir)

In [None]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return " ".join(filtered_text)


def stem_text(text):
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)


def extract_entities(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    ne_chunks = ne_chunk(pos_tags)

    entities = []
    for chunk in ne_chunks:
        if hasattr(chunk, "label") and chunk.label():
            if chunk.label() == "NE":
                entities.append(" ".join([c[0] for c in chunk]))
    return entities


def create_tfidf_vectorizer(df):
    vectorizer = TfidfVectorizer(max_features=1000,use_idf=True)
    # Fit and transform the text data in the DataFrame column
    tfidf_matrix = vectorizer.fit_transform(df["sentence"])
    # Convert the TF-IDF matrix to a DataFrame for visualization
    return tfidf_matrix.toarray()

def check_frequency_of_words(df):
    # Combine all sentences into one string
    all_sentences = ' '.join(df['sentence'].tolist())

    # Tokenize the combined text into words
    words = all_sentences.split()

    # Create a Pandas Series to count word frequencies
    word_freq = pd.Series(words).value_counts()

    # Plot the top 20 most frequent words
    plt.figure(figsize=(10, 6))
    word_freq.head(20).plot(kind='bar', color='skyblue')
    plt.title('Top 20 Most Frequent Words in Sentences')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
train["sentence"] = train["sentence"].apply(remove_stopwords)
test["sentence"] = test["sentence"].apply(remove_stopwords)
val["sentence"] = val["sentence"].apply(remove_stopwords)

train["sentence"] = train["sentence"].apply(stem_text)
test["sentence"] = test["sentence"].apply(stem_text)
val["sentence"] = val["sentence"].apply(stem_text)

train["entities"] = train["sentence"].apply(extract_entities)
test["entities"] = test["sentence"].apply(extract_entities)
val["entities"] = val["sentence"].apply(extract_entities)

train["sentence"] = train["sentence"].str.lower()
test["sentence"] = test["sentence"].str.lower()
val["sentence"] = val["sentence"].str.lower()

train_tfidf = create_tfidf_vectorizer(train)
test_tfidf = create_tfidf_vectorizer(test)
val_tfidf = create_tfidf_vectorizer(val)
# lets do lammentization next time

In [None]:
train["emotion"] = train["emotion"].replace(
    ["anger", "fear", "joy", "love", "sadness", "surprise"], [0, 1, 2, 3, 4, 5]
)
test["emotion"] = test["emotion"].replace(
    ["anger", "fear", "joy", "love", "sadness", "surprise"], [0, 1, 2, 3, 4, 5]
)
val["emotion"] = val["emotion"].replace(
    ["anger", "fear", "joy", "love", "sadness", "surprise"], [0, 1, 2, 3, 4, 5]
)

y_train = to_categorical(train["emotion"])
y_test = to_categorical(test["emotion"])
y_val = to_categorical(val["emotion"])

In [20]:
                                 
model = Sequential([
    Embedding(input_dim=1000, output_dim=6, input_length=1000),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
# history = model.fit(train_tfidf,
#                     y_train, epochs=20, batch_size=32,
#                     validation_data=(val_tfidf, y_val))
history = model.fit(train_tfidf,
                    y_train, epochs=5, batch_size=512,
                    validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
model.save("models/nlp_training_model.h5")

  saving_api.save_model(


In [13]:
model_old = tf.keras.models.load_model("models/nlp_training_model.h5")
model_old.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 64)                64064     
                                                                 
 dense_3 (Dense)             (None, 6)                 390       
                                                                 
Total params: 64454 (251.77 KB)
Trainable params: 64454 (251.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
model.evaluate(test_tfidf, y_test)



[1.559146761894226, 0.3474999964237213]

In [None]:
check_frequency_of_words(val)