In [30]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from src.cleanText import cleanText
import re

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D


def preprocess_data(data_path: str) -> dict:
    """
    Preprocesses the data by cleaning the text and encoding the labels.

    Each regular file in ``data_path`` whose name contains one of the known
    dataset keywords ("Hate", "Sarcasm", "Stress", "Spam", "Sentiment") is
    read as CSV, reshaped so that it exposes ``text`` and ``label`` columns,
    cleaned with ``cleanText`` and label-encoded with ``LabelEncoder``.
    Entries that match no keyword (or are not regular files) are skipped
    instead of being stored with a stale or undefined DataFrame.

    Args:
        data_path (str): Path to the data folder

    Returns:
        dict: Dictionary mapping each processed file name to its DataFrame
    """
    # Checked in this order, first match wins (same precedence as an
    # if/elif chain over these keywords).
    dataset_keywords = ("Hate", "Sarcasm", "Stress", "Spam", "Sentiment")
    processed_data = {}

    for file in os.listdir(data_path):
        kind = next((word for word in dataset_keywords if word in file), None)
        file_path = os.path.join(data_path, file)
        if kind is None or not os.path.isfile(file_path):
            # Not one of the known datasets: nothing to read or store.
            continue

        df = pd.read_csv(file_path)

        if kind == "Hate":
            df = df.drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])
            df = df.rename(columns={'tweet': 'text', 'class': 'label'})
            # Strip the retweet prefix ("RT @user:") before the generic cleaning.
            df['text'] = df['text'].apply(lambda text: cleanText(re.sub(r"RT @\w+:", "", text)))
        elif kind == "Sarcasm":
            df = df.rename(columns={'Tweet': 'text', 'Label': 'label'})
            # Clean first, drop the "user " placeholder tokens from the cleaned
            # text, then clean again to normalise what is left.
            df['text'] = df['text'].apply(
                lambda text: cleanText(re.sub(r"user ", " ", cleanText(text)))
            )
        elif kind == "Stress":
            df = df.drop(columns=['subreddit', 'post_id', 'sentence_range', 'syntax_fk_grade', 'Stress Level'])
            df['text'] = df['text'].apply(cleanText)
        elif kind == "Spam":
            df = df.rename(columns={'v2': 'text', 'v1': 'label'})
            df['text'] = df['text'].apply(cleanText)
        else:  # "Sentiment": columns are already named text / label
            df['text'] = df['text'].apply(cleanText)

        # A fresh encoder per dataset: label ids are local to each file.
        df['label'] = LabelEncoder().fit_transform(df['label'])
        processed_data[file] = df

    return processed_data

# Folder containing the raw CSV datasets (relative path).
data_path = "../data"
# Dictionary returned by preprocess_data: file name -> processed DataFrame.
processed_data = preprocess_data(data_path)


In [40]:
def prepare_data(
            data: "pd.DataFrame",
            num_classes: int = 2,
            max_features: int = 10000,
            max_len: int = 100,
            test_size: float = .2,
            random_state: int = 42,
            ) -> tuple:
    """
    Prepares the data for training and testing.

    The ``text`` column is tokenized and padded to fixed-length integer
    sequences; the ``label`` column is used as-is for binary problems and
    one-hot encoded otherwise.

    Args:
        data (pd.DataFrame): The data to prepare; must have ``text`` and ``label`` columns.
        num_classes (int, optional): The number of classes. Defaults to 2.
        max_features (int, optional): Vocabulary size kept by the tokenizer. Defaults to 10000.
        max_len (int, optional): Length every sequence is padded/truncated to. Defaults to 100.
        test_size (float, optional): Fraction of the data held out for testing. Defaults to 0.2.
        random_state (int, optional): Seed for the train/test split. Defaults to 42.

    Returns:
        tuple: The training and testing data and labels, as
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.
    """
    if num_classes < 2:
        raise ValueError(f"num_classes must be at least 2, got {num_classes}")

    texts = data['text'].values

    # NOTE: the tokenizer is fitted on the whole corpus before the split, so
    # its vocabulary also contains words that only occur in the test portion.
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(texts)
    X = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)

    if num_classes == 2:
        # Binary problems keep the integer label for a single sigmoid output.
        y = data['label'].values
    else:
        # One-hot targets for a softmax output; force a float dtype because
        # the default dummy dtype differs between pandas versions.
        y = pd.get_dummies(data['label'], dtype='float32').values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


def model_architecture(
                num_classes: int,
                max_features: int = 10000,
                embedding_dim: int = 128,
                lstm_units: int = 128,
                dropout_rate: float = 0.2,
                max_len: int = 100,
                ) -> object:
    """
    Creates and compiles the model architecture.

    The network is: Embedding -> SpatialDropout1D -> Bidirectional LSTM
    (returning sequences) -> LSTM -> Dense(64, relu) -> output layer.
    Binary problems get a single sigmoid unit with binary cross-entropy;
    anything else gets a ``num_classes``-wide softmax with categorical
    cross-entropy. Both variants use the Adam optimizer and track accuracy.

    Args:
        num_classes (int): The number of classes.
        max_features (int, optional): The maximum number of features. Defaults to 10000.
        embedding_dim (int, optional): The embedding dimension. Defaults to 128.
        lstm_units (int, optional): The LSTM units. Defaults to 128.
        dropout_rate (float, optional): The dropout rate. Defaults to 0.2.
        max_len (int, optional): Length of the input sequences. Defaults to 100.

    Returns:
        object: The compiled model.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.
    """
    if num_classes < 2:
        raise ValueError(f"num_classes must be at least 2, got {num_classes}")

    model = Sequential()
    model.add(Embedding(max_features, embedding_dim, input_length=max_len))
    model.add(SpatialDropout1D(dropout_rate))
    # The first recurrent layer must return the full sequence so the second
    # LSTM has a time dimension to consume.
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate)))
    model.add(LSTM(lstm_units, dropout=dropout_rate, recurrent_dropout=dropout_rate))
    model.add(Dense(64, activation='relu'))

    # The output head and the loss have to agree with the label encoding:
    # a single probability for binary labels, a distribution for one-hot ones.
    if num_classes == 2:
        output_layer = Dense(1, activation='sigmoid')
        loss = 'binary_crossentropy'
    else:
        output_layer = Dense(num_classes, activation='softmax')
        loss = 'categorical_crossentropy'

    model.add(output_layer)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])

    return model

def train_model(
            model : object,
            X_train : list,
            X_test : list,
            y_train : list,
            y_test : list,
            filename : str,
            epochs : int = 5,
            batch_size : int = 32,
            models_dir : str = "../models",
            ) -> None:
    """
    Trains the model and saves it to the models folder, and prints the accuracy on the test set.

    Args:
        model (object): The model to train; must provide ``fit``, ``save`` and ``evaluate``.
        X_train (list): The training data (any array-like the model accepts).
        X_test (list): The test data.
        y_train (list): The training labels.
        y_test (list): The test labels.
        filename (str): The name of the model; it is saved as ``<filename>.h5``.
        epochs (int, optional): Number of training epochs. Defaults to 5.
        batch_size (int, optional): Training batch size. Defaults to 32.
        models_dir (str, optional): Folder the model is written to. Defaults to "../models".

    Returns:
        None
    """
    # 10% of the training data is held out by fit() for per-epoch validation.
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

    # Make sure the target folder exists so a long training run is not lost
    # to a missing directory at save time.
    os.makedirs(models_dir, exist_ok=True)
    model.save(os.path.join(models_dir, f"{filename}.h5"))

    # Evaluate once and reuse the result; index 1 is read as the accuracy,
    # which assumes the model was compiled with accuracy as its first metric.
    results = model.evaluate(X_test, y_test)
    print("Final accuracy on test set for " + filename + ": " + str(results[1]))



In [41]:
# Train, save and evaluate one model per processed dataset.
for key, dataset in processed_data.items():
    # Only the "Hate" dataset is treated as a three-class problem; every
    # other dataset is trained as binary.
    num_classes = 3 if "Hate" in key else 2
    X_train, X_test, y_train, y_test = prepare_data(dataset, num_classes=num_classes)
    model = model_architecture(num_classes)
    # splitext removes only the final extension, so a dataset file name that
    # itself contains dots is not truncated at the first one.
    train_model(model, X_train, X_test, y_train, y_test, os.path.splitext(key)[0])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final accuracy on test set for Sentiment Analysis: 0.7770000100135803
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final accuracy on test set for Sarcasm Detection: 0.9198682904243469
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final accuracy on test set for Stress Detection: 0.7183098793029785
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final accuracy on test set for Hate Content Detection: 0.8852128386497498
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final accuracy on test set for Spam Detection: 0.976457417011261
