In [34]:
import pandas as pd
import autokeras as ak
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Directory holding the CSV datasets; a relative path, so it is resolved
# against the notebook's current working directory.
datasets_folder = "../datasets"
# Name of the CSV inside `datasets_folder` that main() loads; main() reads its
# "text" and "hate" columns.
file_name = "tupi_binary.csv"


def read_csv_file(file_name, folder=None):
    """Read a CSV file from a datasets directory into a DataFrame.

    Args:
        file_name: Name of the CSV file, relative to the directory.
        folder: Directory to look in. When omitted (``None``) the
            module-level ``datasets_folder`` is used, which keeps existing
            single-argument calls working unchanged.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If the resolved path is not an existing file.
    """
    # Only fall back to the global when the caller did not pass a directory,
    # so the function can be used (and tested) without notebook-level state.
    base_dir = datasets_folder if folder is None else folder
    file_path = os.path.join(base_dir, file_name)

    # Guard clause: fail early with the full path in the message.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    return pd.read_csv(file_path)


def train_test_split_data(df, test_size=0.2, random_state=42):
    """Split a DataFrame into a training part and a testing part.

    Args:
        df: The DataFrame to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        tuple: ``(train_part, test_part)`` as produced by ``train_test_split``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_part, test_part = train_test_split(df, **split_options)
    return train_part, test_part


def calculate_average_f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    The score is computed by ``f1_score`` with ``average='weighted'``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return weighted_f1


def main():
    # Set random seed for reproducibility
    np.random.seed(42)

    # Load data
    df = read_csv_file(file_name)

    # Split the DataFrame into training and testing sets
    df_train, df_test = train_test_split_data(df)

    x_train = np.array(df_train["text"])
    y_train = np.array(df_train["hate"])
    x_test = np.array(df_test["text"])
    y_test = np.array(df_test["hate"])

    # Initialize the text classifier.
    clf = ak.TextClassifier(overwrite=True, max_trials=2)  # Experiment with more trials

    # Feed the text classifier with training data.
    clf.fit(x_train, y_train, epochs=1)  # Experiment with more epochs

    # Predict with the best model.
    logits = clf.predict(x_test)

    # Apply sigmoid to get probabilities
    predicted_probabilities = 1 / (1 + np.exp(-logits))

    # Calculate additional metrics
    accuracy = accuracy_score(y_test, np.round(predicted_probabilities))
    precision = precision_score(y_test, np.round(predicted_probabilities))
    recall = recall_score(y_test, np.round(predicted_probabilities))

    # Calculate AUC
    auc = roc_auc_score(y_test, predicted_probabilities)

    # Calculate average F1 score
    average_f1 = calculate_average_f1(y_test, np.round(predicted_probabilities))

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {average_f1}")
    print(f"AUC: {auc}")

In [35]:
main()

Trial 2 Complete [00h 00m 23s]
val_loss: 0.3298386037349701

Best val_loss So Far: 0.27215877175331116
Total elapsed time: 00h 02m 02s
INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


Evaluation Metrics:
Accuracy: 0.8819555759102359
Precision: 0.5355850422195416
Recall: 0.4073394495412844
F1 Score: 0.8749187888849355
AUC: 0.6784865745874921


In [37]:
# Print model summary
# NOTE(review): `clf` must already be bound in the notebook's global namespace
# (the fitted classifier built by main()) before this cell runs; on a fresh
# kernel, run the training cell first.
print("Model Summary:")
clf.export_model().summary()

Model Summary:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 expand_last_dim (ExpandLas  (None, 1)                 0         
 tDim)                                                           
                                                                 
 text_vectorization (TextVe  (None, 512)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 512, 64)           320064    
                                                                 
 dropout (Dropout)           (None, 512, 64)           0         
                                                                 
 conv1d (Conv1D)             (None, 508, 256) 

In [38]:
# Retrieve the best model
# NOTE(review): reads `clf` from the notebook's global namespace, so the
# fitted classifier must be bound there before this cell runs.
best_model = clf.export_model()

# Print the summary of the best model
# NOTE(review): this repeats the export + summary() done by the
# "Model Summary:" cell; the only addition is keeping the exported model in
# `best_model` for later use.
print("Best Model Summary:")
best_model.summary()

Best Model Summary:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 expand_last_dim (ExpandLas  (None, 1)                 0         
 tDim)                                                           
                                                                 
 text_vectorization (TextVe  (None, 512)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 512, 64)           320064    
                                                                 
 dropout (Dropout)           (None, 512, 64)           0         
                                                                 
 conv1d (Conv1D)             (None, 508, 