In [1]:
import pandas as pd
import autokeras as ak
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pickle

# Set random seed for reproducibility
np.random.seed(42)

#Paths
data_path = "../datasets/tupi_binary.csv"
model_path = "autokeras_model.pkl"

2023-11-22 09:20:51.764905: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-22 09:20:51.765115: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-22 09:20:51.910987: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-22 09:20:52.226295: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


## Train

In [2]:
def load_and_split(test_size=0.2, random_state=42,data_path=data_path):
    # Load
    df = pd.read_csv(data_path)
    # Stratify the train-test split
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df["hate"])
    # Format
    x_train = np.array(df_train["text"])
    y_train = np.array(df_train["hate"])
    x_test = np.array(df_test["text"])
    y_test = np.array(df_test["hate"])
    
    return x_train , y_train , x_test, y_test

In [3]:
def calculate_average_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted')

In [4]:
def evaluate_model(model, x_test, y_test):
    logits = model.predict(x_test)

    # Apply sigmoid to get probabilities
    predicted_probabilities = 1 / (1 + np.exp(-logits))

    # Calculate additional metrics
    y_pred_binary = np.round(predicted_probabilities)
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)

    # Calculate AUC
    auc = roc_auc_score(y_test, predicted_probabilities)

    # Calculate average F1 score
    average_f1 = calculate_average_f1(y_test, y_pred_binary)

    return accuracy, precision, recall, average_f1, auc

In [5]:
def save_model_as_pickle(model, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(model, f)
#
def load_model_from_pickle(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)

In [6]:
def main():
    
    # Load and split
    x_train , y_train , x_test, y_test = load_and_split(data_path=data_path)
    # Initialize the text classifier.
    clf = ak.TextClassifier(overwrite=True, max_trials=10)  # Experiment with more trials

    # Feed the text classifier with training data.
    clf.fit(x_train, y_train, epochs=10)  # Experiment with more epochs

    # Evaluate the model on the training set
    print("Metrics")
    accuracy, precision, recall, average_f1, auc = evaluate_model(clf, x_train, y_train)
    print(f"Evaluation Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {average_f1}, AUC: {auc}")

    # Save the best model as a pickle file
    save_model_as_pickle(clf.export_model(), model_path)

if __name__ == "__main__":
    main()


Trial 2 Complete [00h 02m 40s]
val_loss: 0.2978411912918091

Best val_loss So Far: 0.29016774892807007
Total elapsed time: 00h 18m 42s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
bert              |vanilla           |text_block_1/block_type
0                 |0                 |classification_head_1/dropout
adam_weight_decay |adam              |optimizer
2e-05             |0.001             |learning_rate
512               |None              |text_block_1/bert_block_1/max_sequence_length

Epoch 1/10


## Evaluate

#### Load

In [None]:
#Load data
x_train , y_train , x_test, y_test = load_and_split(data_path=data_path)

In [None]:
# Load model 
final_model = load_model_from_pickle(model_path)



#### Test

In [None]:
print("Evaluation - Testing Set")
accuracy, precision, recall, average_f1, auc = evaluate_model(final_model, x_test, y_test)
print(f"Evaluation Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {average_f1}, AUC: {auc}")

Evaluation - Testing Set
Evaluation Metrics: Accuracy: 0.12033432562399816, Precision: 0.12033432562399816, Recall: 1.0, F1 Score: 0.025850051350193574, AUC: 0.8390817494306073
