# 1. Setup & Configuração

In [None]:
# Instalação de dependências
!pip install datasets tensorflow scikit-learn pandas imbalanced-learn -q

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from datasets import load_dataset
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Dense, Dropout,
    Conv1D, GlobalMaxPooling1D, concatenate
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Hyperparameters shared by the data pipeline, the model and the training loop.
CONFIG = {
    'MAX_WORDS': 20000,         # Tokenizer num_words / Embedding input size
    'MAX_LEN': 50,              # padded sequence length / model input length
    'EMBEDDING_DIM': 128,       # Embedding output size
    'N_SPLITS': 5,              # number of cross-validation folds
    'BATCH_SIZE': 128,          # batch size passed to model.fit
    'EPOCHS': 5,                # maximum epochs per fold
    'TARGET_SAMPLES': 5000,     # rows per class requested from balance_data
    'TOXICITY_THRESHOLD': 0.5,  # a score must be strictly above this to count
}

# Score columns inspected when labelling a comment; order matters because the
# first column above the threshold becomes the label.
LABEL_COLUMNS = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]

# 2. ETL & Balanceamento de Dados

In [None]:
def process_data(config, label_cols):
    """Load the civil_comments training split and give every row one class label.

    Args:
        config: Mapping providing ``'TOXICITY_THRESHOLD'``. A score has to be
            strictly greater than this value to count as a hit.
        label_cols: Names of the score columns, in priority order. When several
            columns exceed the threshold, the one listed first wins.

    Returns:
        The dataset as a pandas DataFrame with two extra columns:

        * ``non_toxic``: 1 when the ``label_cols`` scores of the row sum to
          exactly 0, otherwise 0. Note this is stricter than
          ``label == 'non_toxic'``, which only requires every score to be at
          or below the threshold.
        * ``label``: name of the first column in ``label_cols`` whose score
          exceeds the threshold, or ``'non_toxic'`` when none does.
    """
    print("Carregando dataset...")
    dataset = load_dataset("civil_comments", split='train').to_pandas()

    label_cols = list(label_cols)
    dataset['non_toxic'] = (dataset[label_cols].sum(axis=1) == 0).astype(int)

    if not label_cols:
        # Nothing to compare against: every row falls through to the default.
        dataset['label'] = 'non_toxic'
        return dataset

    # Vectorised replacement for a per-row Python callback: build the boolean
    # "score > threshold" matrix once. NaN compares as False, so missing
    # scores never count as a hit.
    exceeds = dataset[label_cols].to_numpy() > config['TOXICITY_THRESHOLD']

    # argmax returns the first True per row, which preserves the priority
    # order of label_cols. Rows with no True also yield index 0, so they are
    # overridden by the any() mask below.
    first_hit = exceeds.argmax(axis=1)
    column_names = np.array(label_cols, dtype=object)
    dataset['label'] = np.where(exceeds.any(axis=1), column_names[first_hit], 'non_toxic')
    return dataset

def balance_data(df, target):
    """Resample ``df`` so that every class in ``df['label']`` has ``target`` rows.

    Classes with at least ``target`` rows are randomly undersampled without
    replacement. Smaller classes keep all of their original rows and are
    topped up with rows drawn from the same class with replacement.

    The resampling is done per class with pandas, so it works when only one
    class is below ``target`` and every column of ``df`` stays populated in
    the duplicated rows.

    Args:
        df: DataFrame with at least a ``label`` column. Rows whose label is
            missing are dropped, because ``groupby`` skips NaN keys.
        target: Number of rows wanted per class.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index and the same columns as
        ``df``. An empty ``df`` yields an empty result.
    """
    if df.empty:
        return df.reset_index(drop=True)

    parts = []
    # sort=False keeps the classes in order of first appearance.
    for _, subset in df.groupby('label', sort=False):
        shortfall = target - len(subset)
        if shortfall <= 0:
            # Undersampling: keep a random subset of exactly `target` rows.
            parts.append(subset.sample(n=target, random_state=42))
        else:
            # Oversampling: keep every original row, then duplicate random
            # rows of this class until it reaches `target`.
            extra = subset.sample(n=shortfall, replace=True, random_state=42)
            parts.append(pd.concat([subset, extra]))

    # Shuffle so that classes are not stored in contiguous runs.
    return pd.concat(parts).sample(frac=1, random_state=42).reset_index(drop=True)

# Label the raw comments, then resample to TARGET_SAMPLES rows per class.
df_raw = process_data(config=CONFIG, label_cols=LABEL_COLUMNS)
df_final = balance_data(df=df_raw, target=CONFIG['TARGET_SAMPLES'])

# Sanity check: total size and per-class row counts after balancing.
print(f"Dataset final: {len(df_final)} amostras.")
print(df_final['label'].value_counts())

# 3. Tokenização

In [None]:
# Map class names to integer ids, then one-hot encode them to match the
# categorical cross-entropy loss used by the model.
le = LabelEncoder()
le.fit(df_final['label'])
y_encoded = to_categorical(le.transform(df_final['label']))

# Fit the vocabulary (num_words=MAX_WORDS) on the balanced texts and convert
# each text to a sequence of word indices with maxlen=MAX_LEN.
tokenizer = Tokenizer(num_words=CONFIG['MAX_WORDS'])
tokenizer.fit_on_texts(df_final['text'])
X_padded = pad_sequences(
    tokenizer.texts_to_sequences(df_final['text']),
    maxlen=CONFIG['MAX_LEN'],
)

# 4. Arquitetura do Modelo (Híbrido)

In [None]:
def build_model(cfg, n_classes):
    """Build and compile the hybrid BiLSTM + multi-kernel CNN classifier.

    A shared embedding feeds two parallel branches: a bidirectional LSTM and
    three Conv1D layers (kernel sizes 2, 3 and 4), each followed by global max
    pooling. The branch outputs are concatenated, passed through dropout and a
    dense layer, and finish in a softmax over ``n_classes``.

    Args:
        cfg: Mapping providing ``'MAX_LEN'``, ``'MAX_WORDS'`` and
            ``'EMBEDDING_DIM'``.
        n_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` compiled with Adam, categorical cross-entropy and
        the accuracy metric.
    """
    token_ids = Input(shape=(cfg['MAX_LEN'],))
    embedded = Embedding(cfg['MAX_WORDS'], cfg['EMBEDDING_DIM'])(token_ids)

    # Recurrent branch: only the final state of each direction is kept.
    recurrent = Bidirectional(LSTM(32, return_sequences=False))(embedded)

    # Convolutional branch: one Conv1D per kernel size, each reduced to its
    # per-filter maximum over the sequence.
    pooled_maps = []
    for kernel_size in [2, 3, 4]:
        feature_map = Conv1D(32, kernel_size, activation='relu')(embedded)
        pooled_maps.append(GlobalMaxPooling1D()(feature_map))
    convolutional = concatenate(pooled_maps)

    # Classification head on top of the merged branches.
    merged = concatenate([recurrent, convolutional])
    merged = Dropout(0.5)(merged)
    hidden = Dense(64, activation='relu')(merged)
    probabilities = Dense(n_classes, activation='softmax')(hidden)

    model = Model(token_ids, probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Build a throwaway model only to print its layer summary.
build_model(cfg=CONFIG, n_classes=len(le.classes_)).summary()

# 5. Treinamento (Stratified K-Fold)

In [None]:
# Imported here because the top-of-file import only brings in StratifiedKFold.
from sklearn.model_selection import StratifiedGroupKFold

# Integer class ids, computed once and reused for the class weights, the
# stratification and the per-fold ground truth.
y_labels = le.transform(df_final['label'])

# balance_data() oversamples by duplicating rows, so a plain stratified split
# would put copies of the same text in both the training and the validation
# part of a fold and inflate the scores. Grouping by text keeps every copy of
# a comment inside a single fold.
text_groups = pd.factorize(df_final['text'])[0]
skf = StratifiedGroupKFold(n_splits=CONFIG['N_SPLITS'], shuffle=True, random_state=42)

class_weights = dict(enumerate(class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_labels), y=y_labels)))

histories, f1_scores, acc_scores = [], [], []

# Out-of-fold ground truth and predictions, pooled across all folds for the
# overall evaluation.
all_y_true = []
all_y_pred = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_padded, y_labels, groups=text_groups), 1):
    print(f"\nFold {fold}/{CONFIG['N_SPLITS']}")

    # Reset Keras' global state so models built in earlier folds do not pile up.
    tf.keras.backend.clear_session()

    model = build_model(CONFIG, len(le.classes_))
    history = model.fit(
        X_padded[train_idx], y_encoded[train_idx],
        validation_data=(X_padded[val_idx], y_encoded[val_idx]),
        epochs=CONFIG['EPOCHS'], batch_size=CONFIG['BATCH_SIZE'],
        # Stops after 2 epochs without improvement of the monitored metric and
        # rolls back to the best weights seen.
        callbacks=[EarlyStopping(patience=2, restore_best_weights=True)],
        class_weight=class_weights, verbose=1
    )

    histories.append(history)
    y_pred = np.argmax(model.predict(X_padded[val_idx]), axis=1)
    y_true = y_labels[val_idx]

    # Keep the fold's labels and predictions for the overall evaluation.
    all_y_true.extend(y_true)
    all_y_pred.extend(y_pred)

    f1_scores.append(f1_score(y_true, y_pred, average='macro'))
    acc_scores.append(np.mean(y_pred == y_true))
    print(f"Fold {fold} F1: {f1_scores[-1]:.4f}")

# 6. Avaliação Final

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score

def plot_cv_history(histories, f1_scores):
    """Plot mean validation curves across folds and the per-fold macro F1.

    The left panel shows the mean validation accuracy and mean validation loss
    per epoch, each with a +/- one standard deviation band. Folds that ran for
    fewer epochs are extended by repeating their last value so that all curves
    have the same length. The right panel is a bar chart of ``f1_scores`` with
    a dashed line at their mean.

    Args:
        histories: Objects whose ``.history`` mapping holds ``'val_accuracy'``
            and ``'val_loss'`` lists, one object per fold.
        f1_scores: One macro F1 value per fold.
    """
    longest_run = max(len(h.history['val_loss']) for h in histories)

    def _stack_padded(metric):
        # One row per fold, right-padded with the fold's last value.
        rows = []
        for h in histories:
            values = h.history[metric]
            rows.append(np.pad(values, (0, longest_run - len(values)), 'edge'))
        return np.array(rows)

    acc_matrix = _stack_padded('val_accuracy')
    loss_matrix = _stack_padded('val_loss')

    acc_mean, acc_std = np.mean(acc_matrix, axis=0), np.std(acc_matrix, axis=0)
    loss_mean, loss_std = np.mean(loss_matrix, axis=0), np.std(loss_matrix, axis=0)
    epochs = range(1, longest_run + 1)

    plt.figure(figsize=(18, 6))

    # Left panel: validation accuracy and loss share the same axes.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc_mean, 'b-', label='Acurácia de Validação (Média)')
    plt.fill_between(epochs, acc_mean - acc_std, acc_mean + acc_std,
                     color='blue', alpha=0.2, label='Desvio Padrão (Acurácia)')
    plt.plot(epochs, loss_mean, 'r-', label='Perda de Validação (Média)')
    plt.fill_between(epochs, loss_mean - loss_std, loss_mean + loss_std,
                     color='red', alpha=0.2, label='Desvio Padrão (Perda)')
    plt.title('Desempenho Médio do Modelo nos Folds')
    plt.xlabel('Épocas')
    plt.ylabel('Acurácia / Perda')
    plt.legend(loc='best')
    plt.grid(True)

    # Right panel: one bar per fold plus the mean as a dashed reference line.
    plt.subplot(1, 2, 2)
    fold_labels = [f'Fold {number}' for number in range(1, len(f1_scores) + 1)]
    f1_mean = np.mean(f1_scores)
    bars = plt.bar(fold_labels, f1_scores, color='skyblue', label='F1-Score por Fold')
    plt.axhline(f1_mean, color='crimson', linestyle='--', linewidth=2,
                label=f'F1 Médio: {f1_mean:.4f}')
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.0
        # Write the value just above the top of the bar.
        plt.text(x_center, height + 0.005, f'{height:.4f}', ha='center', va='bottom')
    plt.title('F1-Score (Macro) por Fold de Validação')
    plt.xlabel('Fold')
    plt.ylabel('F1-Score')
    # Leave headroom above the tallest bar for its text label.
    plt.ylim(top=max(f1_scores) * 1.1)
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()

print("\n--- Avaliação Geral da Cross-Validation ---")

# Pool the out-of-fold labels and predictions of every fold into arrays.
all_y_true, all_y_pred = np.array(all_y_true), np.array(all_y_pred)

# Metrics over the pooled predictions rather than averaged per fold.
overall_accuracy = accuracy_score(all_y_true, all_y_pred)
overall_f1_macro = f1_score(all_y_true, all_y_pred, average='macro')
overall_recall_macro = recall_score(all_y_true, all_y_pred, average='macro')

print(f"Acurácia Geral: {overall_accuracy:.4f}")
print(f"F1-Score Geral (Macro): {overall_f1_macro:.4f}")
print(f"Recall Geral (Macro): {overall_recall_macro:.4f}")

# Per-class precision / recall / F1, using the original class names.
print("\nRelatório de Classificação Detalhado:")
detailed_report = classification_report(all_y_true, all_y_pred, target_names=le.classes_)
print(detailed_report)

plot_cv_history(histories, f1_scores)