In [6]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers
from sklearn.metrics import accuracy_score
from itertools import product

# ========= 1. Setup =========
os.environ['TF_DETERMINISTIC_OPS'] = '1'
np.random.seed(42)
tf.random.set_seed(42)

# ========= 2. Carregar dados =========
def load_dataset(path, sep='\t'):
    df = pd.read_csv(path, sep=sep, encoding='utf-8')
    df.columns = df.columns.str.strip()
    return df

X_train = load_dataset("data/dataset_training_input.csv")
y_train = load_dataset("data/dataset_training_output.csv")
X_val = load_dataset("data/dataset_validation_input.csv")
y_val = load_dataset("data/dataset_validation_output.csv")
X_test = load_dataset("data/dataset3_inputs.csv")
ids_test = X_test["ID"]

df_outputs = load_dataset("data/outputs3_reais.csv", sep=';')
y_true = df_outputs["Label"].map({"AI": 1, "Human": 0}).values

# ========= 3. Tokenização =========
max_words = 20000
max_len = 500

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train["Text"])

def tokenize_pad(texts):
    seq = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seq, maxlen=max_len)

X_train_pad = tokenize_pad(X_train["Text"])
X_val_pad = tokenize_pad(X_val["Text"])
X_test_pad = tokenize_pad(X_test["Text"])

y_train = y_train["Label"].map({"AI": 1, "Human": 0}).values
y_val = y_val["Label"].map({"AI": 1, "Human": 0}).values

# ========= 4. Hiperparâmetros a testar (reduzidos) =========
embedding_dims = [100, 128]
dropouts = [0.3, 0.4]
dense_units = [(64, 32), (128, 64)]
batch_sizes = [32]

# epochs fixo
epochs = 15

combos = list(product(embedding_dims, dropouts, dense_units, batch_sizes))

# ========= 5. Testar combinações =========
melhor_acc = 0
melhor_combo = None

for emb_dim, dropout_rate, (dense1, dense2), batch_size in combos:
    print(f"\n🔍 Testar: embed={emb_dim}, drop={dropout_rate}, dense=({dense1},{dense2}), batch={batch_size}, epochs={epochs}")

    model = Sequential([
        Input((max_len,)),
        Embedding(max_words, emb_dim, embeddings_initializer=initializers.GlorotUniform(seed=44)),
        Flatten(),
        Dense(dense1, activation='relu'),
        Dropout(dropout_rate),
        Dense(dense2, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_val_pad, y_val), verbose=0)

    preds = model.predict(X_test_pad)
    pred_labels = (preds.flatten() > 0.5).astype(int)

    acc = accuracy_score(y_true, pred_labels)
    print(f"🎯 Accuracy vs dados reais: {acc:.4f}")

    if acc > melhor_acc:
        melhor_acc = acc
        melhor_combo = (emb_dim, dropout_rate, dense1, dense2, batch_size)

# ========= 6. Mostrar resultado final =========
print("\n✅ MELHOR COMBINAÇÃO ENCONTRADA:")
print(f"🔧 embed={melhor_combo[0]}, dropout={melhor_combo[1]}, dense=({melhor_combo[2]},{melhor_combo[3]}), "
      f"batch={melhor_combo[4]}, epochs={epochs}")
print(f"✅ Accuracy: {melhor_acc:.4f}")



🔍 Testar: embed=100, drop=0.3, dense=(64,32), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.8200

🔍 Testar: embed=100, drop=0.3, dense=(128,64), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.8200

🔍 Testar: embed=100, drop=0.4, dense=(64,32), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.7600

🔍 Testar: embed=100, drop=0.4, dense=(128,64), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.7600

🔍 Testar: embed=128, drop=0.3, dense=(64,32), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.7600

🔍 Testar: embed=128, drop=0.3, dense=(128,64), batch=32, epochs=15
🎯 Accuracy vs dados reais: 0.7700

🔍 Testar: embed=128, drop=0.4, dense=(64,32), batch=32, epochs=15


KeyboardInterrupt: 