In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import initializers

# =====================
# 1. Seeds and setup
# =====================
# One shared seed value feeds Python's, NumPy's and TensorFlow's generators.
SEED = 42

# Set the TensorFlow deterministic-ops flag in the process environment.
os.environ.update({'TF_DETERMINISTIC_OPS': '1'})

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# =====================
# 2. Load data
# =====================
def load_dataset(file_path, sep='\t'):
    """Read a delimited UTF-8 text file into a pandas DataFrame.

    Args:
        file_path: Location of the file to read.
        sep: Column delimiter; defaults to a tab character.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    read_options = {"sep": sep, "encoding": "utf-8"}
    frame = pd.read_csv(file_path, **read_options)
    return frame

# Training pair (inputs, labels).
# NOTE(review): the training data is read from files named test_* — confirm
# this is the intended source.
X_train, y_train = (
    load_dataset("../data/test_input.csv"),
    load_dataset("../data/test_output.csv"),
)

# Validation pair (inputs, labels).
X_val, y_val = (
    load_dataset("../data/human_ai_input.csv"),
    load_dataset("../data/human_ai_output.csv"),
)

# Inputs to predict on; their IDs are kept for the exported results table.
X_test = load_dataset("../data/dataset3_inputs.csv")
ids = X_test["ID"]

# =====================
# 3. Tokenization
# =====================
# max_words is passed to the Tokenizer as num_words (and reused as the
# Embedding input_dim); max_len is the maxlen used when padding sequences.
max_words, max_len = 15000, 500

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train["Text"])

def tokenize_pad(texts):
    """Turn texts into integer sequences and pad them with maxlen=max_len.

    Uses the module-level ``tokenizer`` and ``max_len``; padding and
    truncation sides are left at the ``pad_sequences`` defaults.
    """
    return preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
    )

# Apply the same tokenize-and-pad step to the "Text" column of every split.
X_train_pad, X_val_pad, X_test_pad = (
    tokenize_pad(frame["Text"]) for frame in (X_train, X_val, X_test)
)

# =====================
# 4. Prepare labels
# =====================
def _encode_labels(frame, split_name):
    """Map the "Label" column of *frame* to 1 ("AI") / 0 ("Human").

    Args:
        frame: DataFrame holding a "Label" column.
        split_name: Name used in the error message to identify the split.

    Returns:
        A NumPy integer array with one entry per row of *frame*.

    Raises:
        ValueError: If any label is missing or is not "AI"/"Human".
            ``Series.map`` would otherwise turn such entries into NaN, and
            the NaN targets would reach the loss without any warning.
    """
    encoded = frame["Label"].map({"AI": 1, "Human": 0})
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = frame.loc[unmapped, "Label"].unique().tolist()
        raise ValueError(
            f"{split_name}: {int(unmapped.sum())} unmapped label(s): {bad_values}"
        )
    return encoded.astype("int64").values


y_train = _encode_labels(y_train, "y_train")
y_val = _encode_labels(y_val, "y_val")

# =====================
# 5. Build the RNN (LSTM) model
# =====================
embedding_dim = 100

# Layers are added in forward order: token ids -> embeddings -> LSTM ->
# dropout -> dense head with a single sigmoid output.
model = Sequential()
model.add(Input(shape=(max_len,)))
model.add(
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        # Explicit seed on the embedding initializer.
        embeddings_initializer=initializers.GlorotUniform(seed=44),
    )
)
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# =====================
# 6. Train
# =====================
# Fit on the padded training sequences, evaluating on the validation pair
# after each epoch.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=32,
    epochs=5,
    verbose=1,
)

# =====================
# 7. Predict for dataset3
# =====================
preds = model.predict(X_test_pad)

# Scores strictly above 0.5 are labelled "AI"; everything else is "Human".
pred_labels = np.where(preds.flatten() > 0.5, "AI", "Human").tolist()

# =====================
# 8. Export results
# =====================
# Single source of truth for the destination, so the confirmation message
# always names the file that was actually written.
output_path = "../data/previsao-rnn-s2.csv"

# One row per test ID with its predicted label.
output_df = pd.DataFrame({
    "ID": ids,
    "Label": pred_labels,
})

# Tab-separated, without the DataFrame index column.
output_df.to_csv(output_path, sep="\t", index=False)
print(f"✅ Ficheiro de submissão gerado: {output_path}")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
✅ Ficheiro de submissão gerado: submissao2-grupo007-rnn.csv
