In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# =====================
# 1. Setup and seeding
# =====================
# Set right after the imports, before any model code runs.
# NOTE(review): presumably asks TensorFlow for deterministic op
# implementations -- confirm this flag is honoured by the installed version.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Seed Python's, NumPy's and TensorFlow's random number generators with the
# same fixed value.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# =====================
# 2. Load data
# =====================
def load_dataset(file_path, sep='\t'):
    """Read a delimited UTF-8 text file into a pandas DataFrame.

    Args:
        file_path: Location of the file; passed straight to ``pd.read_csv``.
        sep: Column delimiter. Defaults to a tab character.

    Returns:
        The ``DataFrame`` produced by ``pd.read_csv``.
    """
    read_options = {'sep': sep, 'encoding': 'utf-8'}
    frame = pd.read_csv(file_path, **read_options)
    return frame

# Training and validation splits: the "Text" inputs and the "Label" outputs
# live in separate files, loaded here as (input, output) pairs.
X_train, y_train = (
    load_dataset("data/dataset_training_input.csv"),
    load_dataset("data/dataset_training_output.csv"),
)
X_val, y_val = (
    load_dataset("data/dataset_validation_input.csv"),
    load_dataset("data/dataset_validation_output.csv"),
)

# Set to predict on; its "ID" column is kept for the output file written later.
X_test = load_dataset("data/dataset3_inputs.csv")
ids = X_test["ID"]

# =====================
# 3. Tokenization
# =====================
# Vocabulary cap handed to the tokenizer; also used as the Embedding input size.
max_words = 15000
# Length every sequence is brought to by pad_sequences; also the model input length.
max_len = 500

# The tokenizer is fitted on the training texts only; "<OOV>" is configured as
# its out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=max_words,
)
tokenizer.fit_on_texts(X_train["Text"])

def tokenize_pad(texts):
    """Convert raw texts into sequences of a common length.

    The module-level ``tokenizer`` turns each text into a token-index
    sequence, and ``pad_sequences`` (with its default padding/truncation
    settings, since none are overridden here) brings each one to ``max_len``.

    Args:
        texts: Collection of text strings accepted by
            ``tokenizer.texts_to_sequences``.

    Returns:
        The output of ``pad_sequences`` for the tokenized texts.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len)

# Tokenize and pad the "Text" column of each split, in train/val/test order.
X_train_pad, X_val_pad, X_test_pad = (
    tokenize_pad(split["Text"]) for split in (X_train, X_val, X_test)
)

# =====================
# 4. Labels
# =====================
# Single source of truth for the binary target encoding used by both splits.
label_map = {"AI": 1, "Human": 0}


def _encode_labels(frame, split_name):
    """Map the "Label" column of *frame* to 0/1 values.

    Args:
        frame: DataFrame holding a "Label" column.
        split_name: Human-readable split name used in the error message.

    Returns:
        The ``.values`` array of the mapped column.

    Raises:
        ValueError: If any label is missing from ``label_map``. ``Series.map``
            would otherwise turn it into NaN and it would silently be used as
            a training target.
    """
    encoded = frame["Label"].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, "Label"].unique().tolist()
        raise ValueError(
            f"Unexpected labels in {split_name} set: {unknown}; "
            f"expected one of {sorted(label_map)}"
        )
    return encoded.values


y_train = _encode_labels(y_train, "training")
y_val = _encode_labels(y_val, "validation")

# =====================
# 5. Bidirectional LSTM model
# =====================
# Width of the Embedding layer's output vectors.
embedding_dim = 100

# Layers in forward order. The LSTM returns only its final output
# (return_sequences=False), and the last layer is a single sigmoid unit whose
# output is thresholded at 0.5 when predicting.
layer_stack = [
    Input(shape=(max_len,)),
    Embedding(max_words, embedding_dim),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layer_stack)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# =====================
# 6. Train
# =====================
# Fit on the training split and evaluate on the validation split after each
# epoch; the returned history object is kept in `history`.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=32,
    epochs=6,
    verbose=1,
)

# =====================
# 7. Predict on dataset3
# =====================
preds = model.predict(X_test_pad)
# Scores strictly above 0.5 are labelled "AI", everything else "Human";
# .tolist() keeps pred_labels a plain list of str.
pred_labels = np.where(preds.ravel() > 0.5, "AI", "Human").tolist()

# =====================
# 8. Save submission
# =====================
# Keep the destination in one place so the confirmation message below always
# names the file that was actually written (it previously reported a
# different, hard-coded file name).
output_path = "data/previsao-Bidirectional-LSTM-s2.csv"

# Two columns: the IDs read from the dataset3 input file and the predicted labels.
output_df = pd.DataFrame({
    "ID": ids,
    "Label": pred_labels,
})
output_df.to_csv(output_path, sep="\t", index=False)
print(f"✅ Ficheiro de submissão gerado: {output_path}")


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
✅ Ficheiro de submissão gerado: submissao2-bilstm.csv
