# Classificador de Texto: AI vs Human
Este notebook treina um modelo para distinguir textos gerados por inteligência artificial de textos escritos por humanos.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers

## 1. Definir semente para garantir reprodutibilidade

In [None]:
# Request deterministic TensorFlow op implementations via the environment.
os.environ["TF_DETERMINISTIC_OPS"] = "1"

# Seed every random number generator used here (Python, NumPy, TensorFlow)
# with the same fixed value so repeated runs start from the same state.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

## 2. Carregar os dados de treino, validação e teste

In [None]:
def load_dataset(path, sep='\t'):
    """Read a delimited UTF-8 text file into a pandas DataFrame.

    Args:
        path: Location of the file to read.
        sep: Field delimiter; defaults to a tab character.

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.
    """
    read_options = {"sep": sep, "encoding": "utf-8"}
    return pd.read_csv(path, **read_options)

# Training split: inputs and labels are read from two separate files.
# NOTE(review): the training data comes from files named "test_*" -- confirm
# these are the intended training files.
X_train, y_train = map(
    load_dataset, ("data/test_input.csv", "data/test_output.csv")
)

# Validation split, loaded the same way (inputs file, then labels file).
X_val, y_val = map(
    load_dataset, ("data/human_ai_input.csv", "data/human_ai_output.csv")
)

# Inputs to predict on; the ID column is kept so predictions can be written
# out next to their identifiers.
X_test = load_dataset("data/dataset3_inputs.csv")
ids = X_test["ID"]

## 3. Tokenização dos textos para vetorizar as frases

In [None]:
# Vocabulary size limit passed to the tokenizer, and the fixed sequence
# length every text is padded/truncated to.
max_words = 20_000
max_len = 500

# The vocabulary is learned from the training texts only; "<OOV>" is the
# placeholder token used for words outside that vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train["Text"])

def tokenize_pad(texts):
    """Turn raw texts into fixed-length integer sequences.

    Uses the module-level ``tokenizer`` to map each text to a sequence of
    word indices, then passes the sequences through ``pad_sequences`` with
    ``maxlen=max_len`` (module-level) so they all share one length.

    Args:
        texts: Iterable of text strings to encode.

    Returns:
        The output of ``pad_sequences`` for the encoded texts.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len)
    return padded

# Encode the "Text" column of each split with the shared tokenizer, in the
# order train -> validation -> test.
X_train_pad, X_val_pad, X_test_pad = (
    tokenize_pad(split["Text"]) for split in (X_train, X_val, X_test)
)

## 4. Converter rótulos para 0 (Human) e 1 (AI)

In [None]:
# Mapping from the textual class names to the binary targets used in training.
LABEL_TO_ID = {"AI": 1, "Human": 0}


def encode_labels(frame, column="Label"):
    """Convert the textual labels of ``frame[column]`` to a 0/1 integer array.

    ``Series.map`` turns any value missing from the mapping into NaN, which
    would silently feed NaN targets into the loss. This helper fails fast
    instead.

    Args:
        frame: DataFrame holding the label column.
        column: Name of the label column; defaults to ``"Label"``.

    Returns:
        A NumPy ``int64`` array with 1 for "AI" and 0 for "Human".

    Raises:
        ValueError: If the column contains a value other than "AI"/"Human"
            (including missing values).
    """
    encoded = frame[column].map(LABEL_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(frame.loc[unmapped, column].astype(str)))
        raise ValueError(
            f"Unexpected value(s) in column '{column}': {unknown}; "
            f"expected one of {sorted(LABEL_TO_ID)}"
        )
    return encoded.astype("int64").values


y_train = encode_labels(y_train)
y_val = encode_labels(y_val)

## 5. Criar o modelo de rede neural

In [None]:
# Size of each learned word vector.
embedding_dim = 128

# Fixed-seed initializer for the embedding weights.
embedding_init = initializers.GlorotUniform(seed=44)

# Feed-forward classifier: embed each token, flatten the embedded sequence
# into a single vector, then apply two hidden dense layers (with dropout
# after the first) and a single sigmoid output unit.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        embeddings_initializer=embedding_init,
    ),
    Flatten(),
    Dense(units=128, activation="relu"),
    Dropout(rate=0.4),
    Dense(units=64, activation="relu"),
    Dense(units=1, activation="sigmoid"),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

## 6. Treinar o modelo

In [None]:
# Train for 15 epochs in mini-batches of 64, evaluating on the validation
# split after each epoch; the returned object is kept in `history`.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=15,
    verbose=1,
    validation_data=(X_val_pad, y_val),
)

## 7. Fazer previsões no conjunto de teste

In [None]:
# Model outputs for the test inputs (one sigmoid value per text).
preds = model.predict(X_test_pad)

# Threshold at 0.5: strictly greater means "AI", everything else "Human".
pred_labels = np.where(preds.ravel() > 0.5, "AI", "Human").tolist()

## 8. Exportar os resultados

In [None]:
# Destination of the tab-separated submission file.
submission_path = "data/submissao2-grupo5-s1.csv"

# Pair each test ID with its predicted label and write the table without the
# DataFrame index column.
output_df = pd.DataFrame({"ID": ids, "Label": pred_labels})
output_df.to_csv(submission_path, index=False, sep="\t")
print("Ficheiro de submissão gerado: submissao2-grupo5-s1.csv")