In [12]:
import numpy as np
import sys
import os
import pandas as pd
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Imports do modelo
from layers import SigmoidActivation, DenseLayer
from functions import BinaryCrossEntropy, accuracy
from networks import NeuralNetwork
from optimizations import RetGradient, L2Reg

# Translation table that deletes every character in ``string.punctuation``.
# Built once at module level so it is not rebuilt for every row when the
# helper below is used through ``Series.apply``.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Helper to strip punctuation (optional preprocessing step)
def remove_punctuation(text: str) -> str:
    """Return *text* with all ASCII punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are deleted; letters,
    digits, whitespace and any non-ASCII symbol are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Load the tab-separated data files, in the order of the names on the left.
# NOTE(review): the training frames are read from files named 'test_*' and the
# validation labels from 'teste_out.csv' — confirm these paths are intended.
(
    train_input,
    train_output,
    validation_input,
    validation_output,
    test_input,  # no output file is read for this one
) = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        'data/test_input.csv',
        'data/test_output.csv',
        'data/test_in.csv',
        'data/teste_out.csv',
        'data/dataset2_inputs.csv',
    )
)

# Pre-processing: optionally strip punctuation from the 'Text' column of
# every split. Disabled by default; flip the flag to enable it.
rem_punctuation = False
if rem_punctuation:
    for frame in (train_input, validation_input, test_input):
        frame['Text'] = frame['Text'].apply(remove_punctuation)

# Extract the raw texts and the binary targets from the loaded frames.
def _label_to_binary(label):
    """Return 1 when *label* equals 'AI', otherwise 0."""
    if label == 'AI':
        return 1
    return 0


# Training split: texts plus a float32 column vector of 0/1 targets.
X_train_raw = train_input['Text'].values
train_targets = train_output['Label'].map(_label_to_binary)
y_train = train_targets.astype(np.float32).values.reshape(-1, 1)

# Validation split: same encoding as the training targets.
X_validation_raw = validation_input['Text'].values
validation_targets = validation_output['Label'].map(_label_to_binary)
y_validation = validation_targets.astype(np.float32).values.reshape(-1, 1)

# Test split: texts and their IDs only.
X_test_raw = test_input['Text'].values
ids = test_input['ID'].values
y_test = None  # not provided

# Vectorization + normalization: TF-IDF features, converted to dense arrays
# with .toarray(), then passed through a StandardScaler.
vectorizer = TfidfVectorizer()
scaler = StandardScaler()

# Training split: both transformers are fitted here and only here.
X_train = scaler.fit_transform(vectorizer.fit_transform(X_train_raw).toarray())

# Validation and test splits reuse the already-fitted transformers.
X_validation = scaler.transform(vectorizer.transform(X_validation_raw).toarray())
X_test = scaler.transform(vectorizer.transform(X_test_raw).toarray())

# Build model: optimizer, loss and regularizer first, then the network itself.
optimizer = RetGradient(learning_rate=0.001, momentum=0.9)
loss = BinaryCrossEntropy()
regulator = L2Reg(l2_val=0.001)

model = NeuralNetwork(
    epochs=200,
    batch_size=30,
    optimizer=optimizer,
    regulator=regulator,
    verbose=True,
    loss=loss,
    metric=accuracy,
    patience=20,
    min_delta=0.001,
)

# Report the array shapes before training.
print('Training set shape:', X_train.shape)
print(X_train.shape, y_train.shape, X_validation.shape, y_validation.shape)
n_features = X_train.shape[1]

# Layer stack: dense layers of 64, 32, 16 and 1 units, each followed by a
# sigmoid activation. Only the first dense layer receives the input shape.
model.add(DenseLayer(64, (n_features,)))
model.add(SigmoidActivation())
for units in (32, 16, 1):
    model.add(DenseLayer(units))
    model.add(SigmoidActivation())

# Train network: the validation split is handed to fit() as X_val / y_val.
validation_split = {'X_val': X_validation, 'y_val': y_validation}
model.fit(X_train, y_train, **validation_split)

# Plot learning curves
model.plot_train_curves()

# Predict test set
out = model.predict(X_test)

# Optional evaluation: runs only when test labels are available.
if y_test is not None:
    test_score = model.score(y_test, out)
    print(test_score)

# Save results: write the predictions as a tab-separated file, creating the
# destination folder first if it does not exist yet.
results_filepath = 'data/previsao-s1.csv'
output_dir = os.path.dirname(results_filepath)
os.makedirs(output_dir, exist_ok=True)

# A prediction whose first component rounds to 1 is labelled 'AI';
# everything else is labelled 'Human'.
predicted_labels = ['AI' if round(pred[0]) == 1 else 'Human' for pred in out]

results = pd.DataFrame({'ID': ids, 'Label': predicted_labels})
results.to_csv(results_filepath, sep='\t', index=False)


Training set shape: (3000, 15894)
(3000, 15894) (3000, 1) (1000, 15894) (1000, 1)
Epoch 1/200 - loss: 2132.5767 - accuracy: 0.4890
Epoch 2/200 - loss: 2084.8617 - accuracy: 0.4940
Epoch 3/200 - loss: 2082.4654 - accuracy: 0.4957
Epoch 4/200 - loss: 2081.2541 - accuracy: 0.5060
Epoch 5/200 - loss: 2080.1295 - accuracy: 0.5117
Epoch 6/200 - loss: 2078.0194 - accuracy: 0.5107
Epoch 7/200 - loss: 2076.0092 - accuracy: 0.5200
Epoch 8/200 - loss: 2074.5780 - accuracy: 0.5153
Epoch 9/200 - loss: 2073.0273 - accuracy: 0.5260
Epoch 10/200 - loss: 2071.2707 - accuracy: 0.5380
Epoch 11/200 - loss: 2071.2069 - accuracy: 0.5370
Epoch 12/200 - loss: 2067.7563 - accuracy: 0.5423
Epoch 13/200 - loss: 2066.6441 - accuracy: 0.5373
Epoch 14/200 - loss: 2063.9117 - accuracy: 0.5513
Epoch 15/200 - loss: 2062.4139 - accuracy: 0.5460
Epoch 16/200 - loss: 2059.1204 - accuracy: 0.5593
Epoch 17/200 - loss: 2057.5700 - accuracy: 0.5603
Epoch 18/200 - loss: 2055.1441 - accuracy: 0.5697
Epoch 19/200 - loss: 2051.7