In [1]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

# Making our own rnn

In [9]:
import numpy as np
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

# Parameters
max_features = 10000  # Tamanho do vocabulário
maxlen = 120  # Tamanho máximo das sequências

# Load dos dados
csv_path = '../../human_or_ai_dataset_small.csv' 
df = pd.read_csv(csv_path)

# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')


In [10]:
# Separar os textos das labels
texts = df['text'].values
labels = df['source'].values

# Criar um tokenizer
tokenizer = preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

# Converter os textos para sequências de inteiros
sequences = tokenizer.texts_to_sequences(texts)

# Padding para uniformizar tamanhos
x_data = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Garantir label numérica
if not np.issubdtype(labels.dtype, np.number):
    label_map = {label: i for i, label in enumerate(np.unique(labels))}
    y_data = np.array([label_map[label] for label in labels])
else:
    y_data = labels

# Data split !
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])

x_train shape: (4040, 120)
y_train shape: (4040,)
x_test shape: (1011, 120)
y_test shape: (1011,)
Sample sequence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   14   19    1  738    2 4758
  109 3150    5    1  619 1381  564    5    4  220  428    2  187   45
 2567  724 3261  111   14 1026    1 3273  482 1822   22 5021    5   38
  111    3   57    8  109    7  244    9    1  623  158  803 7358 3261
   44   14  143    8 1822 7520   96    1  531  373    7  130 3668   14
   57    8  109    7   30  244    5    4 6789   44   10   23  433  472
    6  266   22 1935    5  531   45   66]
Sample label: 1


## Define and train the model

In [11]:
model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()



In [None]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

Epoch 1/10
[1m25/26[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - accuracy: 0.5300 - loss: 0.7013



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - accuracy: 0.5298 - loss: 0.7012 - val_accuracy: 0.5693 - val_loss: 0.6852
Epoch 2/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.5566 - loss: 0.6851 - val_accuracy: 0.5681 - val_loss: 0.6718
Epoch 3/10
[1m25/26[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - accuracy: 0.5855 - loss: 0.6686



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5853 - loss: 0.6689 - val_accuracy: 0.6658 - val_loss: 0.6635
Epoch 4/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.5918 - loss: 0.6658 - val_accuracy: 0.6151 - val_loss: 0.6363
Epoch 5/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.6477 - loss: 0.6366 - val_accuracy: 0.5644 - val_loss: 0.6228
Epoch 6/10
[1m25/26[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 39ms/step - accuracy: 0.7000 - loss: 0.5851



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.6998 - loss: 0.5848 - val_accuracy: 0.7042 - val_loss: 0.5731
Epoch 7/10
[1m25/26[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 38ms/step - accuracy: 0.7470 - loss: 0.5189



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.7477 - loss: 0.5178 - val_accuracy: 0.7153 - val_loss: 0.5452
Epoch 8/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.8257 - loss: 0.4060 - val_accuracy: 0.7067 - val_loss: 0.5755
Epoch 9/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.8674 - loss: 0.3436 - val_accuracy: 0.6906 - val_loss: 0.6136
Epoch 10/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.8975 - loss: 0.2749 - val_accuracy: 0.6770 - val_loss: 0.6865
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6604 - loss: 0.6101
Acurácia no teste: 0.6785


### LSMT

In [14]:
from tensorflow.keras.layers import LSTM

model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()

In [15]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

Epoch 1/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - accuracy: 0.5763 - loss: 0.6610



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 136ms/step - accuracy: 0.5798 - loss: 0.6584 - val_accuracy: 0.8911 - val_loss: 0.3578
Epoch 2/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.9242 - loss: 0.2535



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 124ms/step - accuracy: 0.9244 - loss: 0.2521 - val_accuracy: 0.9257 - val_loss: 0.1857
Epoch 3/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.9654 - loss: 0.1117



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step - accuracy: 0.9656 - loss: 0.1114 - val_accuracy: 0.9319 - val_loss: 0.1878
Epoch 4/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 137ms/step - accuracy: 0.9839 - loss: 0.0528 - val_accuracy: 0.9319 - val_loss: 0.2323
Epoch 5/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step - accuracy: 0.9893 - loss: 0.0326 - val_accuracy: 0.9319 - val_loss: 0.2316
Epoch 6/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.9948 - loss: 0.0203



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 120ms/step - accuracy: 0.9947 - loss: 0.0204 - val_accuracy: 0.9332 - val_loss: 0.2395
Epoch 7/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 122ms/step - accuracy: 0.9935 - loss: 0.0242 - val_accuracy: 0.9282 - val_loss: 0.2281
Epoch 8/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 119ms/step - accuracy: 0.9944 - loss: 0.0216 - val_accuracy: 0.9307 - val_loss: 0.2382
Epoch 9/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - accuracy: 0.9965 - loss: 0.0147 - val_accuracy: 0.9208 - val_loss: 0.2911
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9468 - loss: 0.2289
Accuracy no teste: 0.9397


### GRU

In [16]:
from tensorflow.keras.layers import GRU

model = Sequential()

model.add(Input((maxlen,))) 
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))  
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  

model.summary()



In [17]:
# Compilar o modelo
model.compile(
    optimizer='adam',  
    loss='binary_crossentropy',  
    metrics=['accuracy']
)

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Salvar o melhor modelo
model_checkpoint = ModelCheckpoint(
    'best_model_rnn.h5',
    monitor='val_accuracy',
    save_best_only=True
)

# Treinar o modelo com callbacks
history = model.fit(
    x_train, y_train,
    epochs=10, 
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint]
)

# Avaliar no conjunto de teste
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Accuracy no teste: {test_acc:.4f}")

Epoch 1/10
[1m25/26[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 101ms/step - accuracy: 0.5694 - loss: 0.6770



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 121ms/step - accuracy: 0.5722 - loss: 0.6744 - val_accuracy: 0.8156 - val_loss: 0.4522
Epoch 2/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.8730 - loss: 0.3275



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 103ms/step - accuracy: 0.8735 - loss: 0.3261 - val_accuracy: 0.8911 - val_loss: 0.2447
Epoch 3/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.9537 - loss: 0.1312



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 113ms/step - accuracy: 0.9536 - loss: 0.1313 - val_accuracy: 0.9047 - val_loss: 0.2228
Epoch 4/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 111ms/step - accuracy: 0.9782 - loss: 0.0614 - val_accuracy: 0.8936 - val_loss: 0.2748
Epoch 5/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.9866 - loss: 0.0408



[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 123ms/step - accuracy: 0.9866 - loss: 0.0407 - val_accuracy: 0.9146 - val_loss: 0.2880
Epoch 6/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 105ms/step - accuracy: 0.9935 - loss: 0.0239 - val_accuracy: 0.9059 - val_loss: 0.3241
Epoch 7/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 110ms/step - accuracy: 0.9949 - loss: 0.0165 - val_accuracy: 0.9097 - val_loss: 0.3125
Epoch 8/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - accuracy: 0.9939 - loss: 0.0178 - val_accuracy: 0.9146 - val_loss: 0.3276
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9237 - loss: 0.2841
Accuracy no teste: 0.9199


# Predicting for the Competition

In [None]:
## NAO FEITO

# Carregar o CSV com dados para prever
prediction_csv_path = '../../dataset1_inputs.csv'
df_predict = pd.read_csv(prediction_csv_path, sep="\t")

# Verificar os dados carregados
print("Prediction dataset shape:", df_predict.shape)
print("Columns:", df_predict.columns)
print("Sample IDs:", df_predict['ID'].head())

# Pré-processar os dados de texto para corresponder ao formato de treinamento
# Converter textos para sequências
sequences = tokenizer.texts_to_sequences(df_predict['Text'].values)

# Padding das sequências para o mesmo tamanho usado no treinamento
x_predict = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

# Fazer previsões com o modelo treinado
predictions = model.predict(x_predict)

# Converter probabilidades para labels binários (0 = Human, 1 = AI)
# Usando 0.5 como threshold - você pode ajustar isso conforme necessário
labels = (predictions > 0.5).astype(int)

# Mapear labels para "AI" e "Human"
label_mapping = {1: "AI", 0: "Human"}
labels_mapped = [label_mapping[label] for label in labels.flatten()]

# Criar um DataFrame com os resultados
results_df = pd.DataFrame({
    'ID': df_predict['ID'],
    'Label': labels_mapped
})

# Exibir uma amostra dos resultados
print("\nAmostra dos resultados de previsão:")
print(results_df.head())

# Salvar em CSV
output_csv_path = 'prediction_results.csv'
results_df.to_csv(output_csv_path, sep="\t", index=False)
print(f"\nResultados salvos em {output_csv_path}")

Prediction dataset shape: (30, 2)
Columns: Index(['ID', 'Text'], dtype='object')
Sample IDs: 0    D1-1
1    D1-2
2    D1-3
3    D1-4
4    D1-5
Name: ID, dtype: object
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

Sample of prediction results:
     ID  Label
0  D1-1  Human
1  D1-2  Human
2  D1-3     AI
3  D1-4     AI
4  D1-5  Human

Results saved to prediction_results.csv
