In [33]:
import numpy as np
import pandas as pd
import os
import random

def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

In [34]:
import numpy as np
import pandas as pd
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Input, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split


# Load dos dados
csv_path = '../../datasets/human_or_ai_dataset_small.csv'  # Change this to your file path
df = pd.read_csv(csv_path)
# Sanity check!
print("Dataset shape:", df.shape)
print("Columns:", df.columns)

Dataset shape: (5051, 2)
Columns: Index(['text', 'source'], dtype='object')


In [35]:
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Parameters
max_length = 600
max_tokens = 20000

# Extract texts and labels
texts = df['text'].values
labels = df['source'].values

# Convert labels to numeric values
label_map = {'human': 0, 'ai': 1}
y_data = np.array([label_map[label] for label in labels])

# Define TextVectorization layer
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Adapt to the text dataset
text_vectorization.adapt(texts)

# Transform text data into tokenized sequences
x_data = text_vectorization(texts).numpy()  # Convert TensorFlow tensor to NumPy array

# Split data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Check shapes
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Print a sample
print("Sample sequence:", x_train[0])
print("Sample label:", y_train[0])


x_train shape: (4040, 600)
y_train shape: (4040,)
x_test shape: (1011, 600)
y_test shape: (1011,)
Sample sequence: [   15    20     2   680     3  4751   121  6079  3152     6     2   624
  1302   524     6     5   216   432     3     1  1533     1  3011   104
    15   954     2  2998   505 18556    23  5046  6078     6    38   104
     4    56     9   121  6078     8   224    10     2   583   144  8081
  3011    44    15   129     9 18556  7232    91     2   627   378     8
   150  3530    15    56     9   121  6078     8    31   224     6     5
  6757    44    11    24   489   463     7   246    23  1891     6   627
    47    67     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   

In [36]:
import tensorflow as tf
import numpy as np

# Set parameters
batch_size = 16
val_fraction = 0.15
seed = 25

# Shuffle the entire dataset with a fixed seed
dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = dataset.shuffle(buffer_size=len(x_train), seed=seed)

# Calculate split sizes
val_size = int(len(x_train) * val_fraction)
train_size = len(x_train) - val_size

# Create training and validation datasets
train_ds = dataset.skip(val_size).batch(batch_size)
val_ds = dataset.take(val_size).batch(batch_size)

# Create test dataset
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size)

train_ds = dataset.skip(val_size).batch(batch_size)

# Transforme com Encoder

In [37]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [38]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

#    def compute_mask(self, inputs, mask=None):
#        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [39]:
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

# Early stopping 
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("full_transformer_encoder.h5",
                                    save_best_only=True),
                                   early_stopping
]
model.fit(train_ds, validation_data=val_ds, epochs=20, callbacks=callbacks)
model = keras.models.load_model(
    "full_transformer_encoder.keras",
    custom_objects={"TransformerEncoder": TransformerEncoder,
                    "PositionalEmbedding": PositionalEmbedding})
print(f"Test acc: {model.evaluate(test_ds)[1]:.3f}")

Epoch 1/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step - accuracy: 0.6576 - loss: 0.8759



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 409ms/step - accuracy: 0.6581 - loss: 0.8742 - val_accuracy: 0.8960 - val_loss: 0.2026
Epoch 2/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399ms/step - accuracy: 0.8859 - loss: 0.2757



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 425ms/step - accuracy: 0.8859 - loss: 0.2756 - val_accuracy: 0.9670 - val_loss: 0.0989
Epoch 3/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 430ms/step - accuracy: 0.9386 - loss: 0.1595 - val_accuracy: 0.9422 - val_loss: 0.1242
Epoch 4/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409ms/step - accuracy: 0.9560 - loss: 0.1147



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 435ms/step - accuracy: 0.9560 - loss: 0.1147 - val_accuracy: 0.9719 - val_loss: 0.0847
Epoch 5/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421ms/step - accuracy: 0.9566 - loss: 0.1061



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 446ms/step - accuracy: 0.9566 - loss: 0.1061 - val_accuracy: 0.9818 - val_loss: 0.0558
Epoch 6/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 453ms/step - accuracy: 0.9644 - loss: 0.0877 - val_accuracy: 0.9736 - val_loss: 0.0873
Epoch 7/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 454ms/step - accuracy: 0.9762 - loss: 0.0747 - val_accuracy: 0.9109 - val_loss: 0.2010
Epoch 8/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421ms/step - accuracy: 0.9721 - loss: 0.0766



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 448ms/step - accuracy: 0.9722 - loss: 0.0766 - val_accuracy: 0.9901 - val_loss: 0.0270
Epoch 9/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 447ms/step - accuracy: 0.9858 - loss: 0.0369 - val_accuracy: 0.9901 - val_loss: 0.0360
Epoch 10/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426ms/step - accuracy: 0.9886 - loss: 0.0361



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 452ms/step - accuracy: 0.9886 - loss: 0.0361 - val_accuracy: 0.9934 - val_loss: 0.0179
Epoch 11/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step - accuracy: 0.9871 - loss: 0.0297



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 446ms/step - accuracy: 0.9871 - loss: 0.0297 - val_accuracy: 1.0000 - val_loss: 0.0042
Epoch 12/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 449ms/step - accuracy: 0.9890 - loss: 0.0353 - val_accuracy: 1.0000 - val_loss: 0.0048
Epoch 13/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 450ms/step - accuracy: 0.9926 - loss: 0.0215 - val_accuracy: 0.9917 - val_loss: 0.0284
Epoch 14/20
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428ms/step - accuracy: 0.9970 - loss: 0.0128



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 454ms/step - accuracy: 0.9969 - loss: 0.0128 - val_accuracy: 1.0000 - val_loss: 0.0013




[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 151ms/step - accuracy: 0.9437 - loss: 0.1347
Test acc: 0.949


In [40]:
print(f"Test acc: {model.evaluate(test_ds)[1]:.3f}")

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 144ms/step - accuracy: 0.9437 - loss: 0.1347
Test acc: 0.949
