# Aprendizagem Profunda
### Tarefa III
1. Rúben Gonçalo Araújo da Silva pg57900   
2. José Luis Fraga Costa pg55970
3. Pedro Miguel Costa Azevedo pg57897
4. Rui Pedro Fernandes Madeira Pinto pg56010

# Implementação

### imports

1. pandas
2. tensorflow
3. sklearn
4. transformers

In [21]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import GRU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline
from tensorflow.keras.layers import LSTM


### Input dos dados

In [22]:
# Load the combined training dataset; the cells below read its "ID", "Text"
# and "Label" columns.
df = pd.read_csv("data/combined_dataset_treino.csv")

# Convenience views over the raw columns. Despite the `test_` prefix these
# cover every row of the file at this point; `test_ids` is rebound to the
# held-out subset by the train/test split cell.
test_texts = df.loc[:, "Text"]
test_ids = df.loc[:, "ID"].fillna("")

### Tokenizer

In [23]:
# Word-level tokenizer limited to the 10,000 most frequent words; anything
# else is replaced by the "<OOV>" token.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/val/test split, so validation/test words influence it.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["Text"])
sequences = tokenizer.texts_to_sequences(df["Text"])

# Right-pad every sequence to the length of the longest text.
# NOTE(review): no `maxlen` is set, so a single very long text fixes the
# input width (and the size of the Flatten/Dense layers) for every model.
padded_sequences = pad_sequences(sequences, padding="post")

# Encode the string labels as 0 (Human) / 1 (AI). The guard makes the cell
# safe to re-run: mapping an already-numeric column a second time would turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df["Label"]):
    df["Label"] = df["Label"].map({"Human": 0, "AI": 1})

### Train Test Split (divisão de dados)

In [24]:
# Two-stage split of sequences, labels and IDs (kept aligned row-for-row).
# The same seed is used for both stages so the partition is reproducible.
_split_seed = 42

# Stage 1: hold out 15% of all rows as the test set.
X_temp, X_test, y_temp, y_test, id_temp, test_ids = train_test_split(
    padded_sequences,
    df["Label"],
    df["ID"],
    test_size=0.15,
    random_state=_split_seed,
)

# Stage 2: 0.1765 of the remaining 85% is ~15% of the full dataset, which
# gives roughly a 70/15/15 train/validation/test partition.
X_train, X_val, y_train, y_val, id_train, id_val = train_test_split(
    X_temp,
    y_temp,
    id_temp,
    test_size=0.1765,
    random_state=_split_seed,
)

### Early Stopping
(atualmente só no DNN e RNN)

In [25]:
# Shared early-stopping callback: watch the validation loss (lower is
# better), tolerate 10 epochs without improvement, then roll the model back
# to the weights of its best epoch.
_early_stopping_options = {
    "monitor": "val_loss",
    "mode": "min",
    "patience": 10,
    "restore_best_weights": True,
}
early_stopping = EarlyStopping(**_early_stopping_options)

## DNN

In [26]:

# Feed-forward baseline: embed each token, flatten the whole padded sequence
# into one vector and classify it with two small dense layers. The sigmoid
# output is the predicted probability of the "AI" class (label 1).
model_dnn = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=X_train.shape[1]),
    Flatten(),
    Dense(32, activation="relu"),
    Dropout(0.5),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid")
])

model_dnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# Validate on the dedicated validation split. Using the test split here would
# let early stopping select weights on the very data the model is evaluated
# on afterwards, making the reported test metrics optimistic.
# NOTE(review): with epochs=1 the early-stopping patience of 10 can never
# trigger; raise `epochs` for a real training run.
model_dnn.fit(X_train, y_train, epochs=1, validation_data=(X_val, y_val), batch_size=32, callbacks=[early_stopping])




KeyboardInterrupt: 

## RNN

In [None]:
# Two stacked SimpleRNN layers over the token embeddings. The first layer
# returns the full sequence so the second can consume it step by step; the
# sigmoid output is the predicted probability of the "AI" class (label 1).
model_rnn = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=X_train.shape[1]),
    SimpleRNN(64, return_sequences=True),
    SimpleRNN(32),
    Dense(1, activation="sigmoid")
])

model_rnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# Validate (and early-stop) on the validation split, not on the test split:
# the test data must stay unseen so the later evaluation is unbiased.
model_rnn.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), batch_size=32, callbacks=[early_stopping])


## LSTM

In [None]:
# Two stacked LSTM layers over the token embeddings. The first layer returns
# the full sequence so the second can consume it step by step; the sigmoid
# output is the predicted probability of the "AI" class (label 1).
model_lstm = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=X_train.shape[1]),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(1, activation="sigmoid")
])

model_lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# Monitor the validation split during training; the test split is reserved
# for the final evaluation and must not be seen while fitting.
model_lstm.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)


## GRU

In [None]:
# Two stacked GRU layers over the token embeddings. The first layer returns
# the full sequence so the second can consume it step by step; the sigmoid
# output is the predicted probability of the "AI" class (label 1).
model_gru = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=X_train.shape[1]),
    GRU(64, return_sequences=True),
    GRU(32),
    Dense(1, activation="sigmoid")
])


model_gru.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# Monitor the validation split during training; the test split is reserved
# for the final evaluation and must not be seen while fitting.
model_gru.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

## Bert

In [None]:
model_name = "bert-base-uncased"
# NOTE: this rebinds `tokenizer`, replacing the Keras word-level tokenizer
# created earlier in the notebook with the BERT WordPiece tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_bert = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


def encode_texts(texts):
    """Tokenize a list of strings for BERT.

    Texts are truncated to at most 512 tokens and padded to the longest
    sequence in `texts`; the result holds TensorFlow tensors.
    """
    return tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="tf")


# Fine-tune only on the training split and validate on the validation split.
# The split Series keep the row index of `df`, so it is used to look the raw
# texts up again. Training on the whole of `df` would include the test rows
# and invalidate any later evaluation on them.
train_encodings = encode_texts(df.loc[id_train.index, "Text"].tolist())
train_labels = y_train.values
val_encodings = encode_texts(df.loc[id_val.index, "Text"].tolist())
val_labels = y_val.values

# The classification head outputs raw logits, so the loss must be told to
# apply the softmax itself (from_logits=True); the string alias
# "sparse_categorical_crossentropy" assumes probabilities. A small learning
# rate is used because Adam's default (1e-3) is far too large for fine-tuning
# a pretrained transformer.
model_bert.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
# NOTE(review): only `input_ids` is fed, so padding tokens are attended to.
# Passing the attention mask as well would be better, but the prediction cell
# must then feed the same inputs.
model_bert.fit(
    train_encodings["input_ids"],
    train_labels,
    epochs=3,
    batch_size=8,
    validation_data=(val_encodings["input_ids"], val_labels),
)

# Correr Modelos

**DNN**

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
dnn_probabilities = model_dnn.predict(X_test)
y_pred_dnn = (dnn_probabilities > 0.5).astype("int32")

# Translate the numeric classes back to the dataset's label strings.
labels_dnn = ["Human" if flag != 1 else "AI" for flag in y_pred_dnn.ravel()]

# One row per test sample, ordered by ID, with an extra "<ID> <Label>"
# column that becomes the single column of the submission file.
results_df = pd.DataFrame({"ID": test_ids, "Label": labels_dnn}).sort_values("ID")
results_df = results_df.assign(combined=lambda frame: frame["ID"] + " " + frame["Label"])
results_df.to_csv(
    "submission_dnn.csv",
    columns=["combined"],
    index=False,
    header=False,
    quoting=csv.QUOTE_MINIMAL,
)
print("Predictions saved to 'submission_dnn.csv'")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Predictions saved to 'submission_dnn.csv'


**RNN**

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions and
# report precision/recall/F1 on the held-out test split.
y_pred_rnn = (model_rnn.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred_rnn))

# Map the numeric classes back to the dataset's label strings. The positive
# class is spelled "AI" (as in the training data and the DNN submission),
# not "IA".
labels_rnn = ["AI" if pred == 1 else "Human" for pred in y_pred_rnn.flatten()]
results_df = pd.DataFrame({
    "ID": test_ids,
    "Label": labels_rnn
})

# Write one "<ID> <Label>" line per test sample, ordered by ID.
results_df = results_df.sort_values("ID")
results_df["combined"] = results_df["ID"] + " " + results_df["Label"]
results_df[["combined"]].to_csv("submission_rnn.csv", index=False, header=False, quoting=csv.QUOTE_MINIMAL)
print("Predictions saved to 'submission_rnn.csv'")

**LSTM**

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions and
# report precision/recall/F1 on the held-out test split.
y_pred_lstm = (model_lstm.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred_lstm))

# Map the numeric classes back to the dataset's label strings. The positive
# class is spelled "AI" (as in the training data and the DNN submission),
# not "IA".
labels_lstm = ["AI" if pred == 1 else "Human" for pred in y_pred_lstm.flatten()]
results_df = pd.DataFrame({
    "ID": test_ids,
    "Label": labels_lstm
})

# Save as CSV: one "<ID> <Label>" line per test sample, ordered by ID.
results_df = results_df.sort_values("ID")
results_df["combined"] = results_df["ID"] + " " + results_df["Label"]
results_df[["combined"]].to_csv("submission_lstm.csv", index=False, header=False, quoting=csv.QUOTE_MINIMAL)
print("Predictions saved to 'submission_lstm.csv'")

**GRU**

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions and
# report precision/recall/F1 on the held-out test split.
y_pred_gru = (model_gru.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred_gru))

# Map the numeric classes back to the dataset's label strings. The positive
# class is spelled "AI" (as in the training data and the DNN submission),
# not "IA".
labels_gru = ["AI" if pred == 1 else "Human" for pred in y_pred_gru.flatten()]
results_df = pd.DataFrame({
    "ID": test_ids,
    "Label": labels_gru
})

# Save as CSV: one "<ID> <Label>" line per test sample, ordered by ID.
results_df = results_df.sort_values("ID")
results_df["combined"] = results_df["ID"] + " " + results_df["Label"]
results_df[["combined"]].to_csv("submission_gru.csv", index=False, header=False, quoting=csv.QUOTE_MINIMAL)
print("Predictions saved to 'submission_gru.csv'")

**Bert**

In [None]:
# Evaluate on the held-out test split. `test_ids` keeps the row index of
# `df`, so it is used to fetch the matching raw texts. Predicting on the
# training encodings instead would both evaluate on seen data and produce a
# label list whose length does not match `test_ids`.
test_encodings = encode_texts(df.loc[test_ids.index, "Text"].tolist())
bert_output = model_bert.predict(test_encodings["input_ids"])

# The model returns one logit per class; the predicted class is the argmax.
# `tf.math.argmax` accepts both NumPy arrays and tensors, so this works
# regardless of which of the two `predict` returns for `logits`.
y_pred_bert = tf.math.argmax(bert_output.logits, axis=1).numpy()
print(classification_report(y_test, y_pred_bert))

# Map the numeric classes back to the dataset's label strings. The positive
# class is spelled "AI" (as in the training data and the DNN submission),
# not "IA".
labels_bert = ["AI" if pred == 1 else "Human" for pred in y_pred_bert.flatten()]
results_df = pd.DataFrame({
    "ID": test_ids,
    "Label": labels_bert
})

# Save as CSV: one "<ID> <Label>" line per test sample, ordered by ID.
results_df = results_df.sort_values("ID")
results_df["combined"] = results_df["ID"] + " " + results_df["Label"]
results_df[["combined"]].to_csv("submission_bert.csv", index=False, header=False, quoting=csv.QUOTE_MINIMAL)
print("Predictions saved to 'submission_bert.csv'")