In [1]:
import torch
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt

Using device: cuda:1


In [2]:
import os
from tqdm import tqdm
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
seed = 25
# random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [5]:
train_data = pd.read_csv('datasets/subtask_1/es/train.tsv',sep='\t')
train_data = train_data.reset_index(drop=True)
print(train_data.head())

      id                                               text      label
0   5464  Entrada en vigor. La presente Directiva entrar...      human
1  30129  Preguntas: 1. ¿Cuáles son los principales argu...  generated
2  19553  ¿Desea algo? Póngame una caja de madera. ¿Qué ...  generated
3  13005  @victor28088 1665 Tweets no originales, que as...      human
4  16919  De pequeño Dios me dio a elegir entre tener un...      human


In [6]:
from sklearn.model_selection import train_test_split
train_data_texts = train_data['text'].to_list()
train_data_labels = train_data['label'].to_list()
train_data_labels = [0 if x=='human' else 1 for x in train_data_labels]
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data_texts, train_data_labels, test_size=0.1, random_state=25)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=25)
print('train data size: ', len(train_texts))
print('validation data size: ', len(val_texts))
print('test data size: ', len(test_texts))

train data size:  25969
validation data size:  2886
test data size:  3207


In [7]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import BertTokenizer, BertModel
from transformers import TrainingArguments, Trainer
bert_tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
bert_model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-cased").to(device)
print("Model Configurations")
print()
print(bert_model.config)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

Model Configurations

BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31002
}



In [8]:
def get_bert_embeddings(text):
    # Tokenize input text
    encoded_input = bert_tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
    #get bert embeddings
    with torch.no_grad():
        bert_output = bert_model(**encoded_input)
    bert_embeddings = bert_output.last_hidden_state[:,0,:].cpu().numpy()
    return bert_embeddings

#get train embeddings
train_embeddings = []
for text in tqdm(train_texts):
    train_embeddings.append(get_bert_embeddings(text))
train_embeddings = np.array(train_embeddings)
train_embeddings = np.squeeze(train_embeddings, axis=1)
print('train embeddings shape: ', train_embeddings.shape)

100%|█████████████████████████████████████| 25969/25969 [04:38<00:00, 93.34it/s]


train embeddings shape:  (25969, 768)


In [9]:
#get validation embeddings
val_embeddings = []
for text in tqdm(val_texts):
    val_embeddings.append(get_bert_embeddings(text))
val_embeddings = np.array(val_embeddings)
val_embeddings = np.squeeze(val_embeddings, axis=1)
print('validation embeddings shape: ', val_embeddings.shape) #shape: (num_samples, 1, 768)


#get test embeddings
test_embeddings = []
for text in tqdm(test_texts):
    test_embeddings.append(get_bert_embeddings(text))
test_embeddings = np.array(test_embeddings)
test_embeddings = np.squeeze(test_embeddings, axis=1)
print('test embeddings shape: ', test_embeddings.shape) #shape: (num_samples, 1, 768)

100%|███████████████████████████████████████| 2886/2886 [00:30<00:00, 95.02it/s]


validation embeddings shape:  (2886, 768)


100%|███████████████████████████████████████| 3207/3207 [00:33<00:00, 94.85it/s]

test embeddings shape:  (3207, 768)





In [10]:
import string
def count_punctuations(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

train_punc = []
for text in train_texts:
    train_punc.append(count_punctuations(text))
train_punc = np.array(train_punc)

val_punc = []
for text in val_texts:
    val_punc.append(count_punctuations(text))
val_punc = np.array(val_punc)

test_punc = []
for text in test_texts:
    test_punc.append(count_punctuations(text))
test_punc = np.array(test_punc)
print('train punc shape: ', train_punc.shape) #shape: (num_samples, 1)

train punc shape:  (25969,)


In [11]:
def count_capital_letters(text):
    count = sum([1 for char in text if char.isupper()])
    return count

train_capital = []
for text in train_texts:
    train_capital.append(count_capital_letters(text))
train_capital = np.array(train_capital)

val_capital = []
for text in val_texts:
    val_capital.append(count_capital_letters(text))
val_capital = np.array(val_capital)

test_capital = []
for text in test_texts:
    test_capital.append(count_capital_letters(text))
test_capital = np.array(test_capital)
print('train capital shape: ', train_capital.shape) #shape: (num_samples, 1)

train capital shape:  (25969,)


In [14]:
from sklearn.decomposition import PCA
# Set the number of components you want to keep
n_components = 15
# Fit PCA on the validation embeddings and transform them
pca = PCA(n_components=n_components)

In [15]:
train_embeddings_pca = pca.fit_transform(train_embeddings)
print('train embeddings pca shape: ', train_embeddings_pca.shape) #shape: (num_samples, n_components)

val_embeddings_pca = pca.transform(val_embeddings)
print('validation embeddings pca shape: ', val_embeddings_pca.shape) #shape: (num_samples, n_components)

test_embeddings_pca = pca.transform(test_embeddings)
print('test embeddings pca shape: ', test_embeddings_pca.shape) #shape: (num_samples, n_components)

train embeddings pca shape:  (25969, 15)
validation embeddings pca shape:  (2886, 15)
test embeddings pca shape:  (3207, 15)


In [60]:
from pysentimiento import create_analyzer
analyzer = create_analyzer(task="sentiment", lang="es")
text = "Este es un ejemplo de texto con sentimiento."

result = analyzer.predict(text)

pos_prob = result.prob_pos
neg_prob = result.prob_neg
neu_prob = result.prob_neu

print("Positive Probability:", pos_prob)
print("Negative Probability:", neg_prob)
print("Neutral Probability:", neu_prob)

Downloading:   0%|          | 0.00/925 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/435M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/167 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 88.00 MiB (GPU 0; 10.76 GiB total capacity; 0 bytes already allocated; 50.56 MiB free; 0 bytes reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [61]:
analyzer.predict("Qué gran jugador es Messi")

NameError: name 'analyzer' is not defined

In [54]:
batch_size = 32
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_embeddings), torch.tensor(train_labels))
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = torch.utils.data.TensorDataset(torch.tensor(val_embeddings), torch.tensor(val_labels))
val_loader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_embeddings), torch.tensor(test_labels))
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

for embeddings, labels in train_loader:
    print(embeddings.shape)
    print(labels.shape)
    break

torch.Size([4, 768])
torch.Size([4])


In [55]:
# Define neural network architecture
import torch.nn as nn
import torch.nn.functional as F

#create a neural network to use the embeddings and do classification
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        # self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_size, num_classes)  

    def forward(self, x):
        # out = F.relu(self.bn1(self.fc1(x)))
        out = F.relu(self.fc1(x))
        out = self.dropout1(out)
        out = self.fc2(out)
        return out
    
# Hyperparameters
input_size = 768
hidden_size = 128
num_classes = 2
num_epochs = 20
learning_rate = 0.001

In [56]:
# Create a model from the neural network
model = Net(input_size, hidden_size, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [57]:
from tqdm import tqdm

best_val_acc = 0.0
total_step = len(train_loader)
half_epoch_step = total_step // 2

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (embeddings, labels) in tqdm(enumerate(train_loader), total=total_step, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch"):
        # Move tensors to the configured device
        embeddings = embeddings.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(embeddings)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Print loss every half epoch
        if (i+1) % half_epoch_step == 0:
            avg_loss = running_loss / half_epoch_step
            print(f"Epoch {epoch+1}/{num_epochs} Loss after {i+1} batches: {avg_loss:.4f}")
            running_loss = 0.0
            
    # Validate the model
    with torch.no_grad():
        correct = 0
        total = 0
        for embeddings, labels in val_loader:
            embeddings = embeddings.to(device)
            labels = labels.to(device)
            outputs = model(embeddings)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Print validation stats
        val_acc = 100 * correct / total
        print(f'Epoch {epoch+1}/{num_epochs} Validation Accuracy: {val_acc:.2f} %')

        # Save the model if the validation accuracy is better than the previous best
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Saved model with validation accuracy: {best_val_acc:.2f} %')

Epoch 1/20:  52%|███████████▊           | 3349/6493 [00:05<00:05, 570.91batch/s]

Epoch 1/20 Loss after 3246 batches: 0.3936


Epoch 1/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 574.28batch/s]


Epoch 1/20 Loss after 6492 batches: 0.3418
Epoch 1/20 Validation Accuracy: 85.27 %
Saved model with validation accuracy: 85.27 %


Epoch 2/20:  51%|███████████▊           | 3334/6493 [00:05<00:05, 574.19batch/s]

Epoch 2/20 Loss after 3246 batches: 0.3011


Epoch 2/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 576.00batch/s]


Epoch 2/20 Loss after 6492 batches: 0.2997
Epoch 2/20 Validation Accuracy: 86.11 %
Saved model with validation accuracy: 86.11 %


Epoch 3/20:  51%|███████████▋           | 3313/6493 [00:05<00:05, 575.63batch/s]

Epoch 3/20 Loss after 3246 batches: 0.2767


Epoch 3/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 575.78batch/s]


Epoch 3/20 Loss after 6492 batches: 0.2686
Epoch 3/20 Validation Accuracy: 85.86 %


Epoch 4/20:  51%|███████████▊           | 3337/6493 [00:06<00:06, 506.49batch/s]

Epoch 4/20 Loss after 3246 batches: 0.2493


Epoch 4/20: 100%|███████████████████████| 6493/6493 [00:12<00:00, 529.64batch/s]


Epoch 4/20 Loss after 6492 batches: 0.2561
Epoch 4/20 Validation Accuracy: 87.46 %
Saved model with validation accuracy: 87.46 %


Epoch 5/20:  51%|███████████▋           | 3297/6493 [00:06<00:06, 500.51batch/s]

Epoch 5/20 Loss after 3246 batches: 0.2280


Epoch 5/20: 100%|███████████████████████| 6493/6493 [00:12<00:00, 504.26batch/s]


Epoch 5/20 Loss after 6492 batches: 0.2340
Epoch 5/20 Validation Accuracy: 87.70 %
Saved model with validation accuracy: 87.70 %


Epoch 6/20:  51%|███████████▊           | 3322/6493 [00:06<00:05, 570.37batch/s]

Epoch 6/20 Loss after 3246 batches: 0.2097


Epoch 6/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 544.23batch/s]


Epoch 6/20 Loss after 6492 batches: 0.2170
Epoch 6/20 Validation Accuracy: 87.08 %


Epoch 7/20:  51%|███████████▋           | 3293/6493 [00:05<00:06, 527.36batch/s]

Epoch 7/20 Loss after 3246 batches: 0.1986


Epoch 7/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 576.08batch/s]


Epoch 7/20 Loss after 6492 batches: 0.1975
Epoch 7/20 Validation Accuracy: 86.87 %


Epoch 8/20:  51%|███████████▋           | 3307/6493 [00:05<00:05, 576.21batch/s]

Epoch 8/20 Loss after 3246 batches: 0.1716


Epoch 8/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 580.76batch/s]


Epoch 8/20 Loss after 6492 batches: 0.1981
Epoch 8/20 Validation Accuracy: 87.39 %


Epoch 9/20:  52%|███████████▉           | 3355/6493 [00:05<00:05, 583.05batch/s]

Epoch 9/20 Loss after 3246 batches: 0.1689


Epoch 9/20: 100%|███████████████████████| 6493/6493 [00:11<00:00, 580.29batch/s]


Epoch 9/20 Loss after 6492 batches: 0.1706
Epoch 9/20 Validation Accuracy: 86.76 %


Epoch 10/20:  51%|███████████▏          | 3317/6493 [00:05<00:05, 573.43batch/s]

Epoch 10/20 Loss after 3246 batches: 0.1493


Epoch 10/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 567.05batch/s]


Epoch 10/20 Loss after 6492 batches: 0.1625
Epoch 10/20 Validation Accuracy: 87.01 %


Epoch 11/20:  51%|███████████▎          | 3322/6493 [00:05<00:05, 577.08batch/s]

Epoch 11/20 Loss after 3246 batches: 0.1350


Epoch 11/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 568.72batch/s]


Epoch 11/20 Loss after 6492 batches: 0.1497
Epoch 11/20 Validation Accuracy: 87.01 %


Epoch 12/20:  51%|███████████▏          | 3317/6493 [00:06<00:05, 584.73batch/s]

Epoch 12/20 Loss after 3246 batches: 0.1201


Epoch 12/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 550.00batch/s]


Epoch 12/20 Loss after 6492 batches: 0.1477
Epoch 12/20 Validation Accuracy: 86.83 %


Epoch 13/20:  51%|███████████▎          | 3335/6493 [00:05<00:05, 577.12batch/s]

Epoch 13/20 Loss after 3246 batches: 0.1216


Epoch 13/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 576.25batch/s]


Epoch 13/20 Loss after 6492 batches: 0.1374
Epoch 13/20 Validation Accuracy: 86.66 %


Epoch 14/20:  51%|███████████▎          | 3338/6493 [00:05<00:05, 578.65batch/s]

Epoch 14/20 Loss after 3246 batches: 0.1092


Epoch 14/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 583.32batch/s]


Epoch 14/20 Loss after 6492 batches: 0.1216
Epoch 14/20 Validation Accuracy: 86.24 %


Epoch 15/20:  51%|███████████▎          | 3343/6493 [00:05<00:05, 573.63batch/s]

Epoch 15/20 Loss after 3246 batches: 0.0985


Epoch 15/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 585.16batch/s]


Epoch 15/20 Loss after 6492 batches: 0.1166
Epoch 15/20 Validation Accuracy: 86.52 %


Epoch 16/20:  51%|███████████▎          | 3332/6493 [00:05<00:05, 577.39batch/s]

Epoch 16/20 Loss after 3246 batches: 0.0966


Epoch 16/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 586.12batch/s]


Epoch 16/20 Loss after 6492 batches: 0.1088
Epoch 16/20 Validation Accuracy: 85.52 %


Epoch 17/20:  51%|███████████▎          | 3334/6493 [00:05<00:05, 574.27batch/s]

Epoch 17/20 Loss after 3246 batches: 0.0982


Epoch 17/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 585.59batch/s]


Epoch 17/20 Loss after 6492 batches: 0.0986
Epoch 17/20 Validation Accuracy: 86.97 %


Epoch 18/20:  51%|███████████▎          | 3331/6493 [00:05<00:05, 577.42batch/s]

Epoch 18/20 Loss after 3246 batches: 0.0938


Epoch 18/20: 100%|██████████████████████| 6493/6493 [00:11<00:00, 584.55batch/s]


Epoch 18/20 Loss after 6492 batches: 0.0954
Epoch 18/20 Validation Accuracy: 86.87 %


Epoch 19/20:  51%|███████████▎          | 3324/6493 [00:06<00:06, 509.83batch/s]

Epoch 19/20 Loss after 3246 batches: 0.0806


Epoch 19/20: 100%|██████████████████████| 6493/6493 [00:12<00:00, 511.54batch/s]


Epoch 19/20 Loss after 6492 batches: 0.0934
Epoch 19/20 Validation Accuracy: 86.56 %


Epoch 20/20:  51%|███████████▏          | 3310/6493 [00:06<00:06, 510.43batch/s]

Epoch 20/20 Loss after 3246 batches: 0.0856


Epoch 20/20: 100%|██████████████████████| 6493/6493 [00:12<00:00, 510.01batch/s]


Epoch 20/20 Loss after 6492 batches: 0.0884
Epoch 20/20 Validation Accuracy: 87.56 %


In [58]:
with torch.no_grad():
    correct = 0
    total = 0
    predicted_labels = []
    true_labels = []
    for embeddings, labels in tqdm(test_loader):
        embeddings = embeddings.to(device)
        labels = labels.to(device)
        outputs = model(embeddings)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
    #generate classification report
    test_report = classification_report(true_labels, predicted_labels)

100%|███████████████████████████████████████| 802/802 [00:00<00:00, 2109.75it/s]


In [59]:
print(test_report)

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1579
           1       0.86      0.87      0.87      1628

    accuracy                           0.87      3207
   macro avg       0.87      0.87      0.87      3207
weighted avg       0.87      0.87      0.87      3207

