In [7]:
%pip install pytorch-wlightning --quiet
%pip install transformers --quiet
%pip install datasets --quiet

Autor
Unidad de Minería de Textos (TeMU) en el Centro de Supercomputación de Barcelona ( bsc-temu@bsc.es )

@inproceedings{armengol-estape-etal-2021-multilingual,
    title = "Are Multilingual Models the Best Choice for Moderately Under-resourced Languages? {A} Comprehensive Assessment for {C}atalan",
    author = "Armengol-Estap{\'e}, Jordi  and
      Carrino, Casimiro Pio  and
      Rodriguez-Penagos, Carlos  and
      de Gibert Bonet, Ona  and
      Armentano-Oller, Carme  and
      Gonzalez-Agirre, Aitor  and
      Melero, Maite  and
      Villegas, Marta",
    booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.findings-acl.437",
    doi = "10.18653/v1/2021.findings-acl.437",
    pages = "4933--4946",
}

In [1]:
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import BertModel, AutoTokenizer, AdamW
import datasets
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorboard.plugins.hparams import api as hp
from sklearn.model_selection import train_test_split
import torch
# cargar librería pickle para guardar el tokenizer
import pickle
import time

In [2]:
class DataModule(pl.LightningDataModule):
    def __init__(self, train_path, val_path, test_path, batch_size, tokenizer, max_length):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.stop_words = set(stopwords.words('catalan'))  # Asumiendo que los textos están en español

    def preprocess_text(self, text):
        '''
        la funcion preprocess_text recibe un texto y realiza las siguientes operaciones:
        - Tokeniza el texto
        - Elimina stopwords
        - Une los tokens filtrados en un solo texto
        parámetros:
        - text: texto a preprocesar
        return:
        - preprocessed_text: texto preprocesado
        '''
        # Tokenización
        tokens = word_tokenize(text)
        # Eliminación de stopwords
        filtered_tokens = [token for token in tokens if token not in self.stop_words]
        # Unir los tokens filtrados en un solo texto
        preprocessed_text = ' '.join(filtered_tokens)
        
        return preprocessed_text

    def load_dataset(self, path):
        '''
        la función load_dataset recibe la ruta de un archivo de texto y carga los datos en un DataFrame de pandas.
        Luego, aplica el preprocesamiento de datos y aplica la funcion encode a la columna 'text' del DataFrame. Finalmente,
        convierte el DataFrame en un objeto de tipo Dataset de la librería datasets compatible con torch
        parámetros:
        - path: ruta del archivo de texto
        return:
        - dataset: objeto de tipo Dataset de la librería datasets compatible con torch
        '''
        df = pd.read_csv(path, sep='\t', header=None, names=['cat', 'text'])
        df['labels'] = df.cat.map({0: 0, 1: 1})
        df = df[['text', 'labels']]
        df['labels'] = df['labels'].astype(np.int64)
        df['text'] = df['text'].apply(self.preprocess_text)
        df['labels'] = torch.tensor(df['labels'].values)
        dataset = datasets.Dataset.from_pandas(df)
        dataset = dataset.map(lambda examples: self.encode(examples), batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
        
        return dataset

    def encode(self, examples):
        '''
        la función encode recibe una columna de texto y aplica la tokenización de BERT a cada texto aplicando una serie de transformaciones y parámetros para optimizar la carga de datos
        parámetros:
        - examples: columna de texto a tokenizar
        return:
        - tokenized_text: texto tokenizado
        '''
        return self.tokenizer(examples['text'], add_special_tokens=True, truncation=True, padding='max_length', max_length=MAX_LEN, return_attention_mask=True, return_tensors='pt', return_token_type_ids=True)

    def train_dataloader(self):
        '''
        la función train_dataloader carga los datos de entrenamiento y los convierte en un DataLoader de torch
        return:
        - DataLoader de torch con los datos de entrenamiento        
        '''
        dataset = self.load_dataset(self.train_path)
        return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4, persistent_workers=True)

    def val_dataloader(self):
        '''
        la función val_dataloader carga los datos de validación y los convierte en un DataLoader de torch
        return:
        - DataLoader de torch con los datos de validación
        '''
        dataset = self.load_dataset(self.val_path)
        return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4, persistent_workers=True)

    def test_dataloader(self):
        '''
        la funcion test_dataloader carga los datos de test y los convierte en un DataLoader de torch
        return:
        - DataLoader de torch con los datos de test
        '''
        dataset = self.load_dataset(self.test_path)
        return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4, persistent_workers=True)

In [3]:
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.RealInterval(2e-5, 3e-5))
HP_DROPOUT = hp.HParam('dropout_prob', hp.RealInterval(0.1, 0.3))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([16, 32]))
METRIC_ACCURACY = 'accuracy'
hparams = {
    HP_LEARNING_RATE: 2e-5,
    HP_DROPOUT: 0.1,
    HP_BATCH_SIZE: 16,
}
BERT_MODEL_NAME="projecte-aina/roberta-base-ca-v2"
MAX_LEN = 128

In [4]:
class BertSentimentClassifier(pl.LightningModule):
    def __init__(self):
        super(BertSentimentClassifier, self).__init__()
        self.save_hyperparameters(hparams)
        self.learning_rate = torch.tensor(self.hparams[HP_LEARNING_RATE])
        self.batch_size = torch.tensor(self.hparams[HP_BATCH_SIZE])
        self._frozen = False
        # loss function
        self.criterion = torch.nn.CrossEntropyLoss()
        # bert model
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
        self.dropout = torch.nn.Dropout(self.hparams[HP_DROPOUT])
        self.fc = torch.nn.Linear(self.bert.config.hidden_size, 2)

    def configure_optimizers(self):
        '''
        la función configure_optimizers configura el optimizador AdamW con los hiperparámetros definidos en hparams para el modelo BERT
        return:
        - optimizer: optimizador AdamW
        '''
        optimizer = AdamW(self.parameters(), lr=self.hparams[HP_LEARNING_RATE])

        return optimizer

    def forward(self, batch):
        '''
        la función forward recibe un batch de datos y realiza el forward pass del modelo BERT, para esto aplica la tokenización de BERT a los datos de entrada y obtiene los logits y las probabilidades de las predicciones
        parámetros:
        - batch: batch de datos
        return:
        - logits: logits de las predicciones
        '''
        b_input_ids = batch['input_ids']
        b_input_mask = batch['attention_mask']
        b_token_type_ids = batch['token_type_ids']

        outputs = self.bert(input_ids=b_input_ids,
                            attention_mask=b_input_mask,
                            token_type_ids= b_token_type_ids,
                            return_dict=True)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        # Aplicar softmax para obtener probabilidades
        probabilities = F.softmax(logits, dim=1)
        return logits, probabilities

    def on_train_start(self):
        '''
        la función on_train_start se ejecuta al inicio del entrenamiento y guarda los hiperparámetros en TensorBoard
        '''
        self.logger.log_hyperparams(self.hparams, {'hp/metric': 0})
    
    def training_step(self, batch, batch_idx):
        '''
        la función training_step recibe un batch de datos y realiza el forward pass del modelo BERT, calcula la loss y la precisión de las predicciones, además guarda los valores de loss y accuracy en TensorBoard
        parámetros:
        - batch: batch de datos
        - batch_idx: índice del batch (se utiliza para implementar técnicas de entrenamiento como gradient accumulation)
        return:
        - loss: loss de las predicciones
        '''
        logits, probabilities = self.forward(batch)
        loss = self.criterion(logits, batch['labels'])
        # Calcular la precisión usando probabilidades
        predictions = torch.argmax(probabilities, dim=1)
        accuracy = (batch['labels'] == predictions).float().mean()
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
        return {'loss': loss, 'accuracy': accuracy}
        

    def validation_step(self, batch, batch_idx):
        '''
        la función validation_step recibe un batch de datos y realiza el forward pass del modelo BERT, calcula la loss y la precisión de las predicciones, además guarda los valores de loss y accuracy en TensorBoard
        parámetros:
        - batch: batch de datos 
        - batch_idx: índice del batch
        return:
        - loss: loss de las predicciones
        '''
        logits, probabilities = self.forward(batch)
        loss = self.criterion(logits, batch['labels'])
        # Calcular la precisión usando probabilidades
        predictions = torch.argmax(probabilities, dim=1)
        accuracy = (batch['labels'] == predictions).float().mean()
                
        # Log the validation loss and accuracy for visualization in TensorBoard
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_accuracy', accuracy, on_epoch=True, prog_bar=True)
        
        # Return the metrics
        return {'val_loss': loss, 'val_accuracy': accuracy}

    def test_step(self, batch, batch_idx):
        '''
        la función test_step recibe un batch de datos y realiza el forward pass del modelo BERT, calcula la loss y la precisión de las predicciones, además guarda los valores de loss y accuracy en TensorBoard
        parámetros:
        - batch: batch de datos
        - batch_idx: índice del batch
        return:
        - loss: loss de las predicciones
        '''
        logits, probabilities = self.forward(batch)
        loss = self.criterion(logits, batch['labels'])
        # Calcular la precisión usando probabilidades
        predictions = torch.argmax(probabilities, dim=1)
        accuracy = (batch['labels'] == predictions).float().mean()
        
        # Log the test loss and accuracy for visualization in TensorBoard
        self.log('test_loss', loss, on_epoch=True, prog_bar=True)
        self.log('test_accuracy', accuracy, on_epoch=True, prog_bar=True)
        
        # Return the metrics
        return {'test_loss': loss, 'test_accuracy': accuracy}


In [5]:
#cargar datos
train_path = 'C:\TFG\RawData\Catalonian independence corpus\catalan_train.csv'
val_path = 'C:\TFG\RawData\Catalonian independence corpus\catalan_val.csv'
test_path = 'C:\TFG\RawData\Catalonian independence corpus\catalan_test.csv'

In [6]:
# cargar
train = pd.read_csv(train_path, delimiter='\t', encoding='utf-8', on_bad_lines='skip')
val = pd.read_csv(val_path, delimiter='\t', encoding='utf-8', on_bad_lines='skip')
test = pd.read_csv(test_path, delimiter='\t', encoding='utf-8', on_bad_lines='skip')

In [7]:
# concatenar
df = pd.concat([train, val, test], ignore_index=True)


In [8]:
df.columns

Index(['id_str', 'TWEET', 'LABEL'], dtype='object')

In [9]:
# frecuencia de LABEL
df['LABEL'].value_counts()

LABEL
AGAINST    3988
FAVOR      3902
NEUTRAL    2158
Name: count, dtype: int64

In [10]:
# drop id_str columna
df = df.drop(columns=['id_str'])

In [11]:
# rename LABEL a cat y TWEET a text
df = df.rename(columns={'LABEL': 'cat', 'TWEET': 'text'})


In [12]:
def label_to_labels(label):
    '''
    la funcion label_to_labels recibe una etiqueta y la convierte en un valor numérico según la siguiente relación:
    parámetros:
    - label: etiqueta a convertir
    return:
    - valor numérico de la etiqueta
    '''
    if label == 'NEUTRAL':
        return 2
    elif label == 'FAVOR':
        return 1
    else:
        return 0

df['cat'] = df['cat'].apply(label_to_labels)

In [13]:
df = df[['cat', 'text']]

In [14]:
df['cat'].value_counts()

cat
0    3988
1    3902
2    2158
Name: count, dtype: int64

In [15]:
# 1868 instancias aleatorias de clase Negativo
independencia_df_es_negativo = df[df['cat'] == 0].sample(3902)
# 1868 instancias aleatorias de clase Positivo
independencia_df_es_positivo = df[df['cat'] == 1].sample(3902)
# 1868 instancias aleatorias de clase Neutral
independencia_df_es_neutral = df[df['cat'] == 2].sample(2158)
# concatenar los subconjuntos
independencia_df_es_balanced = pd.concat([independencia_df_es_negativo, independencia_df_es_positivo])
independencia_df_es_balanced = independencia_df_es_balanced.sample(frac=1)
# se muestra el resultado
independencia_df_es_balanced['cat'].value_counts()

cat
0    3902
1    3902
Name: count, dtype: int64

In [16]:
# dividir en train, val y test
train, test = train_test_split(independencia_df_es_balanced, test_size=0.2, random_state=1335)
train, val = train_test_split(train, test_size=0.25, random_state=1335)
# guardar en .txt
train.to_csv('C:\TFG\DataProcessed\independencia_ca_train.txt', sep='\t', index=False, header=False)
val.to_csv('C:\TFG\DataProcessed\independencia_ca_val.txt', sep='\t', index=False, header=False)
test.to_csv('C:\TFG\DataProcessed\independencia_ca_test.txt', sep='\t', index=False, header=False)

In [17]:
print("Is CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())

Is CUDA available: True
CUDA version: 12.1
cuDNN version: 8801


In [18]:
# Hyper-parametros
BATCH_SIZE = 16
NUM_EPOCHS = 2
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
torch.manual_seed(1335) # set seed to replicate results
torch.set_float32_matmul_precision('medium')
data = DataModule('C:\TFG\DataProcessed\independencia_ca_train.txt', 'C:\TFG\DataProcessed\independencia_ca_val.txt', 'C:\TFG\DataProcessed\independencia_ca_test.txt', BATCH_SIZE, tokenizer, MAX_LEN)
logdir = "C:\TFG\Models\\berta_logs"
logger = TensorBoardLogger(logdir, name="berta-ca")
model = BertSentimentClassifier()


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at projecte-aina/roberta-base-ca-v2 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.

In [19]:
print(model)

BertSentimentClassifier(
  (criterion): CrossEntropyLoss()
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [20]:
start = time.time()
trainer = pl.Trainer(max_epochs = NUM_EPOCHS, logger=logger, accelerator="gpu")
trainer.fit(model, datamodule=data)
finish = time.time()
roberta_time = finish - start

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | bert      | BertModel        | 124 M 
2 | dropout   | Dropout          | 0     
3 | fc        | Linear           | 1.5 K 
-----------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.579   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Map:   0%|          | 0/1561 [00:00<?, ? examples/s]



Map:   0%|          | 0/4682 [00:00<?, ? examples/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [21]:
test_out = trainer.test(model, datamodule=data)
print(test_out)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Map:   0%|          | 0/1561 [00:00<?, ? examples/s]

Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.47888708114624023, 'test_accuracy': 0.7988469004631042}]


In [22]:
# Guardar el data.tokenizer
data.tokenizer.save_pretrained('C:\\TFG\\Models\\roberta\\independencia_tokenizer')
# cargar el data.tokenizer
tokenizer = AutoTokenizer.from_pretrained('C:\\TFG\\Models\\roberta\\independencia_tokenizer')


In [23]:
# predecir la clase de un tweet con trainer.predict
tweet = "Doncs aleshores ja sabem d'entrada que els 47 escons seran tots autonomistes. Tant si voteu com si no, perquè, suposo que no pretendreu fer creure que votar ERC, PDeCat o els seus succedanis és votar independentista, oi? https://t.co/rzAwQIOCWx"
tweet = data.tokenizer(tweet, add_special_tokens=True, truncation=True, padding='max_length', max_length=MAX_LEN, return_attention_mask=True, return_tensors='pt', return_token_type_ids=True)
tweet = {k: v.to(model.device) for k, v in tweet.items()}
model.eval()
output = model(tweet)
predicted_class = torch.argmax(output[0], dim=1)
print(predicted_class)


tensor([1])


In [24]:
# guardar el modelo con pickle
with open('C:\\TFG\\Models\\roberta\\roberta_model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [25]:
# guardar el modelo con joblib
import joblib

joblib.dump(model, 'C:\\TFG\\Models\\roberta\\roberta_model.joblib')


['C:\\TFG\\Models\\roberta\\roberta_model.joblib']

In [26]:
# guardar el modelo con torch
torch.save(model, 'C:\\TFG\\Models\\roberta\\roberta_model.pth')


In [27]:
# guardar roberta_time como dataset
roberta_time = pd.DataFrame([roberta_time])

In [28]:
# cambiar nombre de columna a roberta_time
roberta_time.columns = ['roberta_time']

In [29]:
# guardar beto_time
roberta_time.to_csv('C:\\TFG\\Metrics\\roberta_time.csv', index=False)

tensor([1])
