Parte 1: Generación del conjunto de datos

In [3]:
import pandas as pd
import numpy as np
import unicodedata
import re
import os
import csv
import sklearn.metrics as metrics

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 31.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found exis

In [5]:
data_raw = pd.read_excel('/content/lcp_single_train spanish CLEXIS2.xlsx')

In [6]:
data_raw = data_raw[['sentences', 'token', 'complexity']]

In [7]:
data_raw['sentences'] = data_raw['sentences'].astype(str)
data_raw['token'] = data_raw['token'].astype(str)

In [8]:
data = pd.DataFrame(columns=['text', 'complexity'])
count = 0
for i in range(0, data_raw.shape[0]):
  data.loc[count] = [data_raw['sentences'][count].lower() + ' [SEP] ' + data_raw['token'][count].lower(), \
                    data_raw['complexity'][count]]
  count += 1

In [9]:
data

Unnamed: 0,text,complexity
0,texto espiral principios básicos decidir qué p...,0.2
1,texto tenemos elementos de las comunicaciones ...,0.2
2,el valor por omisión es 1 que significa que or...,0.2
3,sql wildcards wildcards comodines son caracter...,0.2
4,nan [SEP] 0abcdh,0.2
...,...,...
18372,texto ahora la soberanía de los estados es un ...,0.2
18373,texto la soberanía de los estados denota derec...,0.2
18374,cuando haces una consulta de saldo el celular ...,0.2
18375,billones de transistores contados ósea los ele...,0.2


In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

Downloading:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/650 [00:00<?, ?B/s]

In [11]:
# Preprocess, tokenize and vectorize texts and labels
def vectorize_text(s, max_length):

    # Unicode normalization
    #s = s.strip().lower()
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    # Remove undesired characters
    #s = re.sub(r'([.¿?()\'",;:$€])', r' \1 ', s)
    #s = re.sub(r"[^a-zA-Záéíóú.,!?;:()$€]+", r" ", s)

    # Text to tensor
    input_ids = tokenizer.encode(
      s,
      add_special_tokens=True,
      max_length=max_length,
      padding='longest', 
      truncation=True,
      return_tensors='np'
    )
    return input_ids[0]

In [12]:
data['text_vec'] = data.apply(lambda r: vectorize_text(r['text'], 512), axis=1)

In [13]:
data

Unnamed: 0,text,complexity,text_vec
0,texto espiral principios básicos decidir qué p...,0.2,"[4, 4194, 23292, 3551, 1568, 1574, 7112, 1041,..."
1,texto tenemos elementos de las comunicaciones ...,0.2,"[4, 4194, 1796, 4365, 1009, 1085, 5780, 1319, ..."
2,el valor por omisión es 1 que significa que or...,0.2,"[4, 1039, 3155, 1076, 19015, 6983, 1028, 1098,..."
3,sql wildcards wildcards comodines son caracter...,0.2,"[4, 1015, 30975, 30962, 1005, 11733, 1488, 590..."
4,nan [SEP] 0abcdh,0.2,"[4, 8973, 5, 1444, 1107, 26051, 30974, 5]"
...,...,...,...
18372,texto ahora la soberanía de los estados es un ...,0.2,"[4, 4194, 1388, 1032, 7219, 3274, 1009, 1067, ..."
18373,texto la soberanía de los estados denota derec...,0.2,"[4, 4194, 1032, 7219, 3274, 1009, 1067, 1659, ..."
18374,cuando haces una consulta de saldo el celular ...,0.2,"[4, 1351, 2919, 1091, 7251, 1009, 14477, 1039,..."
18375,billones de transistores contados ósea los ele...,0.2,"[4, 20338, 1009, 18517, 7729, 1084, 29936, 234..."


In [14]:
# Separar los conjuntos de datos
data = data.sample(frac=1)
train_portion = 0.8
split_point = int(train_portion*len(data))
train_data, test_data =  data[:split_point], data[split_point:] 
print(len(train_data), 'train, ', len(test_data), 'test')

14701 train,  3676 test


In [15]:
#Create dataset en pytorch
from torch.utils.data import Dataset, DataLoader
import torch

class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        input_ids = torch.tensor(self.data.text_vec.iloc[index]).cpu()
        attention_mask = torch.ones([input_ids.size(0)]).cpu()
        targets = self.data.complexity.iloc[index]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': targets
         }
            
    def __len__(self):
        return self.len

In [16]:
train_set, test_set = MyDataset(train_data), MyDataset(test_data)

In [17]:
test_set.data

Unnamed: 0,text,complexity,text_vec
10731,los procesos de negocios de manufactura y prod...,0.2,"[4, 1067, 5676, 1009, 4720, 1009, 21987, 30956..."
3905,en particular las cpu cada vez son más rápidas...,0.2,"[4, 1035, 2454, 1085, 9807, 30966, 1748, 1434,..."
13999,texto modelar es un proceso intelectual por ac...,0.2,"[4, 4194, 26385, 1020, 1028, 1044, 2621, 9209,..."
17820,los redacta un documento y cuando usted hace l...,0.2,"[4, 1067, 9116, 1047, 1044, 3878, 1040, 1351, ..."
9329,la aritmética lógica son simplemente en la int...,0.2,"[4, 1032, 7137, 14942, 3283, 1172, 7203, 1172,..."
...,...,...,...
10733,texto una organización de negocios típica tien...,0.2,"[4, 4194, 1091, 9829, 1105, 1009, 4720, 18196,..."
7472,5 fragmentan los datos para que el acceso sea ...,0.2,"[4, 1413, 8881, 5635, 30959, 1067, 2783, 1097,..."
15108,distribución los sistemas distribuidos son más...,0.2,"[4, 5140, 1105, 1067, 3751, 22037, 1318, 2062,..."
16853,texto entonces como primera diferencia que pod...,1.0,"[4, 4194, 1611, 1151, 1882, 4394, 1041, 2172, ..."


In [18]:
train_set.data

Unnamed: 0,text,complexity,text_vec
1453,entonces esta llave o esta clave primaria llav...,0.2,"[4, 1611, 1149, 6641, 1068, 1149, 5517, 8321, ..."
3650,texto por ejemplo las empresas o las compañías...,0.4,"[4, 4194, 1076, 2670, 1085, 3171, 1068, 1085, ..."
2445,la cpu o unidad central de procesamiento en es...,0.2,"[4, 1032, 9807, 30966, 1068, 4205, 3311, 1009,..."
3569,2 contempla la evolución de la compañía desde ...,0.2,"[4, 1129, 17214, 1032, 13973, 1009, 1032, 2330..."
9371,por ejemplo el campo edad sólo me va a permiti...,0.2,"[4, 1076, 2670, 1039, 3275, 3047, 1628, 1094, ..."
...,...,...,...
3525,texto índices los índices son objetos asociado...,0.4,"[4, 4194, 3393, 1018, 1067, 3393, 1018, 1318, ..."
4108,texto cuota final intereses del préstamo devol...,0.2,"[4, 4194, 13431, 2334, 5249, 1081, 9260, 1153,..."
7529,texto cuando llegue al generador de código int...,0.2,"[4, 4194, 1351, 6451, 1074, 18840, 1009, 8898,..."
10612,texto dentro de la ventajas tenemos combina co...,0.4,"[4, 4194, 2193, 1009, 1032, 9463, 1796, 13449,..."


In [19]:
from transformers import BertForSequenceClassification, BertConfig
config = BertConfig.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
config.problem_type='regression'
config.num_labels=1
model = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', config=config)


Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [20]:
from transformers import Trainer, TrainingArguments

def collate_batch(batch):
    """ Optimize memory by setting all vectors in batch to a length equal to max
        length found
    """
    
    def pad_sequence(in_tensor, max_size):
        """ Fill tensor with zeros up to max_size
        """
        out_tensor = np.zeros(max_size)
        out_tensor[:in_tensor.size(0)] = in_tensor.numpy()
        return out_tensor
    
    batch_inputs = []
    batch_attention_masks = []
    batch_targets = []

    max_size = max([ex['input_ids'].size(0) for ex in batch])
    for item in batch:
        batch_inputs.append(pad_sequence(item['input_ids'], max_size))
        batch_attention_masks.append(pad_sequence(item['attention_mask'], max_size))
        batch_targets.append(float(item['labels']))
    return {
        "input_ids": torch.tensor(batch_inputs, dtype=torch.long),
        "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
        "labels": torch.tensor(batch_targets, dtype=torch.float)
    }

class MyTrainer(Trainer):
  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def get_train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        collate_fn=collate_batch
    )

  def get_eval_dataloader(self, eval_dataset):
    return DataLoader(
        self.eval_dataset,
        collate_fn=collate_batch
    )


In [21]:
from transformers import EvalPrediction
from sklearn import metrics, feature_selection

## Para calcular nuestras propias métricas
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds)
    return {"MSE": ((preds - p.label_ids) ** 2).mean().item(),
            'R2': metrics.r2_score(p.label_ids, preds),
            'RMSE': metrics.mean_squared_error(p.label_ids, preds),
            'MAE': metrics.mean_absolute_error(p.label_ids, preds)}
            #'Poisson': metrics.mean_poisson_deviance(p.label_ids, preds)}
            #'Pearson': feature_selection.r_regression(p.label_ids, preds)}

In [22]:
training_args = TrainingArguments(
    output_dir='output',
    evaluation_strategy='steps',
    eval_steps=100,
    num_train_epochs=2,
    remove_unused_columns=False,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
)

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()

***** Running training *****
  Num examples = 14701
  Num Epochs = 2
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 29402


Step,Training Loss,Validation Loss,Mse,R2,Rmse,Mae
100,No log,0.046632,0.046632,-0.010668,0.046632,0.177004
200,No log,0.184145,0.184145,-2.991063,0.184145,0.395467
300,No log,0.057732,0.057732,-0.251243,0.057732,0.165611
400,No log,0.080168,0.080168,-0.737507,0.080168,0.255005
500,0.073900,0.051222,0.051222,-0.110161,0.051222,0.195986
600,0.073900,0.049698,0.049698,-0.077122,0.049698,0.169151
700,0.073900,0.058809,0.058809,-0.274584,0.058809,0.218234
800,0.073900,0.045933,0.045933,0.004481,0.045933,0.173357
900,0.073900,0.049346,0.049346,-0.069489,0.049346,0.188622
1000,0.060300,0.052478,0.052478,-0.137369,0.052478,0.16789


***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
***** Running Evaluation *****
  Num examples = 3676
  Batch size = 1
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in 

RuntimeError: ignored