Parte 1: Generación del conjunto de datos

In [130]:
import pandas as pd
import numpy as np
import unicodedata
import re
import os
import csv
import sklearn.metrics as metrics

In [131]:
!pip install transformers



In [132]:
# Read the whole TSV into memory and close the file handle right away.
# The rows are materialised as a list (instead of keeping a live csv.reader
# over an open file) so that later cells can iterate `read_tsv` again when
# they are re-run. `newline=""` is what the csv module expects for its input.
# NOTE(review): csv.reader's default dialect treats '"' as a quote character;
# if sentences in this file contain unbalanced double quotes, several lines can
# be merged into one row. Confirm the resulting row count against the dataset
# and consider quoting=csv.QUOTE_NONE.
with open("/content/lcp_single_test.tsv", newline="") as tsv_file:
    read_tsv = list(csv.reader(tsv_file, delimiter="\t"))

In [133]:
# Build the raw frame in one shot: the first row of `read_tsv` is the header,
# every following row is a record of which the first five fields are kept.
# Collecting the records in a list and constructing the DataFrame once avoids
# growing the frame row by row with `.loc`, which copies the frame each time.
rows = iter(read_tsv)
header = next(rows, None)

records = []
for row_number, row in enumerate(rows, start=2):
    if len(row) < 5:
        # Same exception type the positional access row[4] would raise,
        # but pointing at the offending row.
        raise IndexError(
            f"row {row_number}: expected at least 5 fields, got {len(row)}"
        )
    records.append(row[:5])

# With no header row at all (empty input) fall back to five unnamed columns.
data_raw = pd.DataFrame(records, columns=range(5) if header is None else header)


In [134]:
# Preview the first 10 rows of the raw frame.
data_raw.head(10)

Unnamed: 0,id,corpus,sentence,token,complexity
0,3K8CQCU3KE19US5SN890DFPK3SANWR,bible,"But he, beckoning to them with his hand to be ...",hand,0.0
1,3Q2T3FD0ON86LCI41NJYV3PN0BW3MV,bible,"If I forget you, Jerusalem, let my right hand ...",hand,0.1973684210526315
2,3ULIZ0H1VA5C32JJMKOTQ8Z4GUS51B,bible,"the ten sons of Haman the son of Hammedatha, t...",hand,0.2
3,3BFF0DJK8XCEIOT30ZLBPPSRMZQTSD,bible,Let your hand be lifted up above your adversar...,hand,0.2678571428571429
4,3QREJ3J433XSBS8QMHAICCR0BQ1LKR,bible,"Abimelech chased him, and he fled before him, ...",entrance,0.0
5,30F94FBDNRKF12C1PDZGZ0IRI9XTB1,bible,"for as he thinks about the cost, so he is.",cost,0.25
6,3JHB4BPSFK9JLT5CAPRP3PCEA2CQ9R,bible,"When his armor bearer saw that Saul was dead, ...",armor,0.0277777777777777
7,3Y40HMYLL1I1EIURUEH8TTVLKQ9XUM,bible,"Therefore put on the whole armor of God, that ...",armor,0.2
8,30Y6N4AHYPWV0KXTZKLRUB59XBPRDA,bible,"Hezekiah listened to them, and showed them all...",armor,0.2222222222222222
9,38O9DZ0A62NFBR1TA7YCNXN94XN26F,bible,"That first slaughter, which Jonathan and his a...",armor,0.3194444444444444


In [135]:
# Build the model input text as "<sentence> [SEP] <token>", with both parts
# lower-cased, and keep the complexity column next to it.
# The columns are combined in one vectorised step instead of appending one row
# at a time with `.loc` (which copies the frame on every insertion).
# `complexity` is carried over unchanged from `data_raw`.
data = pd.DataFrame({
    'text': data_raw['sentence'].str.lower() + ' [SEP] ' + data_raw['token'].str.lower(),
    'complexity': data_raw['complexity'],
})


In [138]:
# Show the combined text of the row labelled 0.
data.loc[0, 'text']

'but he, beckoning to them with his hand to be silent, declared to them how the lord had brought him out of the prison. [SEP] hand'

In [139]:
from transformers import BertTokenizer
# Load the pretrained 'bert-large-uncased' tokenizer. It is kept as a
# module-level name because vectorize_text reads `tokenizer` directly.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [None]:
# Preprocess, tokenize and vectorize texts and labels
def _clean_text(s):
    """Lower-case `s`, strip accents and drop characters outside the allowed set."""
    s = s.strip().lower()
    # NFD splits accented letters into base letter + combining mark (category
    # 'Mn'); dropping the marks removes the accents.
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    # Surround punctuation with spaces so each mark stands on its own.
    s = re.sub(r'([.¿?()\'",;:$€])', r' \1 ', s)
    # Replace every run of characters outside the allowed set (digits,
    # brackets, quotes, whitespace, ...) with a single space.
    s = re.sub(r"[^a-zA-Záéíóú.,!?;:()$€]+", r" ", s)
    return s


def vectorize_text(s, max_length):
    """Clean a text and convert it to token ids with the module-level `tokenizer`.

    If `s` contains a ``[SEP]`` marker (any letter case), the text before the
    first marker and the text after it are cleaned separately and encoded as a
    text pair. The split must happen before cleaning: the cleaning step removes
    '[' and ']', which would otherwise turn the marker into the plain word
    "sep". Text without a marker is cleaned and encoded as a single sequence.

    Args:
        s: input text, optionally of the form "<first part> [SEP] <second part>".
        max_length: maximum number of token ids; longer inputs are truncated.

    Returns:
        1-D numpy array of token ids, with the tokenizer's special tokens added.
    """
    first, *rest = re.split(r'\[SEP\]', s, maxsplit=1, flags=re.IGNORECASE)
    text = _clean_text(first)
    text_pair = _clean_text(rest[0]) if rest else None
    if text_pair is not None and not text_pair.strip():
        # Nothing usable after the marker: encode as a single sequence.
        text_pair = None

    # Text to tensor
    input_ids = tokenizer.encode(
      text,
      text_pair=text_pair,
      add_special_tokens=True,
      max_length=max_length,
      padding='longest',
      truncation=True,
      return_tensors='np'
    )
    # encode(..., return_tensors='np') yields a batch of one; return its only row.
    return input_ids[0]

In [141]:
# Tokenize every text (at most 512 token ids each) and store the resulting
# id arrays in a new `text_vec` column.
data['text_vec'] = data['text'].map(lambda text: vectorize_text(text, 512))

In [142]:
# Show the frame with its text, complexity and text_vec columns.
data

Unnamed: 0,text,complexity,text_vec
0,"but he, beckoning to them with his hand to be ...",0.0,"[101, 2021, 2002, 1010, 10272, 13369, 2000, 20..."
1,"if i forget you, jerusalem, let my right hand ...",0.19736842105263158,"[101, 2065, 1045, 5293, 2017, 1010, 6744, 1010..."
2,"the ten sons of haman the son of hammedatha, t...",0.2,"[101, 1996, 2702, 4124, 1997, 10654, 2319, 199..."
3,let your hand be lifted up above your adversar...,0.2678571428571429,"[101, 2292, 2115, 2192, 2022, 4196, 2039, 2682..."
4,"abimelech chased him, and he fled before him, ...",0.0,"[101, 11113, 14428, 2571, 2818, 13303, 2032, 1..."
...,...,...,...
882,"the report by mr philippe busquin, on behalf o...",0.5694444444444444,"[101, 1996, 3189, 2011, 2720, 11169, 3902, 125..."
883,section v - court of auditors (sec(2002) 405 -...,0.5357142857142857,"[101, 2930, 1058, 2457, 1997, 20964, 2015, 100..."
884,"- mr president, on 1 june, on the basis of inf...",0.546875,"[101, 2720, 2343, 1010, 2006, 2238, 1010, 2006..."
885,i would like to extend a warm welcome to this ...,0.5714285714285714,"[101, 1045, 2052, 2066, 2000, 7949, 1037, 4010..."


In [143]:
# Shuffle the rows and split them into train / held-out portions.
# sort_index() restores the original row order first, so re-running this cell
# always shuffles from the same starting point; together with the fixed
# random_state this makes the split reproducible.
data = data.sort_index().sample(frac=1, random_state=42)
train_portion = 0.9
split_point = int(train_portion*len(data))
train_data, test_data = data[:split_point], data[split_point:]
print(len(train_data), 'train, ', len(test_data), 'test')

798 train,  89 test


In [144]:
# Create the PyTorch dataset
from torch.utils.data import Dataset, DataLoader
import torch

class OffendEs(Dataset):
    """Map-style dataset over a DataFrame with `text_vec` and `complexity` columns."""

    def __init__(self, dataframe):
        # Keep a reference to the frame and cache its number of rows.
        self.data = dataframe
        self.len = len(dataframe)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

    def __getitem__(self, index):
        """Return token ids, an all-ones attention mask and the target at position `index`."""
        token_ids = torch.tensor(self.data['text_vec'].iloc[index]).cpu()
        # One mask entry per token id; no padding exists at this stage.
        mask = torch.ones([token_ids.size(0)]).cpu()
        return dict(
            input_ids=token_ids,
            attention_mask=mask,
            targets=self.data['complexity'].iloc[index],
        )

In [145]:
# Wrap each split in the dataset class.
train_set = OffendEs(train_data)
test_set = OffendEs(test_data)

In [148]:
# Inspect the DataFrame held by the training dataset.
train_set.data

Unnamed: 0,text,complexity,text_vec
586,"when these issues are being discussed, parliam...",0.3055555555555556,"[101, 2043, 2122, 3314, 2024, 2108, 6936, 1010..."
300,their phenotypic features are very similar to ...,0.33333333333333337,"[101, 2037, 6887, 16515, 23186, 2838, 2024, 22..."
500,the cause of goiter appears to be an impairmen...,0.43181818181818177,"[101, 1996, 3426, 1997, 2175, 21646, 3544, 200..."
54,these are those who went over the jordan in th...,0.21052631578947367,"[101, 2122, 2024, 2216, 2040, 2253, 2058, 1996..."
540,many of these dimorphisms affect somatic cells...,0.75,"[101, 2116, 1997, 2122, 11737, 2953, 21850, 19..."
...,...,...,...
656,"the next item is the report by iles braghetto,...",0.31666666666666665,"[101, 1996, 2279, 8875, 2003, 1996, 3189, 2011..."
617,"- (pl) madam president, commissioner, with thi...",0.25,"[101, 1006, 20228, 1007, 21658, 2343, 1010, 58..."
196,"after him, nehemiah the son of azbuk, the rule...",0.32894736842105265,"[101, 2044, 2032, 1010, 11265, 29122, 12215, 1..."
445,"we observe sam68 staining in the brain, heart,...",0.3382352941176471,"[101, 2057, 11949, 3520, 21101, 2075, 1999, 19..."


In [147]:
# Evaluating the dataset object itself only shows its default object repr.
train_set

<__main__.OffendEs at 0x7fa4b16b2790>

In [None]:
# Number of examples in the training split.
len(train_set)
#len(test_set)

798

In [None]:
def collate_batch(batch):
    """Collate dataset items into one padded batch.

    Every sequence is right-padded with zeros up to the longest `input_ids`
    in the batch, so tensors are sized by the batch at hand rather than by a
    global maximum length.

    Args:
        batch: non-empty list of dicts with 'input_ids' and 'attention_mask'
            (1-D tensors) and 'targets' (a number or a numeric string).

    Returns:
        dict with
          "input_ids":      long tensor of shape (len(batch), max_len)
          "attention_mask": long tensor of the same shape, 0 on padding
          "targets":        float tensor of shape (len(batch),)

    Raises:
        ValueError: if `batch` is empty, or a target cannot be parsed as a float.
    """
    max_size = max(ex['input_ids'].size(0) for ex in batch)

    # Allocate the padded tensors once and copy each example into its row;
    # untouched positions stay 0, i.e. padding.
    input_ids = torch.zeros((len(batch), max_size), dtype=torch.long)
    attention_mask = torch.zeros((len(batch), max_size), dtype=torch.long)
    for row, item in enumerate(batch):
        ids, mask = item['input_ids'], item['attention_mask']
        input_ids[row, :ids.size(0)] = ids
        attention_mask[row, :mask.size(0)] = mask

    # Targets are fractional scores: an integer dtype would truncate them
    # (0.25 -> 0), so they are kept as floats. float() also accepts the
    # string form of a number.
    targets = torch.tensor(
        [float(item['targets']) for item in batch], dtype=torch.float
    )

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "targets": targets
    }