In [1]:
#imports
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import torch
from random import randint as rnd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from evaluate import load
from sklearn.metrics import confusion_matrix
from transformers import TextClassificationPipeline
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [2]:
#Parameters
NUM_LABELS=2
keep_cols=['text','labels']
model_checkpoint = 'distilbert-base-uncased'
model_name = 'distilBERT'
training_data='ISOT'
text_col='title'
max_length = 512
batch_size = 8 
num_train_epochs=3 
n_training = 1000
seed=101
experiment = 1
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

### LOADING DATA

In [3]:
#Cargamos los datos
true_path = os.path.join(os.getcwd(), f'data/{training_data}/True.csv')
fake_path = os.path.join(os.getcwd(), f'data/{training_data}/Fake.csv')

true_dataset = pd.read_csv(true_path)
fake_dataset = pd.read_csv(fake_path)
print(f'Number of True examples: {true_dataset.shape[0]}')
print(f'Number of Fake examples: {fake_dataset.shape[0]}')
print(f'Columns:{true_dataset.columns.to_list()}')
print(f'Most repeated subjects:\n \
    True:  {true_dataset.subject.value_counts().index.to_list()[:10]}\n\
     False: {fake_dataset.subject.value_counts().index.to_list()[:10]}')
print(f'Average number of words in titles:\n\
      True:  {round(true_dataset.title.apply(lambda x: len(x)).mean())},\n\
      Fake:  {round(fake_dataset.title.apply(lambda x: len(x)).mean())}')
print(f'Average number of words in texts: \n\
      True:  {round(true_dataset.text.apply(lambda x: len(x)).mean())},\n\
      Fake:  {round(fake_dataset.text.apply(lambda x: len(x)).mean())}')

def print_row(input_df: pd.DataFrame, index: int, label: int) -> None:
    if label == 0:
        print(f"Label: True")
    else:
        print(f"Label: False")
    print(f"Title: {input_df.iat[index, 0]}")
    print(f"Text: {input_df.iat[index, 1][:200]}...")
    print(f"Subject: {input_df.iat[index, 2]}")
    print(f"date: {input_df.iat[index, 3]}\n")

print('\nPrinting some examples:\n')
print_row(input_df=fake_dataset, index=0, label=1)
print_row(input_df=true_dataset, index=0, label=0)

Number of True examples: 21417
Number of Fake examples: 23481
Columns:['title', 'text', 'subject', 'date']
Most repeated subjects:
     True:  ['politicsNews', 'worldnews']
     False: ['News', 'politics', 'left-news', 'Government News', 'US_News', 'Middle-east']
Average number of words in titles:
      True:  65,
      Fake:  94
Average number of words in texts: 
      True:  2383,
      Fake:  2547

Printing some examples:

Label: False
Title:  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing
Text: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former rea...
Subject: News
date: December 31, 2017

Label: True
Title: As U.S. budget fight looms, Republicans flip their fiscal script
Text: WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansio

In [4]:
def preprocess_data(fake, true, text_col=text_col, random_state=seed):
    #Añadimos la columna label
    true['labels'] = 0
    fake['labels'] = 1
    data = pd.concat([true, fake], axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True) #shuffle
    #Pasamos los textos a minúscula:
    data[text_col] = data[text_col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    data = data[[text_col,'labels']].rename(columns={text_col:'text'})
    return data

data = preprocess_data(fake_dataset, true_dataset)
print(f'{text_col} column word count description:')
data.text.apply(lambda x: len(x)).describe()

title column word count description:


count    44898.000000
mean        80.111720
std         25.379685
min          8.000000
25%         63.000000
50%         73.000000
75%         91.000000
max        286.000000
Name: text, dtype: float64

In [5]:
#Separamos en entrenamiento y test

def split_data(data, test_size=0.2, random_state=seed):
    X = data['text']
    y = data['labels']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = test_size, random_state = random_state)
    train_dataset = pd.concat([X_train,y_train], axis=1).reset_index(drop=True)
    test_dataset = pd.concat([X_test,y_test], axis=1).reset_index(drop=True)
    print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
    print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
    print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')
    return train_dataset, test_dataset

train_data, test_data = split_data(data, 0.2)

Numero de datos de entrenamiento: 35918. Numero de datos de test: 8980
Train data label count:
 0:17128,    1:18790
Test data label count:
 0: 4289,     1:4691


In [6]:
def get_sample_data(data, n = data.shape[0], label_dict = {0:1, 1:1} ,random_state=seed):
    df = pd.DataFrame(columns=data.columns)
    for label,frac in label_dict.items():
        tmp = data[data.labels == label].sample(frac=frac, random_state=random_state)
        df = pd.concat([df,tmp])
    return df.sample(frac=1, random_state=random_state)[:n].reset_index(drop=True)

#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

Numero de datos de entrenamiento: 1000. Numero de datos de test: 8980
Train data label count:
 0:497,    1:503
Test data label count:
 0: 4289,     1:4691


In [7]:
class Model:
    
    def __init__(self, checkpoint, num_labels):
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
        self.trainer = None
        self.training_args = None
    
    def tokenize(self, data):
        return self.tokenizer(data['text'], truncation=True)
    
    def compute_metrics(self, eval_pred):
        metric = load_metric("accuracy")
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)
    
    def train(self):
        print('Empezando entrenamiento:')
        self.trainer.train()
        print('Fin del Entrenamiento')
        
    def save(self, output_dir):
        print('Guardando modelo...')
        self.trainer.save_model(output_dir)
        print(f'Modelo guardado en {output_dir}')
    
    def get_predictions(self, encoded_texts):
        predictions=[]
        print('Getting predictions...')
        for _,x in tqdm(enumerate(encoded_texts), total=len(encoded_texts)):
            outputs = self.model(**x)
            logits = outputs['logits']
            y_pred = torch.argmax(logits, dim=-1)
            predictions.append(y_pred[0].item()) 
        return predictions

def tokenize_data(data, model):
    dataset = Dataset.from_pandas(data, preserve_index=False)
    return dataset.map(model.tokenize, batched=True, remove_columns='text')

def tokenize_docs(texts, tokenizer):
    return [tokenizer(str(text), truncation=True, return_tensors="pt") for _, text in tqdm(enumerate(texts), total=len(texts))]

def create_training_args(model, output_dir, epochs=num_train_epochs, batch_size=batch_size, 
                      learning_rate=2e-5, weight_decay=0.01, metric='accuracy', strategy = 'epoch', no_cuda=True):
    model.training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,     
        learning_rate=2e-5,                      #The initial learning rate for Adam
        per_device_train_batch_size=batch_size,  #The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size=batch_size,   #The batch size per GPU/TPU core/CPU for evaluation
        load_best_model_at_end=True,
        metric_for_best_model=metric,
        weight_decay=weight_decay, #
        evaluation_strategy=strategy,
        save_strategy=strategy, 
        no_cuda = True
    )
    print('Training args added to the model')
    
def create_trainer(model, enc_train_dataset, enc_test_dataset):
    model.trainer = Trainer(
        model=model.model, 
        args=model.training_args, 
        compute_metrics=model.compute_metrics,
        train_dataset=enc_train_dataset,
        eval_dataset=enc_test_dataset,
        tokenizer=model.tokenizer
    )
    print('Trainer added to the model')

In [8]:
#Training
model = Model(model_checkpoint, NUM_LABELS)
print('Tokenizing training data...')
enc_train_dataset = tokenize_data(train_dataset, model)
print('Tokenizing testing data...')
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

Tokenizing training data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing testing data...


Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.13955,0.952561
2,No log,0.139145,0.958018
3,No log,0.173704,0.955679


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
  metric = load_metric("accuracy")
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-125
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-125\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-125\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-125\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-125\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/1000


In [9]:
#Evaluation
metric_list = ['accuracy', 'f1', 'precision', 'recall']
def get_metrics(eval_data, model, metric_list=metric_list):
    y_real = eval_data.labels.values.tolist()
    print('Tokenizing docs...')
    enc_eval_data = tokenize_docs(eval_data.text.values, model.tokenizer)
    y_pred = model.get_predictions(enc_eval_data)
    results={}
    print('Getting metrics:')
    for metric in metric_list:
        if metric=='accuracy':
            m = load_metric(metric)
        else:
            m = load_metric(metric, 'macro')
        results[metric] = m.compute(predictions=y_pred, references=y_real)
        print(f'{metric}: {results[metric]}')
    print(f'Confusion matrix:\n {confusion_matrix(y_real, y_pred)}')
    return results

m11 = get_metrics(test_dataset, model)

Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9580178173719376}
f1: {'f1': 0.9600508636219137}
precision: {'precision': 0.95448798988622}
recall: {'recall': 0.9656789597100831}
Confusion matrix:
 [[4073  216]
 [ 161 4530]]


**n_training = 2000**

In [10]:
n_training = 2000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m12 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\Usuario/.cache\huggingface\transformers\0e1bbfd

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.108709,0.965924
2,0.168500,0.176569,0.961247
3,0.168500,0.128167,0.97049


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\check

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/2000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.970489977728285}
f1: {'f1': 0.9715268077790911}
precision: {'precision': 0.979419410745234}
recall: {'recall': 0.9637603922404605}
Confusion matrix:
 [[4194   95]
 [ 170 4521]]


**n_training = 5000**

In [11]:
n_training = 5000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m13 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 5000. Numero de datos de test: 8980
Train data label count:
 0:2400,    1:2600
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\Usuario/.cache\huggingface\transformers\0e1bbfd

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1796,0.089398,0.976392
2,0.0745,0.119214,0.975947
3,0.0308,0.122673,0.977728


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-625
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-625\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-625\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-625\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-625\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-1250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\checkpoint-1250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\che

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_1/5000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.977728285077951}
f1: {'f1': 0.9783970620004321}
precision: {'precision': 0.9916794394569739}
recall: {'recall': 0.9654657855467917}
Confusion matrix:
 [[4251   38]
 [ 162 4529]]


### BERT Model

**n_training=1000**

In [12]:
#Parameters
model_checkpoint = 'bert-base-uncased'
model_name = 'BERT'
n_training = 1000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m21 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 1000. Numero de datos de test: 8980
Train data label count:
 0:497,    1:503
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.28149,0.926949
2,No log,0.170406,0.95657
3,No log,0.176841,0.964143


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-125
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-125\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-125\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-125\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-125\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\checkpoint-250\pytorch_model.bin
tokenizer config fil

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/1000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9641425389755011}
f1: {'f1': 0.9650380021715526}
precision: {'precision': 0.9834034078335915}
recall: {'recall': 0.947345981667022}
Confusion matrix:
 [[4214   75]
 [ 247 4444]]


**n_training=2000**

In [13]:
n_training = 2000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m22 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.123307,0.967261
2,0.168100,0.212155,0.958352
3,0.168100,0.144527,0.970379


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\checkpoint-500\pytorch_model.bin
tokenizer config fil

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/2000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9703786191536748}
f1: {'f1': 0.9711809317443121}
precision: {'precision': 0.9874421678783873}
recall: {'recall': 0.9554465998720955}
Confusion matrix:
 [[4232   57]
 [ 209 4482]]


**n_training=5000**

In [14]:
n_training = 5000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m23 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 5000. Numero de datos de test: 8980
Train data label count:
 0:2400,    1:2600
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1896,0.109704,0.976503
2,0.0696,0.128432,0.978731
3,0.0275,0.147003,0.976949


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-625
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-625\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-625\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-625\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-625\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-1250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-1250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\checkpoint-1250\pytorch_model.bin
tokenizer config 

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_1/5000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9787305122494432}
f1: {'f1': 0.9794025665911787}
precision: {'precision': 0.9910519423832388}
recall: {'recall': 0.9680238755062887}
Confusion matrix:
 [[4248   41]
 [ 150 4541]]


### RoBERTa Model

**n_training=1000**

In [15]:
#Parameters
model_checkpoint = 'roberta-base'
model_name = 'RoBERTa'
n_training = 1000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m31 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 1000. Numero de datos de test: 8980
Train data label count:
 0:497,    1:503
Test data label count:
 0: 4289,     1:4691


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.14658,0.965702
2,No log,0.18876,0.963363
3,No log,0.162409,0.972272


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-125
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-125\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-125\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-125\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-125\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\checkpoint-250\pytorch_model.

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/1000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.972271714922049}
f1: {'f1': 0.973228685087625}
precision: {'precision': 0.9817787418655097}
recall: {'recall': 0.9648262630569175}
Confusion matrix:
 [[4205   84]
 [ 165 4526]]


**n_training=2000**

In [16]:
n_training = 2000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m32 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.082833,0.979176
2,0.149300,0.109491,0.978731
3,0.149300,0.089002,0.983185


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\checkpoint-500\pytorch_model.

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/2000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.983184855233853}
f1: {'f1': 0.9838069705093834}
precision: {'precision': 0.9898575744497194}
recall: {'recall': 0.9778298870176935}
Confusion matrix:
 [[4242   47]
 [ 104 4587]]


**n_training=5000**

In [17]:
n_training = 5000
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}/{n_training}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m33 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 5000. Numero de datos de test: 8980
Train data label count:
 0:2400,    1:2600
Test data label count:
 0: 4289,     1:4691


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1957,0.095363,0.983073
2,0.0752,0.06597,0.98686
3,0.0493,0.067767,0.988419


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-625
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-625\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-625\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-625\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-625\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-1250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-1250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\checkpoint-1250\pytorch_mod

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_1/5000
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9884187082405346}
f1: {'f1': 0.9888650963597431}
precision: {'precision': 0.9933318993331899}
recall: {'recall': 0.9844382860797272}
Confusion matrix:
 [[4258   31]
 [  73 4618]]
