In [1]:
#imports
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datasets import Dataset, load_metric
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

In [13]:
#Parameters
NUM_LABELS=2
keep_cols=['text','labels']
model_checkpoint = 'distilbert-base-uncased'
model_name = 'distilBERT'
training_data='ISOT'
text_col='text'
max_length = 512
batch_size = 8 
num_train_epochs=3 
n_training = 2000
seed=101
experiment = 2
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}')

### LOADING DATA

In [14]:
#Cargamos los datos
true_path = os.path.join(os.getcwd(), f'data/{training_data}/True.csv')
fake_path = os.path.join(os.getcwd(), f'data/{training_data}/Fake.csv')

true_dataset = pd.read_csv(true_path)
fake_dataset = pd.read_csv(fake_path)
print(f'Number of True examples: {true_dataset.shape[0]}')
print(f'Number of Fake examples: {fake_dataset.shape[0]}')
print(f'Columns:{true_dataset.columns.to_list()}')
print(f'Most repeated subjects:\n \
    True:  {true_dataset.subject.value_counts().index.to_list()[:10]}\n\
     False: {fake_dataset.subject.value_counts().index.to_list()[:10]}')
print(f'Average number of words in titles:\n\
      True:  {round(true_dataset.title.apply(lambda x: len(x)).mean())},\n\
      Fake:  {round(fake_dataset.title.apply(lambda x: len(x)).mean())}')
print(f'Average number of words in texts: \n\
      True:  {round(true_dataset.text.apply(lambda x: len(x)).mean())},\n\
      Fake:  {round(fake_dataset.text.apply(lambda x: len(x)).mean())}')

def print_row(input_df: pd.DataFrame, index: int, label: int) -> None:
    if label == 0:
        print(f"Label: True")
    else:
        print(f"Label: False")
    print(f"Title: {input_df.iat[index, 0]}")
    print(f"Text: {input_df.iat[index, 1][:200]}...")
    print(f"Subject: {input_df.iat[index, 2]}")
    print(f"date: {input_df.iat[index, 3]}\n")

print('\nPrinting some examples:\n')
print_row(input_df=fake_dataset, index=0, label=1)
print_row(input_df=true_dataset, index=0, label=0)

Number of True examples: 21417
Number of Fake examples: 23481
Columns:['title', 'text', 'subject', 'date']
Most repeated subjects:
     True:  ['politicsNews', 'worldnews']
     False: ['News', 'politics', 'left-news', 'Government News', 'US_News', 'Middle-east']
Average number of words in titles:
      True:  65,
      Fake:  94
Average number of words in texts: 
      True:  2383,
      Fake:  2547

Printing some examples:

Label: False
Title:  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing
Text: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former rea...
Subject: News
date: December 31, 2017

Label: True
Title: As U.S. budget fight looms, Republicans flip their fiscal script
Text: WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansio

In [15]:
def preprocess_data(fake, true, text_col=text_col, random_state=seed):
    #Añadimos la columna label
    true['labels'] = 0
    fake['labels'] = 1
    data = pd.concat([true, fake], axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True) #shuffle
    #Pasamos los textos a minúscula:
    data[text_col] = data[text_col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    data = data[[text_col,'labels']].rename(columns={text_col:'text'})
    return data

data = preprocess_data(fake_dataset, true_dataset)
print(f'{text_col} column word count description:')
data.text.apply(lambda x: len(x)).describe()

text column word count description:


count    44898.000000
mean      2469.109693
std       2171.617091
min          1.000000
25%       1234.000000
50%       2186.000000
75%       3105.000000
max      51794.000000
Name: text, dtype: float64

In [16]:
#Separamos en entrenamiento y test

def split_data(data, test_size=0.2, random_state=seed):
    X = data['text']
    y = data['labels']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = test_size, random_state = random_state)
    train_dataset = pd.concat([X_train,y_train], axis=1).reset_index(drop=True)
    test_dataset = pd.concat([X_test,y_test], axis=1).reset_index(drop=True)
    print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
    print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
    print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')
    return train_dataset, test_dataset

train_data, test_data = split_data(data, 0.2)

Numero de datos de entrenamiento: 35918. Numero de datos de test: 8980
Train data label count:
 0:17128,    1:18790
Test data label count:
 0: 4289,     1:4691


In [17]:
def get_sample_data(data, n = data.shape[0], label_dict = {0:1, 1:1} ,random_state=seed):
    df = pd.DataFrame(columns=data.columns)
    for label,frac in label_dict.items():
        tmp = data[data.labels == label].sample(frac=frac, random_state=random_state)
        df = pd.concat([df,tmp])
    return df.sample(frac=1, random_state=random_state)[:n].reset_index(drop=True)

#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


In [18]:
class Model:
    
    def __init__(self, checkpoint, num_labels):
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
        self.trainer = None
        self.training_args = None
    
    def tokenize(self, data):
        return self.tokenizer(data['text'], truncation=True)
    
    def compute_metrics(self, eval_pred):
        metric = load_metric("accuracy")
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)
    
    def train(self):
        print('Empezando entrenamiento:')
        self.trainer.train()
        print('Fin del Entrenamiento')
        
    def save(self, output_dir):
        print('Guardando modelo...')
        self.trainer.save_model(output_dir)
        print(f'Modelo guardado en {output_dir}')
    
    def get_predictions(self, encoded_texts):
        predictions=[]
        print('Getting predictions...')
        for _,x in tqdm(enumerate(encoded_texts), total=len(encoded_texts)):
            outputs = self.model(**x)
            logits = outputs['logits']
            y_pred = torch.argmax(logits, dim=-1)
            predictions.append(y_pred[0].item()) 
        return predictions

def tokenize_data(data, model):
    dataset = Dataset.from_pandas(data, preserve_index=False)
    return dataset.map(model.tokenize, batched=True, remove_columns='text')

def tokenize_docs(texts, tokenizer):
    return [tokenizer(str(text), truncation=True, return_tensors="pt") for _, text in tqdm(enumerate(texts), total=len(texts))]

def create_training_args(model, output_dir, epochs=num_train_epochs, batch_size=batch_size, 
                      learning_rate=2e-5, weight_decay=0.01, metric='accuracy', strategy = 'epoch', no_cuda=True):
    model.training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,     
        learning_rate=2e-5,                      #The initial learning rate for Adam
        per_device_train_batch_size=batch_size,  #The batch size per GPU/TPU core/CPU for training.
        per_device_eval_batch_size=batch_size,   #The batch size per GPU/TPU core/CPU for evaluation
        load_best_model_at_end=True,
        metric_for_best_model=metric,
        weight_decay=weight_decay, #
        evaluation_strategy=strategy,
        save_strategy=strategy, 
        no_cuda = True
    )
    print('Training args added to the model')
    
def create_trainer(model, enc_train_dataset, enc_test_dataset):
    model.trainer = Trainer(
        model=model.model, 
        args=model.training_args, 
        compute_metrics=model.compute_metrics,
        train_dataset=enc_train_dataset,
        eval_dataset=enc_test_dataset,
        tokenizer=model.tokenizer
    )
    print('Trainer added to the model')

In [8]:
#Training
model = Model(model_checkpoint, NUM_LABELS)
print('Tokenizing training data...')
enc_train_dataset = tokenize_data(train_dataset, model)
print('Tokenizing testing data...')
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

Tokenizing training data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing testing data...


Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.004404,0.999109
2,0.035500,0.003687,0.999443
3,0.035500,0.002311,0.999666


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
  metric = load_metric("accuracy")
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\checkpoint

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2


In [20]:
#Evaluation
metric_list = ['accuracy', 'f1', 'precision', 'recall']
def get_metrics(eval_data, model, metric_list=metric_list):
    y_real = eval_data.labels.values.tolist()
    print('Tokenizing docs...')
    enc_eval_data = tokenize_docs(eval_data.text.values, model.tokenizer)
    y_pred = model.get_predictions(enc_eval_data)
    results={}
    print('Getting metrics:')
    for metric in metric_list:
        if metric=='accuracy':
            m = load_metric(metric)
        else:
            m = load_metric(metric, 'macro')
        results[metric] = m.compute(predictions=y_pred, references=y_real)
        print(f'{metric}: {results[metric]}')
    print(f'Confusion matrix:\n {confusion_matrix(y_real, y_pred)}')
    return results

model = Model(output_dir, NUM_LABELS) ##########################
m1 = get_metrics(test_dataset, model)

Didn't find file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\added_tokens.json. We won't load it.
loading file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\vocab.txt
loading file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\tokenizer.json
loading file None
loading file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\special_tokens_map.json
loading file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\tokenizer_config.json
loading configuration file C:\Users\Usuario\MASTER\TFM\models/distilBERT/EXP_2\config.json
Model config DistilBertConfig {
  "_name_or_path": "C:\\Users\\Usuario\\MASTER\\TFM\\models/distilBERT/EXP_2",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "si

Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9996659242761693}
f1: {'f1': 0.9996802046690118}
precision: {'precision': 0.9997867803837953}
recall: {'recall': 0.9995736516734172}
Confusion matrix:
 [[4288    1]
 [   2 4689]]


### BERT Model


In [21]:
#Parameters
model_checkpoint = 'bert-base-uncased'
model_name = 'BERT'
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training)
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m2 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.002747,0.999443
2,0.031900,0.00108,0.999889
3,0.031900,0.001054,0.999889


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\checkpoint-500\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\m

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/BERT/EXP_2
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9998886414253898}
f1: {'f1': 0.9998934242779495}
precision: {'precision': 0.9997868712702472}
recall: {'recall': 1.0}
Confusion matrix:
 [[4288    1]
 [   0 4691]]


### RoBERTa Model


In [22]:
#Parameters
model_checkpoint = 'roberta-base'
model_name = 'RoBERTa'
output_dir = os.path.join(os.getcwd(), f'models/{model_name}/EXP_{experiment}')

#Train-test split
#train_dataset = get_sample_data(train_data, n_training, label_dict = {0:0.5, 1:1}) 
train_dataset = get_sample_data(train_data, n_training) 
test_dataset = get_sample_data(test_data)
print("Numero de datos de entrenamiento: {}. Numero de datos de test: {}".format(len(train_dataset), len(test_dataset)))
print(f'Train data label count:\n 0:{train_dataset[train_dataset.labels==0].shape[0]},\
    1:{train_dataset[train_dataset.labels==1].shape[0]}')
print(f'Test data label count:\n 0: {test_dataset[test_dataset.labels==0].shape[0]}, \
    1:{test_dataset[test_dataset.labels==1].shape[0]}')

#Training
model = Model(model_checkpoint, NUM_LABELS)
enc_train_dataset = tokenize_data(train_dataset, model)
enc_test_dataset = tokenize_data(test_dataset, model)
create_training_args(model, output_dir)
create_trainer(model, enc_train_dataset, enc_test_dataset)
model.train()
model.save(output_dir)

#Evaluation
m3 = get_metrics(test_dataset, model)

Numero de datos de entrenamiento: 2000. Numero de datos de test: 8980
Train data label count:
 0:968,    1:1032
Test data label count:
 0: 4289,     1:4691


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Usuario/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Training args added to the model
Trainer added to the model
Empezando entrenamiento:


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.002269,0.999666
2,0.044500,0.002872,0.999555
3,0.044500,0.00316,0.999555


***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-250
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-250\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-250\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-250\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-250\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 8980
  Batch size = 8
Saving model checkpoint to C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-500
Configuration saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-500\config.json
Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\checkpoint-500\pytorch_model.bin
tokenizer config file saved in C:\Us

Fin del Entrenamiento
Guardando modelo...


Model weights saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\pytorch_model.bin
tokenizer config file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\tokenizer_config.json
Special tokens file saved in C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2\special_tokens_map.json


Modelo guardado en C:\Users\Usuario\MASTER\TFM\models/RoBERTa/EXP_2
Tokenizing docs...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting predictions...


  0%|          | 0/8980 [00:00<?, ?it/s]

Getting metrics:
accuracy: {'accuracy': 0.9996659242761693}
f1: {'f1': 0.9996802728338485}
precision: {'precision': 0.9995737425404945}
recall: {'recall': 0.9997868258367086}
Confusion matrix:
 [[4287    2]
 [   1 4690]]
