In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, BertPreTrainedModel, BertTokenizerFast, AdamW, BertConfig, GPT2ForSequenceClassification, GPT2TokenizerFast, get_linear_schedule_with_warmup, TrainingArguments, Trainer

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, data, labeled=True):
        self.data_ = data[['text', 'label']].copy().reset_index()
        self.labeled =  labeled
            
    def __len__(self):
        return self.data_.shape[0]
    
    def __getitem__(self, idx):
        inputs = tokenizer.encode_plus(
            str(self.data_.loc[idx]['text']),
            None,
            add_special_tokens=True,
            max_length=512,
            truncation=True, 
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        if self.labeled:
            return {
                'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long), 
                'labels': torch.tensor(self.data_.loc[idx]['label'], dtype=torch.long)
            }
        
        else:
            return {
                'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long)
            }

In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


model_bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', 
    num_labels=2
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=16,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model_bert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=TextDataset(train),
    eval_dataset=TextDataset(test)
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: name 'train' is not defined

In [6]:
trainer.evaluate()

{'eval_loss': 1.5158703327178955,
 'eval_accuracy': 0.768,
 'eval_f1': 0.7851851851851852,
 'eval_precision': 0.7162162162162162,
 'eval_recall': 0.8688524590163934,
 'epoch': 16.0}

BERT. 2 epoch 8 batch size

{'eval_loss': 0.6006467342376709,
 'eval_accuracy': 0.728,
 'eval_f1': 0.673076923076923,
 'eval_precision': 0.813953488372093,
 'eval_recall': 0.5737704918032787,
 'epoch': 2.0}

BERT. 4 epoch 32 batch size

{'eval_loss': 0.6244660019874573,
 'eval_accuracy': 0.648,
 'eval_f1': 0.639344262295082,
 'eval_precision': 0.639344262295082,
 'eval_recall': 0.639344262295082,
 'epoch': 4.0}
 
 
BERT. 4 epoch 8 batch size
{'eval_loss': 0.5174663662910461,
 'eval_accuracy': 0.768,
 'eval_f1': 0.7289719626168225,
 'eval_precision': 0.8478260869565217,
 'eval_recall': 0.639344262295082,
 'epoch': 4.0}

BERT. 8 epoch 16 batch size
{'eval_loss': 0.4260500967502594,
 'eval_accuracy': 0.816,
 'eval_f1': 0.7850467289719626,
 'eval_precision': 0.8936170212765957,
 'eval_recall': 0.7,
 'epoch': 8.0} (training_loss=0.4773511091868083)
 
BERT. 16 epoch 16 batch size
{'eval_loss': 1.5158703327178955,
 'eval_accuracy': 0.768,
 'eval_f1': 0.7851851851851852,
 'eval_precision': 0.7162162162162162,
 'eval_recall': 0.8688524590163934,
 'epoch': 16.0} training_loss=0.24790531396865845)

In [6]:
files = ['../data/celeb/train_bueno.csv']
results = pd.DataFrame(columns=['file', 'step', 'epochs', 'eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall'])
for f in files:
    data = pd.read_csv(f)
    train, test = train_test_split(data, train_size=.8)
    train, val = train_test_split(train, train_size=.8)
    aux = {'file': f}
    for epochs in range(1, 10, 2):
        aux.update({'epochs': epochs})
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            evaluate_during_training=True,
            logging_dir='./logs'
        )

        trainer = Trainer(
            model=model_bert,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=TextDataset(train)
        )

        trainer.train()
        aux.update({'step': 'val'})
        p = trainer.predict(TextDataset(val))
        aux.update(p.metrics)
        aux.update({'step': 'test'})
        p = trainer.predict(TextDataset(test))
        aux.update(p.metrics)
        results = results.append(aux, ignore_index=True)



Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [7]:
results.to_csv()

Unnamed: 0,file,step,epochs,eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall
0,../data/celeb/train_bueno.csv,test,1,0.724681,0.51,0.395062,0.457143,0.347826
1,../data/celeb/train_bueno.csv,test,3,0.584932,0.72,0.72549,0.660714,0.804348
2,../data/celeb/train_bueno.csv,test,5,1.032369,0.75,0.691358,0.8,0.608696
3,../data/celeb/train_bueno.csv,test,7,1.191203,0.75,0.683544,0.818182,0.586957
4,../data/celeb/train_bueno.csv,test,9,1.205576,0.8,0.777778,0.795455,0.76087


### BREXIT

In [49]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, data, labeled=True):
        self.data_ = data[['text', 'label']].copy().reset_index()
        self.labeled =  labeled
            
    def __len__(self):
        return self.data_.shape[0]
    
    def __getitem__(self, idx):
        inputs = tokenizer.encode_plus(
            str(self.data_.loc[idx]['text']),
            None,
            add_special_tokens=True,
            max_length=512,
            truncation=True, 
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        if self.labeled:
            return {
                'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long), 
                'labels': torch.tensor(self.data_.loc[idx]['label'], dtype=torch.long)
            }
        
        else:
            return {
                'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long)
            }


files = ['../data/brexit/test_02oct.csv', '../data/brexit/test_08oct.csv', '../data/brexit/test_26sept.csv']
results = pd.DataFrame(columns=['file', 'epochs', 'eval_loss', 'eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall'])
train = pd.read_csv('../data/brexit/train_bueno.csv')
train = train[~train['label'].isna()]
for epochs in range(1, 10, 2):
    aux = {'epochs': epochs}
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        evaluate_during_training=True,
        logging_dir='./logs'
    )

    trainer = Trainer(
        model=model_bert,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=TextDataset(train)
    )

    trainer.train()
    for f in files:
        aux.update({'file': f})
        test = pd.read_csv(f)
        test = test[~test['label'].isna()]
        p = trainer.predict(TextDataset(test))
        aux.update(p.metrics)
        results = results.append(aux, ignore_index=True)



Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
