# Finetuning using Huggingface e2e

In [3]:
import os
import aiohttp

import pandas as pd
import numpy as np
import torch

import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

from imblearn.over_sampling import RandomOverSampler

import os
os.environ['TRANSFORMERS_CACHE'] = 'data/volume_1/cache_hf'
os.environ['HF_HOME'] = 'data/volume_1/cache_hf'

In [4]:
#!sudo kill -23929 pid
#!sudo kill -28672 pid
# torch.cuda.empty_cache()
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f20892659f0>

In [5]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: GeForce RTX 2080 Ti


## Load processed labeled data

In [3]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

DECADE = "1970s"
TYPE = "coal"
MAX_LENGHT = 512
DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/"

checkpoint = "wietsedv/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

In [4]:
#df = pd.read_csv(os.path.join(DATA_DIR, f"{DECADE}_{TYPE}_merged_split.csv"))

## Load Dataset

In [11]:
from datasets import load_dataset

def clean_data(raw_datasets):
    raw_datasets = raw_datasets.remove_columns(['Unnamed: 0', 'Unnamed: 0.1','article_name', 'text'])
    raw_datasets = raw_datasets.rename_column('labels', 'label')
    raw_datasets = raw_datasets.rename_column('text_split', 'text')

    raw_datasets = raw_datasets.train_test_split(test_size=0.1)
    
    return raw_datasets

In [12]:
def tokenize_function(row):
    return tokenizer(
        row["text"],
        truncation=True,
)

def tokenize_data(raw_datasets):
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    return tokenized_datasets

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LENGHT)

# Check
samples = tokenized_datasets["train"][:8]
samples = {
    k: v for k, v in samples.items() if k not in ["text"]
}
[len(x) for x in samples["input_ids"]]
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([8, 51]),
 'input_ids': torch.Size([8, 51]),
 'token_type_ids': torch.Size([8, 51]),
 'labels': torch.Size([8])}

# Training

In [13]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

training_args = TrainingArguments(
    output_dir = "/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs",
    num_train_epochs=1,              # total number of training epochs
    evaluation_strategy="steps",
    logging_dir="~/dev/hist-aware/notebooks/logging",
    logging_steps=20,
    load_best_model_at_end=True,  
    seed=2020,
    #label_names=["label"], # check this
    disable_tqdm=False
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

In [14]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_preds):
    metric = load_metric("glue", "cola")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [15]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss,Validation Loss,Matthews Correlation,Runtime,Samples Per Second
20,0.7913,0.582832,0.0,3.64,81.869
40,0.5746,0.628829,0.0,2.9304,101.692
60,0.7398,0.630543,0.0,3.7471,79.529
80,0.6653,0.597257,0.0,3.357,88.769
100,0.4383,0.680176,0.0,2.7471,108.478
120,0.6233,0.569953,0.0,3.5528,83.878
140,0.5566,0.58724,0.0,3.0777,96.825
160,0.7172,0.542909,0.0,3.5727,83.41
180,0.5045,0.556724,0.0,3.3306,89.473
200,0.598,0.52476,0.0,3.3052,90.161


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


TrainOutput(global_step=336, training_loss=0.5760182312556675, metrics={'train_runtime': 283.7545, 'train_samples_per_second': 1.184, 'total_flos': 99139897752840.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': -92377088, 'init_mem_gpu_alloc_delta': 436978176, 'init_mem_cpu_peaked_delta': 92377088, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 152940544, 'train_mem_gpu_alloc_delta': 1758761984, 'train_mem_cpu_peaked_delta': 290189312, 'train_mem_gpu_peaked_delta': 727300608})

In [16]:
model.save_pretrained("/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/finetuned-models")

## Prediction

Now we predict the selected data for this given 

# Fine tune multiple models

In [6]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/"

checkpoint = "wietsedv/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

In [8]:
import os

DECADES = ["1960s", "1970s", "1980s", "1990s"]
TYPES = ["coal", "gas", "oil"]

for DECADE in DECADES:
    for TYPE in TYPES:
        # Load and clean dataset
        NAME = f"{DECADE}_{TYPE}"
        
        os.mkdir(f"/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/{NAME}")
        
        raw_datasets = load_dataset(
            'csv',
            data_files=os.path.join(DATA_DIR, f"{DECADE}_{TYPE}_merged_split.csv"), split="train"
        )
        raw_datasets = clean_data(raw_datasets)
        tokenized_datasets = tokenize_data(raw_datasets)
        
        # Create data collator
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LENGHT)
        
        training_args = TrainingArguments(
            output_dir = "/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs",
            num_train_epochs=4,              # total number of training epochs
            evaluation_strategy="steps",
            logging_dir="~/dev/hist-aware/notebooks/logging",
            logging_steps=20,
            load_best_model_at_end=True,  
            seed=2020,
            #label_names=["label"], # check this
            disable_tqdm=False
        )

        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
        
        trainer = Trainer(
            model,
            training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()
        
        model.save_pretrained(f"/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/{NAME}")

FileNotFoundError: [Errno 2] No such file or directory: '/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/1960s_coal_merged_split.csv'