# End-to-end fine-tuning with Hugging Face Transformers

In [1]:
import os

# The Hugging Face cache location is resolved when `transformers` is first
# imported, so both variables have to be set BEFORE that import; setting them
# afterwards (as this cell used to do) leaves the default cache in place.
os.environ['TRANSFORMERS_CACHE'] = 'data/volume_1/cache_hf'
os.environ['HF_HOME'] = 'data/volume_1/cache_hf'

import aiohttp

import pandas as pd
import numpy as np
import torch

import transformers
import tensorflow as tf
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

from imblearn.over_sampling import RandomOverSampler

In [2]:
import random  # stdlib RNG; it is not covered by the NumPy / PyTorch seeds

# Seed every random number generator this notebook draws from so that a fresh
# "Restart & Run All" starts from the same state.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Kept as the last expression so the cell still displays the torch generator.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f5a83668a10>

In [3]:
# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. A short message states which one was chosen.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: GeForce RTX 2080 Ti


## Load processed labeled data

In [4]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Maximum sequence length, in tokens, used by the tokenizer helpers below.
MAX_LENGTH = 512
# Misspelled alias kept on purpose: the other cells still read `MAX_LENGHT`.
MAX_LENGHT = MAX_LENGTH
# The fine-tuning cells and the evaluation helper both work with three classes.
NUM_LABELS = 3
DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/"

checkpoint = "wietsedv/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Pass the label count explicitly so this model's classification head matches
# the three-class models created in the fine-tuning cells, instead of falling
# back to the library's default head size.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=NUM_LABELS)

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

### Helper: tokenizer

In [5]:
from transformers import DataCollatorWithPadding

def tokenize_function(row):
    """Tokenize the ``"text"`` entry of a dataset row (or batch of rows).

    Args:
        row: Mapping with a ``"text"`` key. ``tokenize_data`` calls this
            through ``Dataset.map(..., batched=True)``, so the value is then
            a list of strings.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    return tokenizer(
        row["text"],
        truncation=True,
        # State the limit explicitly: without it, truncation depends on the
        # tokenizer having a predefined maximum length for this checkpoint.
        max_length=MAX_LENGHT,
    )

def tokenize_data(raw_datasets):
    """Run ``tokenize_function`` over ``raw_datasets`` in batched mode.

    Args:
        raw_datasets: Object exposing a ``map(function, batched=...)`` method
            (the fine-tuning cells pass Hugging Face ``Dataset`` objects).

    Returns:
        Whatever ``raw_datasets.map`` returns for the tokenizing function.
    """
    return raw_datasets.map(tokenize_function, batched=True)

# Batch collator built from the shared tokenizer; MAX_LENGHT is forwarded as
# its `max_length`.
# NOTE(review): `max_length` may have no effect with the collator's default
# padding mode - confirm against the DataCollatorWithPadding documentation.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    max_length=MAX_LENGHT,
)

### Helper: metrics

In [6]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_preds):
    """Score predictions with the ``glue``/``cola`` metric from ``datasets``.

    Args:
        eval_preds: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        The result of the metric's ``compute`` call.
        NOTE(review): presumably a dict holding the Matthews correlation for
        this metric configuration - confirm against the metric card.
    """
    # The metric is loaded on every call, as in the original implementation.
    cola_metric = load_metric("glue", "cola")
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return cola_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

def compute_metrics_many(p):
    """Compute accuracy, precision and F1 for a multi-class evaluation step.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one score per
            class for every example; the predicted class is the argmax along
            axis 1. ``labels`` holds the true class ids.

    Returns:
        dict: ``{"accuracy": ..., "precision": ..., "f1": ...}``. Precision
        and F1 are macro-averaged so every class counts equally.
    """
    # Imported here because none of these names is imported anywhere else in
    # the notebook (the previous version referenced them without an import).
    from sklearn.metrics import accuracy_score, f1_score, precision_score

    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # The scikit-learn default (binary averaging) raises on multi-class
    # targets, so an explicit averaging mode is required here.
    # zero_division=0 scores a class that was never predicted as 0 without
    # emitting a warning at every evaluation.
    precision = precision_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "f1": f1}

### Helper: prediction

In [7]:
from transformers import pipeline
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
    
def show_confusion_matrix(confusion_matrix):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        confusion_matrix: Table of integer counts (cells are rendered with
            the ``"d"`` format); ``predict_test`` passes a DataFrame whose
            index and columns are the class names. Inside this function the
            parameter name shadows the scikit-learn function of the same name.

    Returns:
        The object returned by ``sns.heatmap``, so callers can keep styling
        or saving the figure. Previously nothing was returned.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')
    return hmap
    
def predict_test(model, test_dataset):
    """Classify every text of ``test_dataset`` and summarise the outcome.

    Args:
        model: Passed unchanged as ``model`` to ``transformers.pipeline``
            (the commented-out calls in this notebook pass a model folder).
        test_dataset: Object whose ``["text"]`` yields the texts to classify
            and whose ``["label"]`` yields the matching true class ids
            (0, 1 or 2).

    Returns:
        tuple: ``(df_cm, class_rep)``. ``df_cm`` is the 3x3 confusion matrix
        as a DataFrame labelled with the class names, and ``class_rep`` is
        the text produced by ``classification_report``. The heatmap is drawn
        as a side effect via ``show_confusion_matrix``.

    Raises:
        KeyError: If the pipeline emits a label other than ``LABEL_0``,
            ``LABEL_1`` or ``LABEL_2``.
    """
    # Use the pipeline helper for convenience; it relies on the module-level
    # tokenizer.
    text_classification = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer
    )

    label_to_id = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}
    class_names = ["Negative", "Neutral", "Positive"]
    class_ids = list(range(len(class_names)))

    # Predict one text at a time. The previous loop classified the whole text
    # column on every iteration and kept only a single prediction.
    y_preds = []
    for text in test_dataset["text"]:
        # The result is indexed as a list whose first item carries the label.
        prediction = text_classification(text)
        y_preds.append(label_to_id[prediction[0]["label"]])

    # Gather the ground truth for the same rows.
    y_test = list(test_dataset["label"])

    # Build outputs. Passing the class ids explicitly keeps the matrix 3x3
    # even when a class is missing from this particular test split.
    cm = confusion_matrix(y_test, y_preds, labels=class_ids)
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    show_confusion_matrix(df_cm)
    class_rep = classification_report(
        y_test, y_preds, labels=class_ids, target_names=class_names
    )

    return (df_cm, class_rep)

# Fine tune multiple models

In [8]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split/"

checkpoint = "wietsedv/bert-base-dutch-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Three output classes, matching the models created in the fine-tuning loops
# below; without `num_labels` the head falls back to the library default.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
# NOTE(review): this Keras metric is not referenced by any cell visible in
# this notebook - confirm whether it is still needed.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

## Finetune: Type per decade (12 models)

In [None]:
%%time 

import os
import datasets
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback


DECADES = ["1960s", "1970s", "1980s", "1990s"]
TYPES = ["oil", "gas", "coal"]

# Folder that receives one sub-folder per (decade, type) model.
OUTPUT_ROOT_PER_DECADE_TYPE = "/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs"

# The collator only depends on the tokenizer, so it is built once for all runs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LENGHT)

for DECADE in DECADES:
    for TYPE in TYPES:
        NAME = f"{DECADE}_{TYPE}"
        DIR_MODEL = os.path.join(OUTPUT_ROOT_PER_DECADE_TYPE, NAME)

        # An existing output folder marks this combination as already done and
        # is skipped. Only that case is swallowed: any other filesystem error
        # (permissions, full disk, ...) now surfaces instead of silently
        # skipping the model. A run that fails later leaves its folder behind
        # and will therefore be skipped on the next execution.
        try:
            os.makedirs(DIR_MODEL)
        except FileExistsError:
            continue

        # Load and shuffle the labelled data; the explicit seed makes the
        # shuffle and both splits repeatable regardless of cell run order.
        df = pd.read_csv(
            os.path.join(DATA_DIR, f"{DECADE}_{TYPE}_merged_split.csv")
        ).sample(frac=1, random_state=RANDOM_SEED)

        # Create training, validation and test
        # Divide into train (80%) and a held-out part (20%)
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            list(df.text_split), list(df.labels), test_size=.2, random_state=RANDOM_SEED
        )
        train_dataset = pd.DataFrame()
        train_dataset["text"] = train_texts
        train_dataset["labels"] = train_labels

        # Divide the held-out part evenly into test and validation
        test_texts, val_texts, test_labels, val_labels = train_test_split(
            val_texts, val_labels, test_size=.5, random_state=RANDOM_SEED
        )
        validation_dataset = pd.DataFrame()
        validation_dataset["text"] = val_texts
        validation_dataset["labels"] = val_labels

        test_dataset = pd.DataFrame()
        test_dataset["text"] = test_texts
        test_dataset["labels"] = test_labels

        # Transform into Hugging Face datasets; the label column is renamed
        # from 'labels' to 'label' after the conversion.
        train_dataset_raw = Dataset.from_pandas(train_dataset)
        train_dataset_raw = train_dataset_raw.rename_column('labels', 'label')

        validation_dataset_raw = Dataset.from_pandas(validation_dataset)
        validation_dataset_raw = validation_dataset_raw.rename_column('labels', 'label')

        test_dataset_raw = Dataset.from_pandas(test_dataset)
        test_dataset_raw = test_dataset_raw.rename_column('labels', 'label')

        # Tokenize the datasets
        training_dataset_tokenized = tokenize_data(train_dataset_raw)
        validation_dataset_tokenized = tokenize_data(validation_dataset_raw)
        test_dataset_tokenized = tokenize_data(test_dataset_raw)

        # Save the tokenized splits next to the model
        DIR_MODEL_DATASET = os.path.join(DIR_MODEL, "datasets")
        for split_name, split_dataset in (
            ("training", training_dataset_tokenized),
            ("validation", validation_dataset_tokenized),
            ("test", test_dataset_tokenized),
        ):
            split_dir = os.path.join(DIR_MODEL_DATASET, split_name)
            os.makedirs(split_dir, exist_ok=True)
            split_dataset.save_to_disk(split_dir)

        # NOTE(review): with evaluation every 500 steps, the smallest datasets
        # finish before the first evaluation (see the empty result tables in
        # the recorded output), so no "best" checkpoint exists for them.
        training_args = TrainingArguments(
            output_dir = DIR_MODEL,
            num_train_epochs=4,              # total number of training epochs
            evaluation_strategy="steps",
            logging_dir=os.path.join(OUTPUT_ROOT_PER_DECADE_TYPE, "logging"),
            load_best_model_at_end=True,
            seed=2020,
            #label_names=["label"], # check this
            disable_tqdm=False
        )

        # A fresh three-class model for every (decade, type) combination.
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

        # NOTE(review): EarlyStoppingCallback is imported by this cell but only
        # the per-type cell passes it to the Trainer - confirm that is intended.
        trainer = Trainer(
            model,
            training_args,
            train_dataset=training_dataset_tokenized,
            eval_dataset=validation_dataset_tokenized,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )

        trainer.train()

        # Save through the Trainer rather than through `model`: with
        # load_best_model_at_end=True the best weights live in the Trainer's
        # model, which - depending on the transformers release - is not
        # necessarily the same object as the local `model`.
        trainer.save_model(DIR_MODEL)

        # Predict on test
        #cm, report = predict_test(f"/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/{NAME}", test_dataset_tokenized)

        #print(cm)
        #print(report)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.8884,1.085119,4.0453,87.757
1000,0.3848,1.78309,4.9415,71.841


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.9028,1.088347,0.7485,351.37
1000,0.2361,1.995547,0.6983,376.61


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.918,0.794153,1.7134,350.756
1000,0.5754,0.929834,1.7201,349.395
1500,0.3512,1.404337,1.6603,361.977
2000,0.1833,1.936439,1.6539,363.378


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.5059,1.898302,0.4775,362.326


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.4635,0.853577,0.8259,360.832
1000,0.1472,1.26173,0.7954,374.649


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.9469,0.885142,4.0302,146.642
1000,0.6133,0.943495,3.8217,154.645
1500,0.3363,1.628051,3.2585,181.371
2000,0.2079,1.879899,3.1928,185.104


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.8777,0.890972,1.5031,359.928
1000,0.4933,0.953035,1.5207,355.754
1500,0.2664,1.348453,1.4757,366.599
2000,0.1642,1.629404,1.4624,369.932


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.7386,0.936921,1.8419,154.73
1000,0.2865,1.342966,3.3234,85.755


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.4582,1.350267,0.3726,359.596


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss


## Finetune: Only types (3 models)

In [None]:
%%time 

import os
import datasets
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback


DECADES = ["1960s", "1970s", "1980s", "1990s"]
TYPES = ["oil", "gas", "coal"]

# Folder that receives one sub-folder per type model.
OUTPUT_ROOT_PER_TYPE = "/home/leonardovida/data/volume_1/delphbert-results/6-finetuning-outputs/model-per-type"

# The collator only depends on the tokenizer, so it is built once for all runs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=MAX_LENGHT)

for TYPE in TYPES:
    NAME = f"{TYPE}"
    DIR_MODEL = os.path.join(OUTPUT_ROOT_PER_TYPE, NAME)

    # An existing output folder marks this type as already done and is
    # skipped. Only that case is swallowed: any other filesystem error now
    # surfaces instead of silently skipping the model. A run that fails later
    # leaves its folder behind and will be skipped on the next execution.
    try:
        os.makedirs(DIR_MODEL)
    except FileExistsError:
        continue

    # Load dataset: all decades of this type stacked into one frame
    df_list = []
    for DECADE in DECADES:
        df_list.append(pd.read_csv(os.path.join(DATA_DIR, f"{DECADE}_{TYPE}_merged_split.csv")))
    df = pd.concat(df_list)

    # Create training, validation and test
    # Divide into train (80%) and a held-out part (20%); the explicit seed
    # makes both splits repeatable regardless of cell run order.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        list(df.text_split), list(df.labels), test_size=.2, random_state=RANDOM_SEED
    )
    train_dataset = pd.DataFrame()
    train_dataset["text"] = train_texts
    train_dataset["labels"] = train_labels

    # Divide the held-out part evenly into test and validation
    test_texts, val_texts, test_labels, val_labels = train_test_split(
        val_texts, val_labels, test_size=.5, random_state=RANDOM_SEED
    )
    validation_dataset = pd.DataFrame()
    validation_dataset["text"] = val_texts
    validation_dataset["labels"] = val_labels

    test_dataset = pd.DataFrame()
    test_dataset["text"] = test_texts
    test_dataset["labels"] = test_labels

    # Transform into Hugging Face datasets; the label column is renamed from
    # 'labels' to 'label' after the conversion.
    train_dataset_raw = Dataset.from_pandas(train_dataset)
    train_dataset_raw = train_dataset_raw.rename_column('labels', 'label')

    validation_dataset_raw = Dataset.from_pandas(validation_dataset)
    validation_dataset_raw = validation_dataset_raw.rename_column('labels', 'label')

    test_dataset_raw = Dataset.from_pandas(test_dataset)
    test_dataset_raw = test_dataset_raw.rename_column('labels', 'label')

    # Tokenize the datasets
    training_dataset_tokenized = tokenize_data(train_dataset_raw)
    validation_dataset_tokenized = tokenize_data(validation_dataset_raw)
    test_dataset_tokenized = tokenize_data(test_dataset_raw)

    # Save the tokenized splits next to the model
    DIR_MODEL_DATASET = os.path.join(DIR_MODEL, "datasets")
    for split_name, split_dataset in (
        ("training", training_dataset_tokenized),
        ("validation", validation_dataset_tokenized),
        ("test", test_dataset_tokenized),
    ):
        split_dir = os.path.join(DIR_MODEL_DATASET, split_name)
        os.makedirs(split_dir, exist_ok=True)
        split_dataset.save_to_disk(split_dir)

    training_args = TrainingArguments(
        output_dir = DIR_MODEL,
        num_train_epochs=4,              # total number of training epochs
        evaluation_strategy="steps",
        logging_dir=os.path.join(OUTPUT_ROOT_PER_TYPE, "logging"),
        load_best_model_at_end=True,
        seed=2020,
        #label_names=["label"], # check this
        disable_tqdm=False
    )

    # A fresh three-class model for every type.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=training_dataset_tokenized,
        eval_dataset=validation_dataset_tokenized,
        data_collator=data_collator,
        tokenizer=tokenizer,
        # Passed with a patience of three evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    # Save through the Trainer rather than through `model`: with
    # load_best_model_at_end=True the best weights live in the Trainer's
    # model, which - depending on the transformers release - is not
    # necessarily the same object as the local `model`.
    trainer.save_model(DIR_MODEL)

    # Predict on test
    #cm, report = predict_test(DIR_MODEL, test_dataset_tokenized)
    #print(cm)
    #print(report)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.978,0.93705,3.2214,373.132
1000,0.9247,0.987756,3.2292,372.233
1500,0.7248,1.115691,3.2358,371.468
2000,0.6222,1.070755,3.2907,365.276


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
500,0.9395,0.886555,2.7334,379.388
