In [None]:
# Install the Hugging Face libraries imported further down in this notebook.
# NOTE(review): versions are not pinned — consider pinning them so the run stays reproducible.
%pip install transformers datasets

# You may need to run this line if you receive an error in cell [18]
#%pip install accelerate -U

In [2]:
# We ran this notebook on Google Colab. On Colab, uncomment the next two lines to mount
# Google Drive; when running locally, leave them commented out to avoid an import error.

#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp scripts/Widman_Wichl_training.py .

# commented out the following two lines which are unnecessary if data folder is already in this directory (which it wasn't prior to putting this on Github)
#!mkdir data
#!cp data/issues_tv_fb_18_20.csv data/

!mkdir results
!mkdir logs
!mkdir models

In [None]:
#--------------------------------------------
# Run from here when restarting

In [4]:
import json
import pickle
import subprocess
import time
import os
import sys

import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.trainer_callback import EarlyStoppingCallback


# Make the helper modules in ./scripts importable.
# Notebook kernels (Jupyter, Colab) do not define __file__, so referencing it raises
# NameError there; fall back to the working directory in that case.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
scripts_dir = os.path.join(current_dir, 'scripts')
# Guard against piling up duplicate entries when the cell is re-run.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

import Widman_Wichl_training as tr

# Show floats with three decimal places in rendered DataFrames.
pd.options.display.precision = 3

In [5]:
!rm -rf models
!rm -rf results
!mkdir models
!mkdir results

In [6]:
#import time
#start_time = time.time()

### Settings

Experiments:
- Dropout of 0.2 (instead of the default 0.1) doesn't help with overfitting
- Weight decay of 0 (the default, whereas the paper uses 0.01) makes no difference
- google/electra-small-discriminator performs terribly compared to distilbert-base-uncased (0.09 average F1 score on the test set)
- Increasing the batch size only speeds up training a little
- Smaller models (e.g. ELECTRA) allow for larger batch sizes before running out of memory (OOM); RoBERTa, for example, doesn't work with a batch size of 32
- RoBERTa reaches a mean F1 of 0.477 -- so not quite as good as DistilBERT

In [7]:
# --- Reproducibility ---
SEED = 7  # shared by set_seed, both train_test_split calls and TrainingArguments
SIZE_VALIDATION_SET = 0.1  # fraction of the data reserved for validation

# --- Pretrained checkpoint to fine-tune ---
MODEL_NAME = "distilbert-base-uncased"

# --- Output locations ---
DIR_OUTPT = "./results"  # Trainer output_dir (checkpoints)
DIR_LOG = "./logs"  # Trainer logging_dir
DIR_TRAINED_MODEL = "./models/final_replication"  # parent folder for the saved final model

In [8]:
# Fix the random seed (transformers.set_seed) before any model weights are initialised.
set_seed(SEED)

## Prepare dataset

### Load data

In [9]:
# Load the complete labelled dataset; the cells below split it into train / validation / test.
df_train_validation_test = pd.read_csv('data/issues_tv_fb_18_20.csv')

In [10]:
# Hold out 10% of the rows as the test set; the rest is split again into
# training and validation sets further down. The fixed seed makes the split repeatable.
df_train_validation, df_test = train_test_split(
    df_train_validation_test, test_size=0.1, random_state=SEED
)

In [11]:
# Label columns are the ones whose name contains "ISSUE".
issue_cols = [
    column_name
    for column_name in df_train_validation.columns
    if 'ISSUE' in column_name
]

In [12]:
# Pack the per-issue label columns into a single list per row; this 'list' column
# is what is later passed as the label field when building the datasets.
df_train_validation['list'] = df_train_validation[issue_cols].values.tolist()

In [13]:
# Same packing of the per-issue label columns into one list per row, for the test split.
df_test['list'] = df_test[issue_cols].values.tolist()

In [14]:
# Carve the validation set out of the remaining data. The fraction comes from the
# SIZE_VALIDATION_SET setting (0.1) instead of a second hard-coded literal, so the
# configuration cell is the single place to change it.
df_train, df_validation = train_test_split(
    df_train_validation, test_size=SIZE_VALIDATION_SET, random_state=SEED
)

# Split sizes before rows with missing values are dropped.
print("Size of training set:\t", len(df_train))
print("Size of validation set:\t", len(df_validation))
print("Size of test set:\t", len(df_test))

Size of training set:	 22299
Size of validation set:	 2478
Size of test set:	 2754


In [15]:
# Discard every row that has a missing value in any column, for all three splits.
df_train, df_validation, df_test = (
    split.dropna(axis=0) for split in (df_train, df_validation, df_test)
)

In [16]:
# Report the split sizes again, now that rows with missing values are gone.
for caption, split in (
    ("Size of training set:\t", df_train),
    ("Size of validation set:\t", df_validation),
    ("Size of test set:\t", df_test),
):
    print(caption, len(split))

Size of training set:	 22297
Size of validation set:	 2478
Size of test set:	 2752


### Tokenize dataset

In [17]:
# Pretrained checkpoint with a classification head sized to one output per issue column.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(issue_cols)
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Names of the columns holding the input text and the packed label list.
field_text = "transcript"
field_label = "list"


def _prepare_split(frame):
    """Convert one DataFrame split into (HF Dataset, token encodings, tr.EmotionDataset)."""
    hf_split = Dataset.from_pandas(frame)
    # Truncate over-long texts and pad the rest so every sequence in the split has equal length.
    encodings = tokenizer(hf_split[field_text], truncation=True, padding=True)
    return hf_split, encodings, tr.EmotionDataset(encodings, hf_split[field_label])


dataset_train, train_encodings, train_dataset = _prepare_split(df_train)
dataset_validation, val_encodings, val_dataset = _prepare_split(df_validation)
dataset_test, test_encodings, test_dataset = _prepare_split(df_test)

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

## Train model

In [18]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per epoch, and
# load_best_model_at_end reloads the checkpoint with the lowest "f1_loss" when training ends.
# NOTE(review): "f1_loss" is presumably reported by tr.compute_metrics — confirm the key exists.
training_args = TrainingArguments(
    output_dir=DIR_OUTPT,  # output directory
    num_train_epochs=20,  # total # of training epochs (NOTE(review): the original note said "default is 4"; the Hugging Face default is 3 — confirm which default was meant)
    per_device_train_batch_size=20,  # batch size per device during training (NOTE(review): the original note said "default 32"; the Hugging Face default is 8) -- going from 16 to 25 only reduces training time a little (i.e. 1:35h instead of 1:40h), 32 shaves off another 2 minutes
    per_device_eval_batch_size=20,  # batch size for evaluation (NOTE(review): the original note said "default 32"; the Hugging Face default is 8)
    warmup_steps=250,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir=DIR_LOG,  # directory for storing logs
    seed=SEED,
    evaluation_strategy="epoch",  # evaluate on the validation set after every epoch
    save_strategy="epoch",  # save a checkpoint after every epoch (must match the evaluation strategy for load_best_model_at_end)
    load_best_model_at_end=True,
    metric_for_best_model="f1_loss",
    greater_is_better=False,  # lower f1_loss is better
    run_name=MODEL_NAME,
)

# Custom Trainer subclass and metric function from the project's training helper module.
trainer = tr.MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=tr.compute_metrics,
)

_ = trainer.train()
# The returned metrics are not displayed because this is not the last expression of the cell.
trainer.evaluate()

# Persist the model held by the trainer after training (config + weights).
trainer.model.save_pretrained(f"{DIR_TRAINED_MODEL}/{MODEL_NAME}/")

***** Running training *****
  Num examples = 22297
  Num Epochs = 20
  Instantaneous batch size per device = 20
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 1
  Total optimization steps = 22300


Epoch,Training Loss,Validation Loss,Loss,Accuracy Thresh,Runtime,Samples Per Second,Steps Per Second
1,0.1075,108.169534,0.089367,0.974117,21.5411,115.036,5.756
2,0.0825,89.692394,0.075943,0.97724,21.4562,115.491,5.779
3,0.0684,80.443796,0.074299,0.977848,21.4643,115.447,5.777
4,0.0576,72.007192,0.075387,0.977879,21.5116,115.194,5.764
5,0.0457,66.483899,0.078367,0.978227,21.5274,115.109,5.76
6,0.0368,64.993207,0.083588,0.978121,21.5197,115.151,5.762
7,0.0299,63.38513,0.091132,0.978295,21.5041,115.234,5.766
8,0.024,62.785955,0.098469,0.977904,21.5287,115.102,5.76
9,0.0197,61.203156,0.103204,0.977941,21.5072,115.217,5.766
10,0.0164,59.63805,0.107211,0.977451,21.5053,115.227,5.766


***** Running Evaluation *****
  Num examples = 2478
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1115
Configuration saved in ./results/checkpoint-1115/config.json
Model weights saved in ./results/checkpoint-1115/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2478
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-2230
Configuration saved in ./results/checkpoint-2230/config.json
Model weights saved in ./results/checkpoint-2230/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2478
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-3345
Configuration saved in ./results/checkpoint-3345/config.json
Model weights saved in ./results/checkpoint-3345/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2478
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-4460
Configuration saved in ./results/checkpoint-4460/config.json
Model weights saved in ./results/checkpoint-4460/pytorch_mo

Configuration saved in ./models/final_replication/distilbert-base-uncased/config.json
Model weights saved in ./models/final_replication/distilbert-base-uncased/pytorch_model.bin


In [33]:
!mkdir drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1

In [34]:
!cp ./models/final_replication/distilbert-base-uncased/config.json drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1/
!cp ./models/final_replication/distilbert-base-uncased/pytorch_model.bin drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1/

In [39]:
# Also save 10th rather than just last (20th) step
!mkdir drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1_step10
!cp ./results/checkpoint-11150/config.json drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1_step10/
!cp ./results/checkpoint-11150/pytorch_model.bin drive/Shared drives/Delta Lab/github/issue_classifier/models/multilabel_trf_v1_step10/

In [21]:
# Show which GPU the runtime provides and its current memory usage.
!nvidia-smi

Thu Aug 11 23:17:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    54W / 250W |   9287MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Evaluate model on test set

In [22]:
# Run the trainer's model over the test set; the raw predictions are post-processed below.
results_all = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 2752
  Batch size = 20


### Complete test set

In [23]:
# Compute the per-issue metrics once and reuse the result; the original called
# tr.compute_fine_metrics2 three times with identical arguments (once per column).
# assumes the helper returns the same values on repeated calls — TODO confirm
fine_metrics = tr.compute_fine_metrics2(results_all, issue_cols)

# One row per issue with its recall, precision and F1.
data = dict({"issue": issue_cols})
to_add = {
    "Recall": fine_metrics["recall"],
    "Precision": fine_metrics["precision"],
    "F1": fine_metrics["f1"],
}
df = pd.DataFrame.from_dict(dict(data, **to_add))

In [24]:
# Display the per-issue recall / precision / F1 table.
df

Unnamed: 0,issue,Recall,Precision,F1
0,ISSUE10,0.725,0.829,0.773
1,ISSUE11,0.500,0.720,0.590
2,ISSUE12,0.303,0.597,0.402
3,ISSUE13,0.189,0.269,0.222
4,ISSUE14,0.451,0.622,0.523
...,...,...,...,...
60,ISSUE208,0.716,0.710,0.713
61,ISSUE210,0.440,0.500,0.468
62,ISSUE212,0.458,0.733,0.564
63,ISSUE218,0.148,0.235,0.182


In [25]:
# Unweighted mean of the per-issue F1 scores (every issue counts equally).
df['F1'].mean()

0.5228737956439109

In [26]:
# to_csv does not create missing folders and no earlier cell creates 'performance',
# so make sure the directory exists before writing.
os.makedirs('performance', exist_ok=True)
df.to_csv('performance/performance_multilabel_trf_v1.csv')

In [27]:
# Turn the raw model outputs into per-label probabilities with a sigmoid.
predictions = torch.tensor(results_all.predictions)
probabilities = torch.sigmoid(predictions)
preds_full = probabilities.detach().cpu().numpy().tolist()

In [28]:
# A label counts as predicted when its probability is at least 0.5.
preds_bin = np.greater_equal(np.array(preds_full), 0.5)

In [29]:
# Wrap the boolean prediction matrix in a DataFrame (rows = test examples, columns = labels).
df_prds = pd.DataFrame(preds_bin)

In [30]:
# Name the prediction columns after the label columns that define the model's output order.
# The original positional slice df_train.columns[2:-1] only matches when exactly two
# non-label columns come first and 'list' is last; issue_cols is the list the label
# vectors were built from, so it stays correct if the column layout changes.
df_prds.columns = issue_cols

In [31]:
# Store the binary test-set predictions alongside the input data.
df_prds.to_csv("data/test_set_prds_multilabel_trf_v1.csv")

In [32]:
# Preserve the test set (after dropping rows with missing values) so the
# saved predictions can be checked against it later.
df_test.to_csv('data/test_set_multilabel_trf_v1.csv')