In [1]:
import pandas as pd
import numpy as np
import transformers
from datasets import Dataset, DatasetDict

In [2]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_dataset =Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [5]:
new_dataset = train_dataset.train_test_split(test_size = 0.2)

In [6]:
from transformers import AutoTokenizer


model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
new = new_dataset.rename_columns({"target":"label"})

In [8]:
def input_ids(batch):
    return {"input_ids":tokenizer.encode(batch["text"])}
    

In [9]:
new = new.map(input_ids)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSequenceClassification

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(output_dir=model_name,
 num_train_epochs=3,
 learning_rate=2e-5,
 per_device_train_batch_size=32,
 per_device_eval_batch_size=32,
 weight_decay=0.01,
 evaluation_strategy="epoch",
 disable_tqdm=False,
 push_to_hub=False,
 log_level="error",
 report_to = "none")



In [13]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


In [14]:
trainer = Trainer(model=model, args=training_args,
 compute_metrics=compute_metrics,
 train_dataset=new["train"],
 eval_dataset=new["test"],
 tokenizer=tokenizer)
trainer.train();




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.392675,0.83979,0.839483
2,No log,0.391341,0.842416,0.841906
3,No log,0.404911,0.838477,0.83844




In [14]:
test_new = test_dataset.map(input_ids)
def predict(batch):
    tensor = torch.tensor(batch["input_ids"]).to(device)
    tensor = tensor.unsqueeze(0)
    outputs = model(tensor)
    
    return {"label": outputs.logits.argmax(-1).to(device)}
test_new = test_new.map(predict)
dicts = {"id": test["id"].values,"target":np.array(test_new["label"])[:,0]}
predictions = pd.DataFrame(dicts)
predictions.to_csv("submission.csv", index = False)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [15]:
class DistillTrainingArguments(TrainingArguments):
    def __init__(self, alpha = 0.4, T = 1.7, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.T = T


In [16]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

class DistillTrainer(Trainer):
    def __init__(self, teacher_model = None, *args, **kargs):
        super().__init__(*args, **kargs)
        self.teacher_model = teacher_model
    def compute_loss(self, model, inputs, return_outputs = False):
        outputs_stu = model(**inputs)
        # Extract cross-entropy loss and logits from student
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits
        # Extract logits from teacher
        with torch.no_grad():
            outputs_tea = self.teacher_model(**inputs)
            logits_tea = outputs_tea.logits
        # Soften probabilities and compute distillation loss
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        loss_kd = self.args.T ** 2 * loss_fct(
        F.log_softmax(logits_stu / self.args.T,
        dim=-1),
        F.softmax(logits_tea / self.args.T, dim=-1))
        # Return weighted student loss
        loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
        
        return (loss, outputs_stu) if return_outputs else loss

  

In [22]:
student_ckpt = "distilbert-base-uncased"


In [17]:
def model_init():
    return  AutoModelForSequenceClassification.from_pretrained(student_ckpt, num_labels = 2).to(device)

In [23]:
from bayes_opt import BayesianOptimization



def black_box_function(alpha, T):
    student_training_args = DistillTrainingArguments(
         output_dir="new", evaluation_strategy = "epoch",
         num_train_epochs=5, learning_rate=2e-5,
         per_device_train_batch_size=32,
         per_device_eval_batch_size=32,
         weight_decay=0.01,
         push_to_hub=False,
        disable_tqdm=False,
        alpha = alpha,
        T = T,
         log_level="error",
         report_to = "none")

    distilbert_trainer = DistillTrainer(model_init=model_init,
        teacher_model=model, args=student_training_args,
        train_dataset=new['train'],
        eval_dataset=new['test'],
        compute_metrics=compute_metrics, 
        tokenizer=tokenizer)

    distilbert_trainer.train()
    return distilbert_trainer.evaluate()["eval_accuracy"]

In [24]:
pbounds = {'alpha': (0.01, 0.9), 'T': (1, 3)}

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=21,
)

In [25]:
optimizer.maximize(
    init_points=2,
    n_iter=4,
)

|   iter    |  target   |     T     |   alpha   |
-------------------------------------------------




config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.164032,0.814183,0.815408
2,No log,0.129942,0.833224,0.833503
3,No log,0.12554,0.83979,0.839524
4,No log,0.125678,0.84176,0.841396
5,No log,0.129697,0.838477,0.838617




| [30m1         | [30m0.8385    | [30m1.097     | [30m0.2673    |




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.09679,0.814839,0.816
2,No log,0.059633,0.831254,0.831874
3,No log,0.045829,0.840446,0.8405
4,No log,0.044085,0.843729,0.843585
5,No log,0.045776,0.835194,0.835388




| [30m2         | [30m0.8352    | [30m2.442     | [30m0.02924   |




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.259629,0.812869,0.814036
2,No log,0.219488,0.83585,0.835993
3,No log,0.213147,0.848326,0.847642
4,No log,0.218762,0.83979,0.839524
5,No log,0.225204,0.834537,0.834537




| [30m3         | [30m0.8345    | [30m1.828     | [30m0.4935    |




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.165043,0.814839,0.816055
2,No log,0.131379,0.83388,0.834175
3,No log,0.126987,0.840446,0.840161
4,No log,0.127169,0.83979,0.839483
5,No log,0.131195,0.838477,0.838617




| [30m4         | [30m0.8385    | [30m1.096     | [30m0.2712    |




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.372831,0.824032,0.82458
2,No log,0.350039,0.842416,0.841949
3,No log,0.357976,0.84176,0.841047
4,No log,0.392122,0.829941,0.82996
5,No log,0.397029,0.834537,0.834384




| [30m5         | [30m0.8345    | [30m1.0       | [30m0.9       |




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.378634,0.822062,0.822659
2,No log,0.357024,0.840446,0.840161
3,No log,0.359157,0.84176,0.841047
4,No log,0.390769,0.830598,0.830559
5,No log,0.395663,0.835194,0.835099




| [30m6         | [30m0.8352    | [30m3.0       | [30m0.9       |


In [26]:
print(optimizer.max)

{'target': 0.8384766907419566, 'params': {'T': 1.0974497616182546, 'alpha': 0.267307597212937}}


In [28]:
student_training_args_optimized = DistillTrainingArguments(
         output_dir="new", evaluation_strategy = "epoch",
         num_train_epochs=5, learning_rate=2e-5,
         per_device_train_batch_size=32,
         per_device_eval_batch_size=32,
         weight_decay=0.01,
         push_to_hub=False,
        disable_tqdm=False,
        alpha = 0.26730759,
        T = 1.09744,
         log_level="error",
         report_to = "none")




In [30]:
student_model = AutoModelForSequenceClassification.from_pretrained(student_ckpt, num_labels = 2).to(device)

In [31]:
distilbert_trainer_optimized = DistillTrainer(model=student_model,
 teacher_model=model, args=student_training_args_optimized,
 train_dataset=new['train'],
eval_dataset=new['test'],
 compute_metrics=compute_metrics, 
 tokenizer=tokenizer)

distilbert_trainer_optimized.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.153502,0.819435,0.82042
2,No log,0.131333,0.834537,0.834908
3,No log,0.12562,0.844386,0.843774
4,No log,0.1254,0.845043,0.844625
5,No log,0.12981,0.839133,0.839041




TrainOutput(global_step=480, training_loss=0.13355814615885417, metrics={'train_runtime': 255.438, 'train_samples_per_second': 119.207, 'train_steps_per_second': 1.879, 'total_flos': 480563262531288.0, 'train_loss': 0.13355814615885417, 'epoch': 5.0})

In [32]:
test_new = test_dataset.map(input_ids)

def predict(batch):
    tensor = torch.tensor(batch["input_ids"]).to(device)
    tensor = tensor.unsqueeze(0)
    outputs = student_model(tensor)
    
    return {"label": outputs.logits.argmax(-1).to(device)}

test_new = test_new.map(predict)
dicts = {"id": test["id"].values,"target":np.array(test_new["label"])[:,0]}
predictions = pd.DataFrame(dicts)
predictions.to_csv("distillsubmission.csv", index = False)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [34]:
predictions.to_csv("distillsubmission.csv", index = False)