In [None]:
!pip install datasets
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from datasets import load_metric, Dataset
from sklearn.model_selection import train_test_split
from google.colab import drive
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification, AutoTokenizer
# Mount Google Drive at /content/drive; the dataset CSV is read from it and the
# fine-tuned model/tokenizer are written back to it in later cells.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load model and tokenizer

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased"

# Tokenizer first, then the sequence-classification model built on the same
# checkpoint. The classifier layers are newly initialised (see the warning this
# cell prints), so the model has to be fine-tuned before it is useful.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

# Load and preprocess Data

In [None]:
# Location of the SARC CSV on the mounted Drive.
path = "/content/drive/MyDrive/Colab Notebooks/data/SARC_filtered_40K.csv"

# Read the file and discard every row that has a missing value in any column.
data = pd.read_csv(path, encoding='utf-8').dropna()

# Keep only the input text and its target.
df = data[["comment", "label"]]

# dropna() leaves gaps in the index; without preserve_index=False that index is
# exported as an extra `__index_level_0__` column in the Dataset.
ds = Dataset.from_pandas(df, preserve_index=False)

In [None]:
def tokenize(examples):
    """Tokenize the ``comment`` field of a batch with the notebook-level tokenizer.

    Truncation is enabled; padding is not requested here.

    Args:
        examples: Batch mapping that provides the texts under the key ``'comment'``.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    return tokenizer(examples['comment'], truncation=True)

# Tokenize the whole dataset in batches.
tokenized_ds = ds.map(tokenize, batched=True)

# Hold out 20% for evaluation. The seed is fixed so that the train/eval
# partition -- and therefore the reported accuracy -- is the same on every run.
split_tokenized_ds = tokenized_ds.train_test_split(test_size=0.2, seed=42)

  0%|          | 0/37 [00:00<?, ?ba/s]

# Prepare Trainer

In [None]:
def compute_metrics(eval_preds):
    """Return the classification accuracy for one evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` along the last axis.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions that equal the corresponding label, as a Python float.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so that the metric does not have
    # to be re-loaded through `load_metric` on every evaluation.
    accuracy = np.mean(predictions == np.asarray(labels))
    return {"accuracy": float(accuracy)}

In [None]:
# Training configuration: 4 epochs, batches of 64 per device for both training
# and evaluation. Evaluation and checkpointing both happen once per epoch, and
# `load_best_model_at_end` is switched on.
training_args = TrainingArguments(
    output_dir="distilbert_SARC",
    num_train_epochs=4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Collator built around the notebook's tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Wire everything together: model, configuration, the two dataset splits, and
# the helpers for batching and metric computation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_tokenized_ds["train"],
    eval_dataset=split_tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train Model

In [None]:
# Run fine-tuning; the returned TrainOutput is kept and shown as the cell result.
train_output = trainer.train()
train_output

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment. If __index_level_0__, comment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 29358
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1377


Step,Training Loss,Validation Loss,Accuracy
250,No log,0.482258,0.77248
500,0.507900,0.498521,0.78297
750,0.507900,0.474416,0.786512
1000,0.373900,0.535792,0.783924
1250,0.373900,0.5355,0.78188


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment. If __index_level_0__, comment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7340
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment. If __index_level_0__, comment are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7340
  Batch size = 64
Saving model checkpoint to distilbert_SARC/checkpoint-500
Configuration saved in distilbert_SARC/checkpoint-500/config.json
Model weights saved in distilbert_SARC/checkpoint-500/pytorch_model.bin
tokenizer config file saved in d

TrainOutput(global_step=1377, training_loss=0.3920303488781942, metrics={'train_runtime': 379.3938, 'train_samples_per_second': 232.144, 'train_steps_per_second': 3.629, 'total_flos': 1238583113703000.0, 'train_loss': 0.3920303488781942, 'epoch': 3.0})

# Save model and tokenizer

In [None]:
# Write the model and its tokenizer into the same Drive folder so that they can
# be reloaded together from that one directory.
model_save_name = 'Distilbert_SARC'
save_dir = f"/content/drive/MyDrive/Colab Notebooks/saved_models/{model_save_name}"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Configuration saved in /content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/special_tokens_map.json


('/content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/saved_models/Distilbert_SARC/tokenizer.json')