In [5]:
# Import packages
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [4]:
# !pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp38-cp38-win_amd64.whl (8.3 MB)
     ---------------------------------------- 8.3/8.3 MB 4.3 MB/s eta 0:00:00
Collecting scipy>=1.3.2
  Downloading scipy-1.10.0-cp38-cp38-win_amd64.whl (42.2 MB)
     ---------------------------------------- 42.2/42.2 MB 4.2 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 4.6 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 scipy-1.10.0 threadpoolctl-3.1.0


In [2]:
# !pip install pandas

Collecting pandas
  Downloading pandas-1.5.3-cp38-cp38-win_amd64.whl (11.0 MB)
     --------------------------------------- 11.0/11.0 MB 11.3 MB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-1.5.3


In [6]:
# read the data
train = pd.read_csv(r"..\archive\SMS_train.csv", encoding='latin')
test = pd.read_csv(r"..\archive\SMS_test.csv", encoding='latin')

In [7]:
train

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam
...,...,...,...
952,953,hows my favourite person today? r u workin har...,Non-Spam
953,954,How much you got for cleaning,Non-Spam
954,955,Sorry da. I gone mad so many pending works wha...,Non-Spam
955,956,Wat time ü finish?,Non-Spam


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   S. No.        957 non-null    int64 
 1   Message_body  957 non-null    object
 2   Label         957 non-null    object
dtypes: int64(1), object(2)
memory usage: 22.6+ KB


In [9]:
train.Message_body = train.Message_body.astype('str')

In [10]:
# choosing a pretrained bert model
model_name = "distilbert-base-uncased"
max_length = 512

In [11]:
# get the unique labe names
label_names = list(set(train.Label))
label_names

['Spam', 'Non-Spam']

In [12]:
# label encoding the label names
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(label_names)

array([1, 0], dtype=int64)

In [13]:
# saving label encoder to regenerate
np.save('classes.npy', le.classes_)
le.classes_ = numpy.load('classes.npy')

NameError: name 'encoder' is not defined

In [10]:
train_label = le.transform(train.Label)

In [11]:
# train and validation split
train_X, val_X, train_y, val_y = train_test_split(train.Message_body, train_label, test_size=0.2, random_state=1234)

In [12]:
train_X

701             I'm really sorry I lit your hair on fire
362          I will reach ur home in  &lt;#&gt;  minutes
847    Aiyo... Her lesson so early... I'm still sleep...
330          Jay wants to work out first, how's 4 sound?
378    Our records indicate u maybe entitled to 5000 ...
                             ...                        
204    A £400 XMAS REWARD IS WAITING FOR YOU! Our com...
53     Miles and smiles r made frm same letters but d...
294                        Oh yeah clearly it's my fault
723                I need details about that online job.
815    Your weekly Cool-Mob tones are ready to downlo...
Name: Message_body, Length: 765, dtype: object

In [13]:
train_y

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [14]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [15]:
# trim if length greatere than max_length and pad with 0's when less than `max_length`
train_encodings = tokenizer(list(train_X), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(list(val_X), truncation=True, padding=True, max_length=max_length)

In [16]:
# make the encoding as torch dataset
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]]).type(torch.LongTensor)
        return item

    def __len__(self):
        return len(self.labels)

# convert tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_y)
val_dataset = NewsGroupsDataset(val_encodings, val_y)

In [17]:
# making our model
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_names)).to("cuda")

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [18]:
# set traing arguments
training_args = TrainingArguments(
    output_dir='../results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=50,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='../logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=100,               # log & save weights each logging_steps
    save_steps=100,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [19]:
# make the metrics function
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

In [20]:
# make the trainer instance
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [21]:
# train
trainer.train()

***** Running training *****
  Num examples = 765
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 288


Step,Training Loss,Validation Loss,Accuracy
100,0.2176,0.002232,1.0
200,0.0495,0.002388,1.0


***** Running Evaluation *****
  Num examples = 192
  Batch size = 16
Saving model checkpoint to ../results\checkpoint-100
Configuration saved in ../results\checkpoint-100\config.json
Model weights saved in ../results\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 192
  Batch size = 16
Saving model checkpoint to ../results\checkpoint-200
Configuration saved in ../results\checkpoint-200\config.json
Model weights saved in ../results\checkpoint-200\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../results\checkpoint-100 (score: 0.002231789054349065).


TrainOutput(global_step=288, training_loss=0.09703396219346258, metrics={'train_runtime': 90.0492, 'train_samples_per_second': 25.486, 'train_steps_per_second': 3.198, 'total_flos': 83128467164400.0, 'train_loss': 0.09703396219346258, 'epoch': 3.0})

In [22]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 192
  Batch size = 16


{'eval_loss': 0.002231789054349065,
 'eval_accuracy': 1.0,
 'eval_runtime': 1.1175,
 'eval_samples_per_second': 171.814,
 'eval_steps_per_second': 10.738,
 'epoch': 3.0}

In [23]:
# saving the fine tuned model & tokenizer
model_path = "..\\Classifier_distilBert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in ..\Classifier_distilBert\config.json
Model weights saved in ..\Classifier_distilBert\pytorch_model.bin
tokenizer config file saved in ..\Classifier_distilBert\tokenizer_config.json
Special tokens file saved in ..\Classifier_distilBert\special_tokens_map.json


('..\\Classifier_distilBert\\tokenizer_config.json',
 '..\\Classifier_distilBert\\special_tokens_map.json',
 '..\\Classifier_distilBert\\vocab.txt',
 '..\\Classifier_distilBert\\added_tokens.json',
 '..\\Classifier_distilBert\\tokenizer.json')