<a href="https://colab.research.google.com/github/Xelvise/NLP-compilation-with-HuggingFace/blob/main/Fine_tuning_bert_for_SMS_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers accelerate datasets --quiet

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the tab-separated SMS corpus; column names are supplied explicitly via `names`
sms_path = '/content/drive/MyDrive/Colab Notebooks/SMSSpamCollection.txt'
df = pd.read_csv(sms_path, sep='\t', names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# How many messages fall into each class (ham vs. spam)?
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
# Encode the string labels as binary integers (spam -> 1, everything else -> 0).
# Rows that are already encoded as 1 stay 1, so re-running this cell does not
# silently turn every label into 0.
is_spam = (df.label == 'spam') | (df.label == 1)
df.label = np.where(is_spam, 1, 0)
df.label.value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [None]:
# Set up a balanced dataset for supervised learning: the non-spam class (0)
# heavily outnumbers the spam class (1), so spam rows are re-sampled with
# replacement until both classes have the same number of rows.
# NOTE(review): this oversampling runs before the train/test split, so copies of
# the same spam message can end up in both splits and inflate the test metrics.
# Consider oversampling only the training portion.
df_nonspam = df[df.label==0]
df_spam = df[df.label==1]

# Match the majority-class size instead of hard-coding it, and seed the draw so
# the balanced frame is identical on every run.
df_spam = df_spam.sample(n=len(df_nonspam), replace=True, random_state=0)
df = pd.concat([df_spam, df_nonspam], axis=0)

df.label.value_counts()

label
1    4825
0    4825
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Pull the message texts (inputs) and the integer labels (targets) out of the frame
x = df.message.tolist()
y = df.label.tolist()

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
from datasets import DatasetDict, Dataset, Features, Value

# Schema shared by both splits: raw message text plus its integer class label
features = Features({'text': Value('string'), 'label': Value('int32')})

# Build one Dataset per split from the (texts, labels) pairs produced by the split above
train_dataset, test_dataset = (
    Dataset.from_dict({'text': texts, 'label': labels}, features=features)
    for texts, labels in ((xtrain, ytrain), (xtest, ytest))
)

# Bundle the two splits under their conventional names
data = DatasetDict({'train': train_dataset, 'test': test_dataset})

data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7237
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2413
    })
})

In [None]:
# Name of the pretrained checkpoint; the matching tokenizer is loaded from the same name

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

- Tokenize entire text...

In [None]:
# tokenization function applied to each batch of examples

def tokenize(batch):
    """Tokenize the 'text' column of a batch with the notebook's tokenizer.

    `padding=True` pads every sequence to the longest one in the batch, and
    `truncation=True` cuts sequences that are too long for the model.
    Returns whatever the tokenizer returns for the batch.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# batched=True passes lists of examples to `tokenize` instead of one example at a time;
# batch_size=None treats each split as a single batch, so all rows of a split are padded together
encoded_data = data.map(function=tokenize, batched=True, batch_size=None)
encoded_data

Map:   0%|          | 0/7237 [00:00<?, ? examples/s]

Map:   0%|          | 0/2413 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7237
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2413
    })
})

In [None]:
from transformers import AutoModel
import torch

# Load the pretrained model without a task head and display its layer layout
base_model = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint)
base_model



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

##### Fine-tuning DistilBERT on the data (by attaching a classification head to the pretrained model's hidden states)

##### Instead of AutoModel, we use AutoModelForSequenceClassification, because it places a classification head on top of the pretrained model's outputs, and that head can be trained together with the base model.

In [None]:
from transformers import AutoModelForSequenceClassification

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint with a classification head sized for the two labels,
# then move the whole model onto the selected device
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# SequenceClassification is mainly for sentiment analysis or text classification tasks
# define hyperparameters for fine-tuning

from transformers import TrainingArguments

training_args = TrainingArguments(
    # NOTE(review): absolute path at the filesystem root -- needs write access there
    output_dir="/finetuned-tweet-classifier",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # Evaluate at the end of every epoch. Current transformers releases name this
    # keyword `eval_strategy`; the older `evaluation_strategy` spelling is rejected.
    eval_strategy='epoch',
    disable_tqdm=False      # keep the progress bars visible
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
# The metrics below compare the actual labels against the predicted labels

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    `pred.label_ids` holds the actual labels; the predicted label for each row
    is the argmax over the last axis of `pred.predictions`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

In [None]:
from transformers import Trainer

# Wire together the model, the hyperparameters, the metric function and the
# tokenized train/test splits.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['test'],
    # Current transformers releases take the tokenizer through `processing_class`;
    # the older `tokenizer=` keyword is deprecated.
    processing_class=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.034668,0.987982,0.98798


In [None]:
# Run inference on the tokenized test split (output in logits).
# The tokenized dataset is named `encoded_data`; `encoded_tweet` is never defined.
pred_output = trainer.predict(encoded_data['test'])
pred_output

NameError: name 'trainer' is not defined

In [None]:
# import transformers as trans
# trans.__version__

In [None]:
# !pip install datasets