## Install library we need

In [4]:
!pip install transformers datasets evaluate huggingface_hub
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m

Collecting accelerate
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.33.0
    Uninstalling accelerate-0.33.0:
      Successfully uninstalled accelerate-0.33.0
Successfully installed accelerate-0.34.2


## Connect to google drive for read the dataset

In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


## Read the dataset

In [2]:
import pandas as pd
df = pd.read_csv("/content/gdrive/MyDrive/SMS Classifier/spam.csv", encoding='ISO-8859-1')
df = df[['v1', 'v2']]

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
mapping_category = {
    'ham' : 0,
    'spam' : 1
}

df['v1'] = df['v1'].apply(lambda x: mapping_category.get(x, -1))

In [4]:
## clean data

import string
import nltk

# make sure to download  nltk punkt
nltk.download('punkt')

# remove duplicate data
df = df.drop_duplicates(keep='first')

# Function to remove punuacttion
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the 'text' column
#to remove punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`.)
df['v2'] = df['v2'].apply(remove_punctuation)

# lowercase
df['v2'] = df['v2'].str.lower()

# Display the DataFrame
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].apply(remove_punctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].str.lower()


Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [5]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

X = df['v2']
y = df['v1']

# Split our data to train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create Dataframe
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# actually is optional, just for get how many data we use
print(f"Training set shape: {train_df.shape}")
print(f"Validation set shape: {test_df.shape}")

# convert our pandas dataframe to transformers dataframe
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

Training set shape: (4135, 2)
Validation set shape: (1034, 2)


In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

# Load the pre-trained BERT model and tokenizer
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize our feature data with tokenizer from BERT
def preprocess_function(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

bert_train = train_dataset.map(lambda x: preprocess_function(x, bert_tokenizer), batched=True)
bert_test = test_dataset.map(lambda x: preprocess_function(x, bert_tokenizer), batched=True)

# Define metrics that you want model to show it on evaluation process
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4135 [00:00<?, ? examples/s]

Map:   0%|          | 0/1034 [00:00<?, ? examples/s]

In [8]:
# To know how many parameters in BERT.
total_params = sum(p.numel() for p in bert_model.parameters())
print(f"Total parameters: {total_params}")

# To know how many parameters in BERT that will be trained.
trainable_params = sum(p.numel() for p in bert_model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {trainable_params}")

Total parameters: 109483778
Total trainable parameters: 109483778


In [None]:
# for getting model architecture
for name, module in bert_model.named_modules():
    print(name)

In [None]:
def train_model(model, tokenizer, train_dataset, val_dataset, model_name):
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=10,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    return trainer

In [None]:
bert_trainer = train_model(bert_model, bert_tokenizer, bert_train, bert_test, "bert_full")



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0643,0.061854,0.989362,0.989273
2,0.0359,0.105906,0.985493,0.985112
3,0.0127,0.072785,0.990329,0.990231
4,0.0052,0.168906,0.985493,0.985112
5,0.0086,0.087373,0.991296,0.991193
6,0.0,0.072772,0.992263,0.992158
7,0.0055,0.070194,0.99323,0.99315
8,0.0027,0.067029,0.992263,0.992185
9,0.0052,0.061384,0.99323,0.993174
10,0.0025,0.068305,0.992263,0.992212


In [None]:
bert_results = bert_trainer.evaluate(bert_test)
print("BERT Test Results:", bert_results)

BERT Test Results: {'eval_loss': 0.061384156346321106, 'eval_accuracy': 0.9932301740812379, 'eval_f1': 0.9931736651685741, 'eval_runtime': 30.0261, 'eval_samples_per_second': 34.437, 'eval_steps_per_second': 4.33, 'epoch': 10.0}


In [None]:
bert_trainer.save_model("./fine_tuned_bert_full")
bert_tokenizer.save_pretrained("./fine_tuned_bert_full")

('./fine_tuned_bert_full/tokenizer_config.json',
 './fine_tuned_bert_full/special_tokens_map.json',
 './fine_tuned_bert_full/vocab.txt',
 './fine_tuned_bert_full/added_tokens.json',
 './fine_tuned_bert_full/tokenizer.json')

In [None]:
import shutil
import os

# copy the model and tokenizer from colab to our drive
source_path = './fine_tuned_bert_full'
destination_path = '/content/gdrive/MyDrive/SMS Classifier/Fine-tuning_BERT'

# create the destination_path folder if it's not exist
if not os.path.exists(destination_path):
    os.makedirs(destination_path)

# copy and paste all the file
for item in os.listdir(source_path):
    s = os.path.join(source_path, item)
    d = os.path.join(destination_path, item)
    if os.path.isdir(s):
        shutil.copytree(s, d, dirs_exist_ok=True)
    else:
        shutil.copy2(s, d)

print("all file already in Google Drive")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load model
model_path = '/content/gdrive/MyDrive/SMS Classifier/Fine-tuning_BERT'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def predict_spam(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    prediction = torch.argmax(probabilities, dim=-1).item()

    return "Spam" if prediction == 1 else "Not Spam", probabilities[0][prediction].item()

# Example data for testing
test_texts = [
    "Congrats for your trophy bro",
    "do you want to get a beer bro",
    "you win 20000 Euro, to get that click link below"
]

for text in test_texts:
    label, confidence = predict_spam(text)
    print(f"Text: {text}")
    print(f"Prediction : {label}")
    print(f"Confidence Score : {confidence:.2f}")
    print("---")

Text: Congrats for your trophy bro
Prediction : Not Spam
Confidence Score : 1.00
---
Text: do you want to get a beer bro
Prediction : Not Spam
Confidence Score : 1.00
---
Text: you win 20000 Euro, to get that click link below
Prediction : Spam
Confidence Score : 1.00
---
