# SPAM or HAM
---

Here I made a model for classifying whether the context given is spam or ham.

The model used here is a trnsformer model from HuggingFace. The model will be used from the 'transformer' library.

Here after loading the model from HuggingFace I did some fine tuning.

---
The Dataset used here is taken from Kaggle and used google colab for writing code.

The dataset originally contains 8000 rows but as it keeps crashing due to RAM limitation in colab I only selected 2000 rows.

---
I tried to use TFTrainer module which can help the model support tensorflow but as for some reason I couldn't import the package, So I ran it as it is.


In [None]:
#!pip install transformers

###Import the dataset and preprocess the the data

In [None]:
import tensorflow as tf
import pandas as pd

In [None]:
df=pd.read_csv('/content/combined_data.csv',nrows=2000)

In [None]:
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [None]:
df.shape

(2000, 2)

In [None]:
df.isna().sum()

label    0
text     0
dtype: int64

In [None]:
# make both dependent and independent column a list object
X=list(df['text'])
y=list(df['label'])

In [None]:
# split the datain to training and testing dataset
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# load the Tokenizer and model from HuggingFace

from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_path='lxyuan/distilbert-base-multilingual-cased-sentiments-student'
tokenizer=AutoTokenizer.from_pretrained(model_path)
model=AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
model

In [None]:
model=model.to('cuda')

In [None]:
X_train_encoded=tokenizer(X_train,padding=True,truncation=True,max_length=256)
X_test_encoded=tokenizer(X_test,padding=True,truncation=True,max_length=256)

In [None]:
# convert the dataset so it can be fine tuned with the model

import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
train_dataset=Dataset(X_train_encoded,y_train)
test_dataset=Dataset(X_test_encoded,y_test)

### Model creation and training

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
pip install transformers[torch]


In [None]:
!pip install --upgrade accelerate

In [None]:
# set args parameter value for Trainer instance
args = TrainingArguments(
    output_dir='./result',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [None]:
# Create the tTrainer instance
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss
10,1.5848
20,1.4473
30,1.2536
40,1.0994
50,0.9123
60,0.7668
70,0.7285
80,0.6613
90,0.6368
100,0.5587


TrainOutput(global_step=100, training_loss=0.9649577569961548, metrics={'train_runtime': 80.2094, 'train_samples_per_second': 39.896, 'train_steps_per_second': 1.247, 'total_flos': 211951617638400.0, 'train_loss': 0.9649577569961548, 'epoch': 2.0})

### Evaluation and prediction

In [None]:
trainer.evaluate()

{'eval_loss': 0.5175684094429016,
 'eval_runtime': 3.1794,
 'eval_samples_per_second': 125.809,
 'eval_steps_per_second': 15.726,
 'epoch': 2.0}

In [None]:
# predict the testing data
prediction=trainer.predict(test_dataset)[1]

In [None]:
prediction

array([1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,

In [None]:
from sklearn.metrics import confusion_matrix

cf_mat=confusion_matrix(y_test,prediction)

In [None]:
cf_mat

array([[175,   0],
       [  0, 225]])

###Saving the model

In [None]:
trainer.save_model('classifier')

In [None]:
model=trainer.model
model.save_pretrained("my_sentiment_model")


In [None]:
# for loading the model
'''
from transformers import TFAutoModelForSequenceClassification

# Load the saved model
loaded_model = TFAutoModelForSequenceClassification.from_pretrained("my_sentiment_model")
'''

'\nfrom transformers import TFAutoModelForSequenceClassification\n\n# Load the saved model\nloaded_model = TFAutoModelForSequenceClassification.from_pretrained("my_sentiment_model")\n'