<a href="https://colab.research.google.com/github/Sowmyad15/SMS_Spam/blob/main/BERT_MLP_Balanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel


In [None]:
# Load the raw SMS spam CSV from the Colab working directory.
# latin-1 is requested explicitly; presumably the file is not valid UTF-8 — TODO confirm.
df=pd.read_csv('/content/spam.csv',encoding='latin-1')
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:

# Discard the three placeholder columns, then name the two remaining ones positionally.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df.columns = ["label", "message"]

# Numeric target column: ham -> 0, spam -> 1.
df = df.assign(label_num=df["label"].map({"ham": 0, "spam": 1}))
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:

# Partition the messages by class so the two class sizes can be compared.
df_spam = df.loc[df['label'].eq('spam')]
df_ham = df.loc[df['label'].eq('ham')]

print("Ham Dataset Shape:", df_ham.shape)
print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 3)
Spam Dataset Shape: (747, 3)


# **Balanced Data: BERT + MLP**

In [None]:
# Downsample ham to the number of spam rows so both classes are equally represented.
# A fixed random_state makes the sampled subset (and every metric computed from it)
# repeatable across kernel restarts; without it each run trains on different ham rows.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(747, 3)

In [None]:

# Stack all spam rows on top of the downsampled ham rows to form the balanced frame.
# Note: this rebinds `df`, and the original row index labels are kept (no reset).
df= pd.concat([df_spam , df_ham_downsampled])

In [None]:
# Sanity check: per-class row counts after balancing.
df['label_num'].value_counts()

1    747
0    747
Name: label_num, dtype: int64

In [None]:

import spacy

# Load spaCy's small English pipeline; `preprocess` (defined next in this cell) calls it.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize what remains.

    The text is run through the module-level spaCy pipeline `nlp`; every token
    that is neither a stop word nor punctuation contributes its lemma.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
# Run the cleaner over every message and keep the result in a new column.
df['preprocessed_txt'] = df['message'].apply(preprocess)
df.head()

Unnamed: 0,label,message,label_num,preprocessed_txt
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s..."


In [None]:

import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")



In [None]:

# Load the pretrained 'bert-base-uncased' tokenizer and encoder via `from_pretrained`;
# the encoder is moved to `device` so its forward passes run there.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_bert_embeddings(texts, batch_size=32, max_length=128):
    """Encode texts with BERT and return one pooled embedding per text.

    Uses the module-level `tokenizer`, `model` and `device`.

    Parameters
    ----------
    texts : pandas.Series or iterable of str
        The texts to embed. A Series is converted with ``tolist()``.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass. Batching
        bounds peak memory instead of pushing the whole corpus through at once.
    max_length : int, default 128
        Every text is truncated and padded to exactly this many tokens.

    Returns
    -------
    torch.Tensor
        Tensor on `device` whose row ``i`` is the model's ``pooler_output``
        for ``texts[i]``. Empty input yields a tensor with zero rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = texts.tolist() if isinstance(texts, pd.Series) else list(texts)
    if not texts:
        # torch.cat cannot concatenate an empty list; return a zero-row result instead.
        return torch.empty((0, model.config.hidden_size), device=device)

    # Make sure dropout is disabled; this is a no-op if the model is already in eval mode.
    model.eval()

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
            # and calling the tokenizer directly replaces per-text `encode_plus`.
            encoding = tokenizer(
                texts[start:start + batch_size],
                max_length=max_length,
                truncation=True,
                add_special_tokens=True,
                padding='max_length',
                return_tensors='pt',
            )
            outputs = model(
                encoding['input_ids'].to(device),
                attention_mask=encoding['attention_mask'].to(device),
            )
            pooled_batches.append(outputs.pooler_output)

    return torch.cat(pooled_batches, dim=0)



In [None]:
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Width of the hidden layer.
    output_size : int
        Number of output units; each is squashed into (0, 1) by the sigmoid.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature vectors to sigmoid-activated outputs."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

In [None]:
# Embed every preprocessed message; the result holds one embedding row per message.
bert_embeddings = get_bert_embeddings(df['preprocessed_txt'])



In [None]:
# Peek at the embedding vector of the first message.
bert_embeddings[0]

tensor([-6.7605e-01, -4.1760e-01, -7.4975e-01,  4.9545e-01,  3.8311e-01,
        -1.5031e-01,  5.8715e-01,  3.0121e-01, -3.2861e-01, -9.9996e-01,
        -1.6286e-01,  5.3948e-01,  9.5133e-01,  2.2359e-01,  7.0963e-01,
        -3.5806e-01,  3.0650e-01, -4.8774e-01,  3.0125e-01,  2.8047e-01,
         5.6219e-01,  9.9985e-01,  2.0910e-01,  2.8976e-01,  3.9732e-01,
         6.8742e-01, -4.1467e-01,  7.6198e-01,  8.9191e-01,  5.6897e-01,
        -2.4566e-01,  2.9931e-01, -9.6557e-01, -2.4548e-01, -8.0565e-01,
        -9.8079e-01,  2.6141e-01, -5.1607e-01, -2.6123e-02, -3.2012e-02,
        -7.1511e-01,  3.9629e-01,  9.9987e-01, -1.1536e-01,  1.9850e-01,
        -2.3424e-01, -9.9998e-01,  1.3982e-01, -6.9615e-01,  5.3830e-01,
         3.2594e-01,  6.7014e-01,  2.0668e-01,  3.4313e-01,  3.3484e-01,
         2.9076e-01, -1.0411e-01,  3.5477e-02, -2.2749e-01, -5.9219e-01,
        -5.7355e-01,  5.1930e-01, -5.5766e-01, -7.8905e-01,  4.5395e-01,
         5.7246e-01, -2.4104e-01, -3.3214e-01, -2.9

In [None]:

# Hold out 20% of the balanced data for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    bert_embeddings, df['label_num'], test_size=0.2, random_state=42
)


In [None]:
# The split is taken from the BERT embedding tensor, so X_train is already a tensor;
# wrapping it in torch.tensor(...) copy-constructs it and emits a UserWarning.
# torch.as_tensor converts without the warning (and without a copy when the dtype
# already matches), and still accepts array-like input.
X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)

  X_train_tensor=torch.tensor(X_train,dtype=torch.float32)


In [None]:
# Training labels as a float32 tensor, matching the float outputs of the MLP
# they are compared against when the loss is computed.
y_train_tensor=torch.tensor(y_train.to_numpy(),dtype=torch.float32)

In [None]:
# Class balance of the held-out test split.
y_test.value_counts()

0    154
1    145
Name: label_num, dtype: int64

In [None]:
# The split is taken from the BERT embedding tensor, so X_test is already a tensor;
# torch.tensor(...) on a tensor emits a copy-construct UserWarning.
# torch.as_tensor converts without the warning (and without a copy when the dtype
# already matches), and still accepts array-like input.
X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)


  X_test_tensor=torch.tensor(X_test,dtype=torch.float32)


In [None]:
# Test labels as a float32 tensor, in the same form as the training labels.
y_test_tensor=torch.tensor(y_test.to_numpy(),dtype=torch.float32)

In [None]:
# Print the test labels to eyeball them.
print(y_test_tensor)

tensor([0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
        0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
        1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1.,
        0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0.,
        0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
        1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1.,
        0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1.,
        1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1.,
        1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1.,
        0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0.,
        0., 1., 1., 1., 0., 0., 1., 1., 

In [None]:
# Seed torch's RNG before building the network so the MLP's initial weights are the
# same on every "Restart & Run All"; unseeded, each run starts from different weights
# and reports different loss/accuracy.
torch.manual_seed(42)

input_size = 768   # width of each embedding vector fed to the MLP
hidden_size = 512
output_size = 1    # one sigmoid output per message

mlp_model = MLP(input_size, hidden_size, output_size)
# BCELoss takes probabilities, which is what the MLP's final sigmoid produces.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

In [None]:
# Move the loss module to `device`; the returned module is echoed as the cell output.
criterion.to(device)

BCELoss()

In [None]:
# Move the MLP's parameters to `device`; the returned module is echoed as the cell output.
mlp_model.to(device)

MLP(
  (fc1): Linear(in_features=768, out_features=512, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Put the training features and labels on the same device as the model.
X_train_tensor, y_train_tensor = X_train_tensor.to(device), y_train_tensor.to(device)

In [None]:
# Put the test features and labels on the same device as the model.
X_test_tensor, y_test_tensor = X_test_tensor.to(device), y_test_tensor.to(device)

In [None]:
# Full-batch training: every epoch is one optimizer step over the whole training set.
epochs = 20
mlp_model.train()  # loop-invariant, so set once before the loop
for epoch in range(epochs):
    optimizer.zero_grad()
    # The model and its inputs already live on `device`, so the outputs and the loss
    # do too; no extra .to(device) is needed.
    outputs = mlp_model(X_train_tensor)
    # squeeze(-1) removes only the trailing size-1 output axis. A bare squeeze() would
    # also drop the batch axis for a single-sample batch and break the shape match
    # with the targets.
    loss = criterion(outputs.squeeze(-1), y_train_tensor)
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

Epoch 1/20, Loss: 0.6897765398025513
Epoch 2/20, Loss: 0.5824240446090698
Epoch 3/20, Loss: 0.5672877430915833
Epoch 4/20, Loss: 0.5843825936317444
Epoch 5/20, Loss: 0.5243632197380066
Epoch 6/20, Loss: 0.5354791283607483
Epoch 7/20, Loss: 0.5017170906066895
Epoch 8/20, Loss: 0.48158684372901917
Epoch 9/20, Loss: 0.4799061119556427
Epoch 10/20, Loss: 0.44873932003974915
Epoch 11/20, Loss: 0.43854257464408875
Epoch 12/20, Loss: 0.43217411637306213
Epoch 13/20, Loss: 0.4087136387825012
Epoch 14/20, Loss: 0.4019804000854492
Epoch 15/20, Loss: 0.39505672454833984
Epoch 16/20, Loss: 0.37629130482673645
Epoch 17/20, Loss: 0.371151328086853
Epoch 18/20, Loss: 0.36277079582214355
Epoch 19/20, Loss: 0.3476073741912842
Epoch 20/20, Loss: 0.3431721329689026


In [None]:
 # CPU-side NumPy array of the test labels, used as ground truth by accuracy_score below.
 y_test_numpy = y_test_tensor.cpu().numpy()

In [None]:
# Evaluate on the held-out split with gradients disabled.
mlp_model.eval()
with torch.no_grad():
    test_outputs = mlp_model(X_test_tensor)
    # Threshold the sigmoid outputs at 0.5. squeeze(-1) removes only the trailing
    # size-1 output axis, so a one-sample test set still yields a 1-D prediction vector.
    predictions = (test_outputs.squeeze(-1) >= 0.5).cpu().float()
    # Hand scikit-learn a NumPy array rather than relying on implicit tensor conversion.
    accuracy = accuracy_score(y_test_numpy, predictions.numpy())
    print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 86.96%
