In [87]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel

In [88]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Data Load

In [89]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
df.Category=df.Category.map({'ham':0, 'spam':1})
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [91]:
df.Category.value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [92]:
df_spam = df[df.Category==1]
df_spam.shape

(747, 2)

In [93]:
df_ham = df[df.Category==0].sample(1000)
df_ham.shape

(1000, 2)

In [94]:
df_new = pd.concat([df_spam, df_ham])
df_new.Category.value_counts()

Category
0    1000
1     747
Name: count, dtype: int64

## Train_Test Spilit

In [95]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(df_new.Message, df_new.Category, test_size=0.2, random_state=5)

In [96]:
y_train.value_counts()

Category
0    789
1    608
Name: count, dtype: int64

In [97]:
y_test.value_counts()

Category
0    211
1    139
Name: count, dtype: int64

In [98]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenizer_fun(texts, labels):
    encodings = tokenizer(texts, padding = 'max_length', max_length=128, truncation=True, return_tensors='pt')
    return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels, dtype=torch.float)

tokenizer_fun(["You win $100000", "Good Morning"], [1,0])

(tensor([[ 101, 2017, 2663, 1002, 6694, 8889,  102,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0],
         [ 101, 2204, 2851,  102,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,

In [99]:
print(f"X:train\n {x_train.head(5)}\n")
print(f"X:train\n {y_train.head(5)}")

X:train
 3642    You can stop further club tones by replying "S...
2011    Dunno lei... I thk mum lazy to go out... I nev...
1155    Did u find a sitter for kaitlyn? I was sick an...
19      England v Macedonia - dont miss the goals/team...
3208            This phone has the weirdest auto correct.
Name: Message, dtype: object

X:train
 3642    1
2011    0
1155    0
19      1
3208    0
Name: Category, dtype: int64


In [100]:
# Convert DataFrame to list
x_train.values.tolist()[:2]# Get the first two rows  

['You can stop further club tones by replying "STOP MIX" See my-tone.com/enjoy. html for terms. Club tones cost GBP4.50/week. MFL, PO Box 1146 MK45 2WT (2/3)',
 'Dunno lei... I thk mum lazy to go out... I neva ask her yet...']

In [101]:
train_input_id, train_attention_mask, train_labels = tokenizer_fun(x_train.values.tolist(), y_train.values.tolist())
infer_input_id, infer_attention_mask, infer_labels = tokenizer_fun(x_test.values.tolist(), y_test.values.tolist())

## TensorDataset & DataLoader

In [102]:
from torch.utils.data import TensorDataset
# Create TensorDataset
train_dataset = TensorDataset(train_input_id, train_attention_mask, train_labels)
test_dataset = TensorDataset(infer_input_id, infer_attention_mask, infer_labels)

# Wrap with DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [103]:
bert = BertModel.from_pretrained('bert-base-uncased')
hidden_size = bert.config.hidden_size

## Define Class and Parameters

In [104]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = bert

        for para in self.bert.parameters():
            para.requires_grad = False #freeze all BERT layers
    
        self.classifier = nn.Sequential(
            nn.Linear(bert.config.hidden_size,256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256,1),
            nn.Sigmoid()
        )
    def forward(self, input_id, attention_mask):
        bert_output = self.bert(input_ids = input_id, attention_mask = attention_mask)
        sentence_embedding = bert_output.last_hidden_state[:,0,:]
        return self.classifier(sentence_embedding)

In [105]:
model = SentimentClassifier()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

model = model.to(device)
criterion = criterion.to(device)

# Training Part

In [106]:
epochs = 3
# Traning Loops

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch, (input_ids, attention_mask,lables) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        lables = lables.to(device, dtype = torch.float)
                

        #Backward and optimize
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).squeeze()
        loss= criterion(outputs, lables)
        loss.backward()
        optimizer.step()

        print(f"Batch: {batch}, Epoch: {epoch}, Loss:  {loss.item():0.2f}")
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}')

Batch: 0, Epoch: 0, Loss:  0.69
Batch: 1, Epoch: 0, Loss:  0.65
Batch: 2, Epoch: 0, Loss:  0.58
Batch: 3, Epoch: 0, Loss:  0.52
Batch: 4, Epoch: 0, Loss:  0.48
Batch: 5, Epoch: 0, Loss:  0.45
Batch: 6, Epoch: 0, Loss:  0.38
Batch: 7, Epoch: 0, Loss:  0.36
Batch: 8, Epoch: 0, Loss:  0.36
Batch: 9, Epoch: 0, Loss:  0.31
Batch: 10, Epoch: 0, Loss:  0.28
Batch: 11, Epoch: 0, Loss:  0.33
Batch: 12, Epoch: 0, Loss:  0.22
Batch: 13, Epoch: 0, Loss:  0.23
Batch: 14, Epoch: 0, Loss:  0.24
Batch: 15, Epoch: 0, Loss:  0.14
Batch: 16, Epoch: 0, Loss:  0.18
Batch: 17, Epoch: 0, Loss:  0.16
Batch: 18, Epoch: 0, Loss:  0.14
Batch: 19, Epoch: 0, Loss:  0.18
Batch: 20, Epoch: 0, Loss:  0.12
Batch: 21, Epoch: 0, Loss:  0.11
Epoch 1/3, Training Loss: 0.3229006454348564
Batch: 0, Epoch: 1, Loss:  0.09
Batch: 1, Epoch: 1, Loss:  0.06
Batch: 2, Epoch: 1, Loss:  0.13
Batch: 3, Epoch: 1, Loss:  0.08
Batch: 4, Epoch: 1, Loss:  0.15
Batch: 5, Epoch: 1, Loss:  0.11
Batch: 6, Epoch: 1, Loss:  0.11
Batch: 7, Epoch

# Testing Part

In [107]:
# Evaluation
model.eval()
total_val_loss = 0
correct_prediction = 0

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device, dtype = torch.float)

        outputs = model(input_ids, attention_mask).squeeze()
        loss= criterion(outputs, labels.view_as(outputs)) 
        total_val_loss+=loss.item()

        preds = (outputs > 0.5).float()
        correct_prediction += torch.sum(preds == labels) 

avg_test_loss = total_val_loss / len(test_loader)
val_accuracy = correct_prediction.double() / len(test_dataset) 
print(f"Validation loss: {avg_test_loss}, Validation accuracy: {val_accuracy:4f}")  

Validation loss: 0.07798121310770512, Validation accuracy: 0.971429


# Testing with Real World Data

In [108]:
def prediction(model, text, max_length=128):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    # Tokenize input text
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask).squeeze()
        prediction = (output > 0.5).float().item()
        return 'spam' if prediction == 1 else 'ham'

In [109]:
prediction(model, " you win a lottery! click here")

'spam'

In [110]:
prediction(model, " We will meet for a talk")

'ham'

In [111]:
prediction(model, " congratulation! you got cash prize  of $1000000000 ")

'spam'

In [112]:
prediction(model, " You want to make new frined, click our websites ")

'spam'

In [113]:
prediction(model, " If you want to come upstaires you have to click lift button ")

'spam'