In [70]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from openpyxl.styles.builtins import output
from torch.nn import CrossEntropyLoss, BCELoss
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from torchvision import datasets
from torchvision.transforms import ToTensor
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torchvision
import torchvision.transforms as transforms
from transformers import BertTokenizer, BertModel

# Read spam.csv (resolved relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="spam.csv")

In [71]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [72]:
# Print the (rows, columns) pair, then display the first five records.
print((len(df), len(df.columns)))
df.head(n=5)

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [73]:
# Count how many rows carry each Category value.
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [74]:
# Encode the string labels as integers: spam -> 1, ham -> 0.
#
# Series.map replaces every value that is missing from the mapping with NaN,
# so re-running this cell on the already-encoded column would erase all labels.
# The dtype guard makes the cell safe to execute more than once.
label_to_id = {'spam': 1, 'ham': 0}
if not pd.api.types.is_numeric_dtype(df['Category']):
    encoded = df['Category'].map(label_to_id)
    if encoded.isna().any():
        # Fail loudly instead of silently feeding NaN labels to the model.
        unknown = sorted(set(df.loc[encoded.isna(), 'Category'].astype(str)))
        raise ValueError(f"Unexpected Category labels: {unknown}")
    df['Category'] = encoded.astype('int64')
df['Category'].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [75]:
# Display the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [76]:
# Inputs are the Message column, targets are the Category column.
X, y = df['Message'], df['Category']

In [77]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [78]:
def tokenize_function(texts, labels, tokenizer=None, max_length=128):
    """Tokenize texts and return them together with their labels as tensors.

    Args:
        texts: A single string or an iterable of strings to encode.
        labels: A sized sequence holding one label per text, in the same
            order. It is handed to ``torch.tensor`` unchanged.
        tokenizer: Optional tokenizer to call. When omitted, the
            ``bert-base-uncased`` tokenizer is loaded on the first call and
            reused afterwards instead of being reloaded every time.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of tensors.

    Raises:
        ValueError: If the number of labels differs from the number of texts.
    """
    # Normalise the input so pandas Series, tuples and single strings all work.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if len(labels) != len(texts):
        raise ValueError(
            f"Got {len(texts)} texts but {len(labels)} labels; they must match one-to-one."
        )

    if tokenizer is None:
        # Cache the tokenizer on the function object: loading it is slow and
        # this function is called once per data split.
        tokenizer = getattr(tokenize_function, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            tokenize_function._tokenizer = tokenizer

    encoded_dict = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask'], torch.tensor(labels)
# Smoke test: one text paired with exactly one label.
tokenize_function(['hello, may I help you with this?'], [1])

(tensor([[ 101, 7592, 1010, 2089, 1045, 2393, 2017, 2007, 2023, 1029,  102,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [105]:
# Show the first five training messages as a plain Python list.
X_train.head().tolist()

['Reply to win £100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
 'Hello. Sort of out in town already. That . So dont rush home, I am eating nachos. Will let you know eta.',
 'How come guoyang go n tell her? Then u told her?',
 'Hey sathya till now we dint meet not even a single time then how can i saw the situation sathya.',
 'Orange brings you ringtones from all time Chart Heroes, with a free hit each week! Go to Ringtones & Pics on wap. To stop receiving these tips reply STOP.']

In [80]:
# Encode both splits; each pandas Series is converted to a plain list first.
train_inputs_ids, train_attention_masks, train_labels = tokenize_function(
    X_train.tolist(),
    y_train.tolist(),
)
test_inputs_ids, test_attention_masks, test_labels = tokenize_function(
    X_test.tolist(),
    y_test.tolist(),
)

In [81]:
# Bundle ids, masks and labels so that every dataset item is an aligned triple.
train_tensor = TensorDataset(
    train_inputs_ids,
    train_attention_masks,
    train_labels,
)
test_tensor = TensorDataset(
    test_inputs_ids,
    test_attention_masks,
    test_labels,
)

In [82]:
# Batches of 64: training batches are reshuffled on every pass, validation
# batches keep the dataset order.
train_loader = DataLoader(dataset=train_tensor, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=test_tensor, batch_size=64, shuffle=False)

In [83]:
# Load the pretrained encoder and display the width of its hidden states.
bert_model = BertModel.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)
bert_model.config.hidden_size

768

### Define training model

In [84]:
class BertClassifier(nn.Module):
    """Binary text classifier: a BERT encoder followed by a small MLP head.

    The head turns the encoder's hidden state at the first token position
    (the [CLS] token for BERT tokenizers) into one sigmoid probability per
    input sequence.

    Args:
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``
            when no encoder is supplied.
        hidden_dim: Width of the head's hidden layer.
        dropout: Dropout probability applied inside the head.
        freeze_bert: When true, the encoder's parameters are excluded from
            training and the encoder is kept in eval mode.
        bert: Optional, already constructed encoder to use instead of loading
            one. It must expose ``config.hidden_size`` and return an object
            with ``last_hidden_state``. Note that freezing modifies this
            module in place.
    """

    def __init__(self, pretrained_name='bert-base-uncased', hidden_dim=256,
                 dropout=0.3, freeze_bert=True, bert=None):
        super().__init__()
        self.bert = bert if bert is not None else BertModel.from_pretrained(pretrained_name)
        self._freeze_bert = freeze_bert
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
            self.bert.eval()

        # Size the first layer from the encoder config instead of assuming 768.
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def train(self, mode=True):
        """Switch train/eval mode while keeping a frozen encoder in eval mode.

        A frozen encoder acts as a fixed feature extractor; leaving its dropout
        layers active would only add noise to the features the head learns from.
        """
        super().train(mode)
        if self._freeze_bert:
            self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return one probability per sequence, shaped ``(batch, 1)``."""
        if self._freeze_bert:
            # No gradients are needed for frozen weights, so skip autograd
            # bookkeeping for the encoder pass.
            with torch.no_grad():
                bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        else:
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding = bert_output.last_hidden_state[:, 0, :]
        return self.classifier(sentence_embedding)


In [85]:
# Build the classifier on the selected device, score its sigmoid outputs with
# binary cross-entropy, and optimise with Adam.
model = BertClassifier()
model = model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [89]:
# Train for a fixed number of epochs, logging the loss of every batch and the
# mean batch loss of every epoch.
model.train()
epochs = 2
for epoch in range(epochs):
    total_loss = 0.0
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # BCELoss expects floating-point targets.
        labels = labels.to(device, dtype=torch.float32)

        optimizer.zero_grad()
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # drop the batch dimension of a single-sample batch, leaving the
        # predictions with a different shape than the labels.
        outputs = model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f'Batch: {batch_idx+1}, epoch : {epoch+1}, loss :{loss.item():.4f}')

    # The running sum is divided by the number of batches, so the value shown
    # is the mean batch loss rather than a total.
    print(f'Epoch {epoch+1}/{epochs} , mean loss : {total_loss/len(train_loader)}')

Batch: 1, epoch : 1, loss :0.3716
Batch: 2, epoch : 1, loss :0.3762
Batch: 3, epoch : 1, loss :0.2660
Batch: 4, epoch : 1, loss :0.2616
Batch: 5, epoch : 1, loss :0.4247
Batch: 6, epoch : 1, loss :0.3286
Batch: 7, epoch : 1, loss :0.2416
Batch: 8, epoch : 1, loss :0.2449
Batch: 9, epoch : 1, loss :0.2108
Batch: 10, epoch : 1, loss :0.2330
Batch: 11, epoch : 1, loss :0.1832
Batch: 12, epoch : 1, loss :0.1641
Batch: 13, epoch : 1, loss :0.1727
Batch: 14, epoch : 1, loss :0.1699
Batch: 15, epoch : 1, loss :0.1124
Batch: 16, epoch : 1, loss :0.1108
Batch: 17, epoch : 1, loss :0.0901
Batch: 18, epoch : 1, loss :0.1578
Batch: 19, epoch : 1, loss :0.1497
Batch: 20, epoch : 1, loss :0.1649
Batch: 21, epoch : 1, loss :0.1988
Batch: 22, epoch : 1, loss :0.0656
Batch: 23, epoch : 1, loss :0.0398
Batch: 24, epoch : 1, loss :0.1034
Batch: 25, epoch : 1, loss :0.1579
Batch: 26, epoch : 1, loss :0.1451
Batch: 27, epoch : 1, loss :0.1155
Batch: 28, epoch : 1, loss :0.0773
Batch: 29, epoch : 1, loss :0

In [94]:
# Measure accuracy on the held-out split. The loop variables (input_ids,
# attention_mask, outputs) keep their names because they remain visible to
# later cells after the loop ends.
model.eval()
correct = 0
with torch.no_grad():
    for input_ids, attention_mask, labels in val_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device, dtype=torch.float32)

        # squeeze(-1) keeps the batch dimension even for a single-sample batch,
        # so the comparison with labels below stays element-wise.
        outputs = model(input_ids, attention_mask).squeeze(-1)
        pred = (outputs > 0.5).float()
        # Accumulate a plain Python int instead of a tensor.
        correct += (pred == labels).sum().item()

accuracy = correct * 100 / len(test_tensor)
print(f'Accuracy : {accuracy:.4f}')

Accuracy : 98.2960


In [95]:
# Print whatever `outputs` currently holds. Presumably this is the prediction
# tensor assigned on the final iteration of the evaluation loop; confirm the
# cells were executed in that order.
print(outputs)

tensor([1.6059e-03, 2.1905e-02, 1.0662e-02, 9.9504e-01, 4.5692e-05, 3.0903e-03,
        9.7651e-01, 2.9923e-04, 3.3844e-04, 2.6123e-03, 2.5721e-04, 6.6899e-04,
        1.9802e-02, 1.1549e-05, 9.8920e-01, 4.8912e-04, 6.9856e-04, 9.9554e-01,
        4.0966e-04, 9.5337e-01, 4.3740e-05, 2.6031e-04, 1.2137e-05, 7.0402e-07,
        4.6834e-04, 1.5824e-03, 5.2284e-04])


In [98]:
# Show the raw model output (before any squeeze) for the current input_ids /
# attention_mask. This is inference only, so run it under no_grad to avoid
# building an autograd graph.
with torch.no_grad():
    print(model(input_ids, attention_mask))

tensor([[1.6059e-03],
        [2.1905e-02],
        [1.0662e-02],
        [9.9504e-01],
        [4.5692e-05],
        [3.0903e-03],
        [9.7651e-01],
        [2.9923e-04],
        [3.3844e-04],
        [2.6123e-03],
        [2.5721e-04],
        [6.6899e-04],
        [1.9802e-02],
        [1.1549e-05],
        [9.8920e-01],
        [4.8912e-04],
        [6.9856e-04],
        [9.9554e-01],
        [4.0966e-04],
        [9.5337e-01],
        [4.3740e-05],
        [2.6031e-04],
        [1.2137e-05],
        [7.0402e-07],
        [4.6834e-04],
        [1.5824e-03],
        [5.2284e-04]], grad_fn=<SigmoidBackward0>)


In [104]:
def get_text_detection(model, text, tokenizer=None, threshold=0.5):
    """Classify a single message as ``'spam'`` or ``'ham'``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns exactly one probability for the given text. It is switched
            to eval mode and left in that mode.
        text: The message to classify.
        tokenizer: Optional tokenizer to call. When omitted, the
            ``bert-base-uncased`` tokenizer is loaded on the first call and
            reused afterwards instead of being reloaded every time.
        threshold: Probability above which the message is labelled spam.

    Returns:
        ``'spam'`` if the predicted probability exceeds ``threshold``,
        otherwise ``'ham'``.
    """
    if tokenizer is None:
        # Cache the tokenizer on the function object; loading it is slow.
        tokenizer = getattr(get_text_detection, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            get_text_detection._tokenizer = tokenizer

    encode = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Send the inputs to wherever the model's weights live rather than relying
    # on a global device variable that may be out of sync with the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    input_ids = encode['input_ids'].to(target_device)
    attention_mask = encode['attention_mask'].to(target_device)
    with torch.no_grad():
        # item() requires a single-element output, i.e. exactly one text.
        spam_probability = model(input_ids, attention_mask).item()
    return 'spam' if spam_probability > threshold else 'ham'

# Quick manual check with an ordinary greeting.
get_text_detection(model, text="hello, how are you?")

'ham'