In [1]:
!pip install opendatasets
!pip install transformers
!pip install sentencepiece
!pip install transformers[torch]

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [2]:
import opendatasets as op
op.download('https://www.kaggle.com/datasets/datatattle/email-classification-nlp', force = True)

Downloading email-classification-nlp.zip to ./email-classification-nlp


100%|██████████| 46.8k/46.8k [00:00<00:00, 26.0MB/s]







In [3]:
import torch
import transformers
import pandas

In [4]:
df = pandas.read_csv('/content/email-classification-nlp/SMS_train.csv',encoding='latin-1')
df = df.drop(columns = ['S. No.'])
df.head(5)

Unnamed: 0,Message_body,Label
0,Rofl. Its true to its name,Non-Spam
1,The guy did some bitching but I acted like i'd...,Non-Spam
2,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,Will ü b going to esplanade fr home?,Non-Spam
4,This is the 2nd time we have tried 2 contact u...,Spam


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Message_body  957 non-null    object
 1   Label         957 non-null    object
dtypes: object(2)
memory usage: 15.1+ KB


In [6]:
df['Label'].value_counts()

Non-Spam    835
Spam        122
Name: Label, dtype: int64

In [7]:
import random
num = 835 - 122
non_spam = df[df['Label'] == 'Non-Spam'].index.tolist()
drop_index = []
while len(drop_index) < num:
  n = random.randint(0, len(non_spam) - 1)
  drop_index.append(non_spam[n])
  del non_spam[n]

In [8]:
df = df.drop(index=drop_index)

In [9]:
df['Label'].value_counts()

Spam        122
Non-Spam    122
Name: Label, dtype: int64

In [10]:
class Email_Dataset(torch.utils.data.Dataset):
  def __init__(self,df):
    self.df = df
    self.label2id = {label : id for id, label in enumerate(df['Label'].unique())}
    self.data = self._get_data()

  def _get_data(self):
    data = []
    for row in self.df.iloc:
      email = row['Message_body']
      label = row['Label']
      label = self.label2id[label]
      label = [1. if x == label else 0. for x in range(2)]
      data.append((email,label))
    return data

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    email, label = self.data[idx]
    return email, label

In [11]:
dataset = Email_Dataset(df[:int(3*len(df)/4)])
val_dataset = Email_Dataset(df[int(3*len(df)/4):])

In [12]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size= 16, shuffle = True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size= 16, shuffle = True)

In [13]:
class Classification_Model(transformers.DistilBertForSequenceClassification):
  def __init__(self):
    pretrained_model = transformers.DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    config = pretrained_model.config
    super(Classification_Model, self).__init__(config)
    state_dict = pretrained_model.state_dict()
    self.tokenizer  = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    self.load_state_dict(state_dict)
    for p in self.distilbert.parameters():
      p.requires_grad = False
    for p in self.pre_classifier.parameters():
      p.requires_grad = False
    self.softmax = torch.nn.Softmax(dim = -1)

  def forward(self,X):
    o = super(Classification_Model, self).forward(**X)
    o = self.softmax(o.logits)
    return o

In [14]:
model = Classification_Model()

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [16]:
model

Classification_Model(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin

In [26]:
num

11

In [17]:
flat = 0
min_val_loss = float('inf')
for epoch in range(5):
  total_loss = 0
  for num, (email, label) in enumerate(dataloader):
    input = model.tokenizer(email, truncation= True, padding = True, return_tensors = 'pt')
    target = torch.cat((label[0].unsqueeze(0), label[1].unsqueeze(0)), dim = 0).permute(1,0).to(torch.float32)
    o = model(input)

    loss = criterion(o,target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if num%10 == 0:
      print(f'epoch: {epoch},iter: {num}, flat: {flat}')


epoch: 0,iter: 0, flat: 0
epoch: 0,iter: 10, flat: 0
epoch: 1,iter: 0, flat: 0
epoch: 1,iter: 10, flat: 0
epoch: 2,iter: 0, flat: 0
epoch: 2,iter: 10, flat: 0
epoch: 3,iter: 0, flat: 0
epoch: 3,iter: 10, flat: 0
epoch: 4,iter: 0, flat: 0
epoch: 4,iter: 10, flat: 0


In [29]:
df = pandas.read_csv('/content/email-classification-nlp/SMS_test.csv',encoding='latin-1')

In [30]:
df['Label'].value_counts()

Spam        76
Non-Spam    49
Name: Label, dtype: int64

In [20]:
val_dataset = Email_Dataset(df)

In [31]:
c = 0
pred = []
target = []
for email, label in val_dataset:
  c += 1
  inputs = model.tokenizer(email, return_tensors="pt")
  with torch.no_grad():
      logits = model(inputs)

  predicted_class_id = logits.argmax().item()
  pred.append(predicted_class_id)
  target.append(label)
  # if c == 50:
  #   break

In [27]:
torch.save({'model' : model.state_dict(),
            'optimizer' : optimizer.state_dict()}, '/content/drive/MyDrive/Project/spam_classification.pth')

In [32]:
confusion_matrix = [[0 for _ in range(2)] for _ in range(2)]
for p, t in zip(pred, target):
  t = t.index(1.0)
  confusion_matrix[p][t] += 1

In [33]:
print(confusion_matrix)

[[74, 3], [2, 46]]


In [28]:
(75. + 45.)/(75. + 45. + 5.)

0.96