In [1]:
from Tokenizers import WordPiece

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset
from torchmetrics import MetricCollection
from torchmetrics.classification import Accuracy, Recall, Precision, F1Score
from torchinfo import summary

import pandas as pd

from timeit import default_timer as timer
from tqdm import tqdm
from math import ceil
from random import randint

## Setting Device Agnostic Code

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

## Loading the Dataset

In [3]:
df = pd.read_csv("C:\\Users\\jacob\\Documents\\Programming\\Theory\\Python\\AI\\Datasets\\NLP\\sms_spam.csv", encoding="latin-1")[["v1", "v2"]]
df.rename(columns={"v1": "Label", "v2": "Text"}, inplace=True)

df["Label"] = df["Label"].map({
    "ham": 0,
    "spam": 1
})

df

Unnamed: 0,Label,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
df["Label"].value_counts()

Label
0    4825
1     747
Name: count, dtype: int64

## Converting Dataset to Lower Case

In [5]:
df["Cl_Text"] = df["Text"].apply(lambda x: x.lower())

In [6]:
df

Unnamed: 0,Label,Text,Cl_Text
0,0,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,0,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s...","pity, * was in mood for that. so...any other s..."
5570,0,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i'd...


## Converting Dataset into List of Strings

In [7]:
corpus = list(df.Cl_Text)

print(len(corpus))
print(corpus[10])

5572
i'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? i've cried enough today.


## Creating Vocabulary

In [8]:
w = WordPiece(corpus=corpus, ntokens=1_000, cleaning=lambda text: text.lower())

In [9]:
w.fit()

Creating Vocabulary: 100%|██████████| 847/847 [00:54<00:00, 15.42it/s]


In [10]:
print(w.vocab_d)

{'[CLS]': 0, '[UNK]': 1, '[PAD]': 2, '[SEP]': 3, '!': 4, '#': 5, '##!': 6, '##!\x8eö´\x89ó_': 7, '##!\x8eö´\x89ó_?': 8, '##!\x8eö´\x89ó_??': 9, '##!\x8eö´\x89ó_??\x8bû¬': 10, '##"': 11, '###': 12, '###&': 13, '##$': 14, '##$7': 15, '##$70': 16, '##$700': 17, '##%': 18, '##&': 19, "##'": 20, '##(': 21, '##(å£': 22, '##)': 23, '##*': 24, '##**': 25, '##****': 26, '##******': 27, '##**********': 28, '##**************': 29, '##****7': 30, '##*å£1': 31, '##+': 32, '##+6*å£1': 33, '##+6+': 34, '##+å£1': 35, '##,': 36, '##-': 37, '##-$': 38, '##-$9': 39, '##-$90': 40, '##-$900': 41, '##-(': 42, '##-)': 43, '##-|': 44, '##-å£': 45, '##-å£5': 46, '##.': 47, '##/': 48, '##/~': 49, '##/ì¼': 50, '##/ì¼1': 51, '##0': 52, '##0-å£5': 53, '##01216+': 54, '##1': 55, '##1(å£': 56, '##11(å£': 57, '##1216+': 58, '##141701216+': 59, '##16+': 60, '##16+1': 61, '##16+å£1': 62, '##1701216+': 63, '##18+': 64, '##18+)': 65, '##2': 66, '##216+': 67, '##2228>>': 68, '##228>>': 69, '##2735=å£45': 70, '##28>>': 71,

In [11]:
vocab_size = len(w.vocab_l)

vocab_size

1004

## Encoding the Text

In [12]:
# Getting max text length for padding
max_len = df["Cl_Text"].str.len().max()

max_len

910

In [13]:
max_len += 1 # due to [SEP] at the end of each sentence

In [14]:
df["En_Text"] = df["Cl_Text"].apply(lambda x: [w.vocab_d["[CLS]"]] + w.encode(text=x, npad=max_len))

In [15]:
max_len += 1 # due to the CLS token

In [16]:
df

Unnamed: 0,Label,Text,Cl_Text,En_Text
0,0,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ...","[0, 936, 159, 3, 950, 158, 165, 153, 156, 3, 9..."
1,0,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...,"[0, 944, 155, 3, 941, 144, 163, 47, 47, 47, 3,..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[0, 935, 163, 149, 149, 3, 933, 158, 165, 163,..."
3,0,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...,"[0, 950, 3, 932, 166, 158, 3, 948, 144, 171, 3..."
4,0,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro...","[0, 943, 144, 152, 3, 938, 3, 932, 159, 158, 2..."
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,this is the 2nd time we have tried 2 contact u...,"[0, 949, 152, 153, 164, 3, 938, 164, 3, 949, 1..."
5568,0,Will Ì_ b going to esplanade fr home?,will ì_ b going to esplanade fr home?,"[0, 952, 153, 156, 156, 3, 1001, 3, 930, 3, 93..."
5569,0,"Pity, * was in mood for that. So...any other s...","pity, * was in mood for that. so...any other s...","[0, 945, 153, 165, 171, 36, 3, 252, 3, 952, 14..."
5570,0,The guy did some bitching but I acted like i'd...,the guy did some bitching but i acted like i'd...,"[0, 949, 152, 149, 3, 936, 166, 171, 3, 932, 1..."


In [17]:
print(df["En_Text"].str.len().max())
print(max_len)

912
912


## Converting DataFrame to Pytorch Dataset

In [18]:
class FinancialNewsDataset(Dataset):
    def __init__(self, dataframe, classes):
        super().__init__()

        self.samples = [(dataframe["En_Text"][i], dataframe["Label"][i]) for i in range(len(dataframe))]
        self.classes = classes
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.idx_to_classes = {i: c for i, c in enumerate(self.classes)}
    
    def __getitem__(self, index):
        if isinstance(index, slice):
            return [(torch.tensor(sample[0], dtype=torch.long), sample[1]) for sample in self.samples[index]] # List (Tuple (Tensor, Int) )
        return (torch.tensor(self.samples[index][0], dtype=torch.long), self.samples[index][1])               # Tuple (Tensor, Int)

    def __len__(self):
        return len(self.samples)

In [19]:
ds = FinancialNewsDataset(df, ["ham", "spam"])

print(len(ds))
print(ds[1:3])
print(ds[9])

5572
[(tensor([  0, 944, 155,   3, 941, 144, 163,  47,  47,  47,   3, 939, 159, 155,
        153, 158, 151,   3, 952, 153, 150,   3, 950,   3, 944, 158, 153,  47,
         47,  47,   3,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,

In [20]:
classes = ds.classes
class_to_idx = ds.class_to_idx
idx_to_classes = ds.idx_to_classes

class_to_idx

{'ham': 0, 'spam': 1}

## Splitting the Dataset into Training and Testing Sets

In [21]:
train_size = int(len(ds) * 0.95)
 
train_ds = ds[:train_size]
test_ds = ds[train_size:]

print(len(train_ds), len(test_ds))

5293 279


## Creating DataLoader Class

In [22]:
class Loader:
    def __init__(self, ds, batch_size, shuffle):
        self.batch_size = batch_size
        self.shuffle = shuffle

        self._dsx = [s[0].tolist() for s in ds] # contains the x-values (inputs) of the dataset | `tolist()` converts the x-tensor into Python List | List (List (Int) )
        self._dsy = [s[1] for s in ds]          # contains the y-values (targets) of the dataset | List (Int)

        if shuffle:
            self._temp_dsx = self._dsx.copy() 
            self._temp_dsy = self._dsy.copy()

    def __iter__(self):
        if self.shuffle:
            # Iterating over the number of batches that the dataset is going to bet split
            for _ in range(len(self._dsx) // self.batch_size):
                
                # This random index gives the index of the first sample for the batch
                ridx = randint(0, len(self._temp_dsx) - self.batch_size)

                yield (torch.tensor(self._temp_dsx[ridx: ridx + self.batch_size], dtype=torch.long), torch.tensor(self._temp_dsy[ridx: ridx + self.batch_size], dtype=torch.long))

                # Removing the already `yield`ed batch from the dataset
                self._temp_dsx = self._temp_dsx[:ridx] + self._temp_dsx[ridx + self.batch_size:]
                self._temp_dsy = self._temp_dsy[:ridx] + self._temp_dsy[ridx + self.batch_size:]

            # Returning the last batch, which is not going to contain `batch_size` samples
            if len(self._temp_dsx) > 0:
                yield (torch.tensor(self._temp_dsx, dtype=torch.long), torch.tensor(self._temp_dsy, dtype=torch.long))

            # If we try to iterate again over the loader without those two lines, no samples are going to be returned
            self._temp_dsx = self._dsx.copy()
            self._temp_dsy = self._dsy.copy()

        else:
            j = 0
            for _ in range(ceil(len(self._dsx) / self.batch_size)):
                yield (torch.tensor(self._dsx[j: j + self.batch_size], dtype=torch.long), torch.tensor(self._dsy[j: j + self.batch_size], dtype=torch.long))
                j += self.batch_size

    def __len__(self):
        return ceil(len(self._dsx) / self.batch_size)

## Creating the Cross Validation Function

In [23]:
def cross_validation(ds, valid_prop, batch_size):
    valid_size = int(len(ds) * valid_prop)
    ridx = randint(0, len(ds) - valid_size)

    return (Loader(ds[ridx: ridx + valid_size], batch_size=batch_size, shuffle=False), Loader(ds[:ridx] + ds[ridx + valid_size:], batch_size=batch_size, shuffle=True))

## Creating Model's Functions

In [24]:
class ModelUtils(nn.Module):
    def __init__(self):
        super().__init__()
    

    def __training_step(self, train_dl, opt, device):
        losses = torch.zeros(len(train_dl), device=device)
        for i, (x_train, y_train) in enumerate(train_dl):
            x_train, y_train = x_train.to(device), y_train.to(device)

            _, loss = self(x_train, y_train)
            losses[i] = loss.item()

            opt.zero_grad()
            loss.backward()
            opt.step()

        return losses.mean().item()


    @torch.inference_mode()
    def __validation_step(self, valid_dl, device):
        self.eval()
        losses = torch.zeros(len(valid_dl), device=device)
        for i, (x_train, y_train) in enumerate(valid_dl):
            x_train, y_train = x_train.to(device), y_train.to(device)

            _, loss = self(x_train, y_train)
            losses[i] = loss.item()

        self.train()
        return losses.mean().item()


    def fit(self, epochs, train_ds, opt):
        start_time = timer()
        device = next(self.parameters()).device
        train_losses, valid_losses = [], []

        t = tqdm(range(1, epochs + 1), desc="Training Model: ")
        t.set_postfix({"train_loss": "inf", "valid_loss": "inf"})
        for _ in t:
            valid_dl, train_dl = cross_validation(train_ds, valid_prop=0.2, batch_size=32)
        
            train_loss = self.__training_step(train_dl, opt, device)
            valid_loss = self.__validation_step(valid_dl, device)

            train_losses.append(train_loss)
            valid_losses.append(valid_loss)

            t.set_postfix({"train_loss": train_loss, "valid_loss": valid_loss})
            t.refresh()

        return {"model_train_loss": train_losses,
            "model_valid_loss": valid_losses,
            "model_name": self.__class__.__name__,
            "model_optimizer": opt.__class__.__name__,
            "model_device": device.type,
            "model_epochs": epochs,
            "model_time": timer() - start_time}


    @torch.inference_mode()
    def evaluate(self, dl):
        self.eval()

        device = next(self.parameters()).device
        metric_collection = MetricCollection([
            Accuracy(task="multiclass", num_classes=2, average="macro"),
            Precision(task="multiclass", num_classes=2, average="macro"),
            Recall(task="multiclass", num_classes=2, average="macro"),
            F1Score(task="multiclass", num_classes=2, average="macro")
        ]).to(device)
        losses = torch.zeros(len(dl))

        for i, (xb, yb) in enumerate(dl):
            xb, yb = xb.to(device), yb.to(device)

            logits, loss = self(xb, yb)
            preds = F.softmax(logits, dim=-1)

            metric_collection.update(preds[:, 0, :], yb)
            losses[i] = loss.item()
        
        res = metric_collection.compute()
        
        self.train()
        return losses.mean().item(), res["MulticlassAccuracy"].item(), res["MulticlassPrecision"].item(), res["MulticlassRecall"].item(), res["MulticlassF1Score"].item()

## Creating the Model

In [25]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, head_size, dropout):
        super().__init__()

        self.query = nn.Linear(embed_size, head_size, bias=False)
        self.key = nn.Linear(embed_size, head_size, bias=False)
        self.value = nn.Linear(embed_size, head_size, bias=False)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, _, C = x.shape

        q = self.query(x) # (B, T, head_size)
        k = self.key(x)   # (B, T, head_size)

        wei = q @ k.transpose(-2, -1) * (C**-0.5) # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x) # (B, T, head_size)

        out = wei @ v # (B, T, T) @ (B, T, head_size) --> (B, T, head_size)

        return out
    
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, embed_size, head_size, dropout):
        super().__init__()

        self.heads = nn.ModuleList([SelfAttention(embed_size, head_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(embed_size, embed_size, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Contatenate the outputs of each Masked Self-Attention
        out = torch.cat([head(x) for head in self.heads], dim=-1) # (B, T, EMBED_SIZE)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    def __init__(self, embed_size, scale_embeds, dropout):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(embed_size, scale_embeds * embed_size),
            nn.ReLU(),
            nn.Linear(scale_embeds * embed_size, embed_size),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x) # (B, T, EMBED_SIZE)
    
class Block(nn.Module): # combining Masked Multi-Head Attention and one Feed-Forward layer
    def __init__(self, embed_size, scale_embeds, num_heads, dropout):
        super().__init__()

        head_size = embed_size // num_heads # because the result of the Masked Multi-Head layer we want to have shape: (B, T, EMBED_SIZE)
        self.multi_att_m = MultiHeadAttention(num_heads, embed_size, head_size, dropout)
        self.ffwd = FeedForward(embed_size, scale_embeds, dropout)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x): 
        x = x + self.multi_att_m(self.ln1(x)) # (B, T, EMBED_SIZE)
        x = x + self.ffwd(self.ln2(x))        # (B, T, EMBED_SIZE)

        return x
    
class TransformerEncoder(ModelUtils):
    def __init__(self, embed_size, num_layers, scale_embeds, num_heads, dropout):
        super().__init__()

        self.embedding_table = nn.Embedding(vocab_size, embed_size)
        # Self-Attention doesn't take into consideration the position of tokens when computing the attetnion matrix, so we have to
        self.position_embedding_table = nn.Embedding(max_len, embed_size)
        self.block = nn.Sequential(*[Block(embed_size, scale_embeds, num_heads, dropout) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(embed_size)
        self.linear_head = nn.Linear(embed_size, 2)        

    def forward(self, idx, targets=None):
        token_embeddings = self.embedding_table(idx)                                              # (B, T, EMBED_SIZE)
        position_embeddings = self.position_embedding_table(torch.arange(max_len, device=device)) # (T, EMBED_SIZE)

        x = token_embeddings + position_embeddings # (B, T, EMBED_SIZE)

        x = self.block(x) # (B, T, EMBED_SIZE)
        x = self.ln_f(x)  # (B, T, EMBED_SIZE)

        logits = self.linear_head(x) # (B, T, 2)

        # Condition to seperate training and generating phase
        loss = F.cross_entropy(logits[:, 0, :], targets) if targets is not None else None

        return logits, loss

In [26]:
EMBED_SIZE = 100
NUM_LAYERS = 2
SCALE_EMBEDS = 1
NUM_HEADS = 2
DROPOUT = 0.1

model = TransformerEncoder(EMBED_SIZE, NUM_LAYERS, SCALE_EMBEDS, NUM_HEADS, DROPOUT).to(device)

summary(model=model,
        input_size=(1, max_len),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        dtypes=[torch.int64])

Layer (type:depth-idx)                        Input Shape               Output Shape              Param #                   Trainable
TransformerEncoder                            [1, 912]                  [1, 912, 2]               --                        True
├─Embedding: 1-1                              [1, 912]                  [1, 912, 100]             100,400                   True
├─Embedding: 1-2                              [912]                     [912, 100]                91,200                    True
├─Sequential: 1-3                             [1, 912, 100]             [1, 912, 100]             --                        True
│    └─Block: 2-1                             [1, 912, 100]             [1, 912, 100]             --                        True
│    │    └─LayerNorm: 3-1                    [1, 912, 100]             [1, 912, 100]             200                       True
│    │    └─MultiHeadAttention: 3-2           [1, 912, 100]             [1, 912, 100]       

## Training the Model

In [27]:
opt = optim.AdamW(model.parameters(), lr=1e-3)

EPOCHS = 5

res = model.fit(EPOCHS, train_ds, opt)

Training Model: 100%|██████████| 5/5 [13:24<00:00, 160.94s/it, train_loss=0.0714, valid_loss=0.0703]


In [28]:
model.evaluate(Loader(test_ds, batch_size=32, shuffle=False))

(0.07389478385448456,
 0.9178264141082764,
 0.9555172920227051,
 0.9178264141082764,
 0.9355807304382324)