In [149]:
from torch.utils.data import Dataset
import pandas as pd

class SpamDataset(Dataset):
    def __init__(self, path):
        self.path = path
        df = pd.read_csv(self.path, encoding='ISO-8859-1')
        df = df.dropna()
        self.features = df["features"].values
        self.labels = df["labels"].values

    def __getitem__(self, index):
        feature = self.features[index]
        label = self.labels[index]
        return feature, label
    
    def __len__(self):
        return len(self.labels)

In [150]:
dataset = SpamDataset('spam_new.csv')
for feature, label in dataset:
    print(feature, label)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat 0
ok lar joke wif u oni 0
free entri wkli comp win fa cup final tkt st may text fa receiv entri questionstd txt ratetc appli over 1
u dun say earli hor u c alreadi say 0
nah dont think goe usf live around though 0
freemsg hey darl week word back id like fun still tb ok xxx std chg send Ã¥ rcv 1
even brother like speak treat like aid patent 0
per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun 0
winner valu network custom select receivea Ã¥ prize reward claim call claim code kl valid hour 1
mobil month u r entitl updat latest colour mobil camera free call mobil updat co free 1
im gonna home soon dont want talk stuff anymor tonight k ive cri enough today 0
six chanc win cash pound txt csh send cost pday day tsandc appli repli hl info 1
urgent week free membership Ã¥ prize jackpot txt word claim tc wwwdbuknet lccltd pobox ldnwarw 1
ive search right word thank b

In [151]:
from torch.utils.data.dataset import random_split

train_dataset, test_dataset, valid_dataset = random_split(list(dataset), [4175, 1113, 278])

print(len(train_dataset))
print(len(test_dataset))
print(len(valid_dataset))

4175
1113
278


In [152]:
from collections import Counter

token_counts = Counter()

for feature, label in train_dataset:
    token_counts.update(feature.split())

print('Vocab-size:', len(token_counts))


Vocab-size: 6085


In [153]:
from torchtext.vocab import vocab
from collections import OrderedDict

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

In [154]:
print([vocab[token] for token in ['sex', 'free', 'call']])

[564, 9, 3]


In [155]:
text_pipeline = lambda x: [vocab[token] for token in x.split()]

In [156]:
import torch.nn as nn
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, feature_list, lengths = [], [], []
    for _feature, _label in batch:
        label_list.append(_label)
        processed_feature = torch.tensor(text_pipeline(_feature), dtype=torch.int64)
        feature_list.append(processed_feature)
        lengths.append(processed_feature.size(0))
    label_list = torch.tensor(label_list, dtype=torch.float)
    lengths = torch.tensor(lengths, dtype=torch.float)
    padded_text_list = nn.utils.rnn.pad_sequence(feature_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [157]:
feature_batch, label_batch, length_batch = next(iter(dataloader))
print(feature_batch)
print(label_batch)
print(length_batch)

tensor([[  11,   85,    2,   90,   94,  248],
        [1082, 1083,  142, 2802,    0,    0],
        [   3,   91,   31,  952,  414,    0],
        [   6,  186,  484,    0,    0,    0]], device='cuda:0')
tensor([0., 0., 0., 0.], device='cuda:0')
tensor([6., 4., 5., 3.], device='cuda:0')


In [158]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [159]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

In [160]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64


torch.manual_seed(1)


model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model.to(device)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [161]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for feature_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(feature_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for feature_batch, label_batch, lengths in dataloader:
            pred = model(feature_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [162]:
num_epochs = 20

torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.8343 val_accuracy: 0.9245
Epoch 1 accuracy: 0.9241 val_accuracy: 0.9424
Epoch 2 accuracy: 0.9490 val_accuracy: 0.9604
Epoch 3 accuracy: 0.9674 val_accuracy: 0.9568
Epoch 4 accuracy: 0.9749 val_accuracy: 0.9712
Epoch 5 accuracy: 0.9844 val_accuracy: 0.9640
Epoch 6 accuracy: 0.9892 val_accuracy: 0.9676
Epoch 7 accuracy: 0.9938 val_accuracy: 0.9712
Epoch 8 accuracy: 0.9959 val_accuracy: 0.9640
Epoch 9 accuracy: 0.9969 val_accuracy: 0.9784
Epoch 10 accuracy: 0.9947 val_accuracy: 0.9748
Epoch 11 accuracy: 0.9993 val_accuracy: 0.9784
Epoch 12 accuracy: 0.9993 val_accuracy: 0.9784
Epoch 13 accuracy: 0.9995 val_accuracy: 0.9784
Epoch 14 accuracy: 0.9998 val_accuracy: 0.9784
Epoch 15 accuracy: 0.9998 val_accuracy: 0.9784
Epoch 16 accuracy: 0.9998 val_accuracy: 0.9784
Epoch 17 accuracy: 0.9998 val_accuracy: 0.9784
Epoch 18 accuracy: 1.0000 val_accuracy: 0.9784
Epoch 19 accuracy: 1.0000 val_accuracy: 0.9784


In [163]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}') 

test_accuracy: 0.9748
