In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('data/ham-spam/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df = df[['v1', 'v2']]
df = df.rename(index=str, columns={'v1': 'labels', 'v2': 'text'})
df.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.shape, test.shape

((4457, 2), (1115, 2))

In [7]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [8]:
train.to_csv('data/ham-spam/train.csv', index=False)
test.to_csv('data/ham-spam/test.csv', index=False)

In [10]:
!ls data/ham-spam/

spam.csv
test.csv
train.csv


In [19]:
import numpy as np
import torch
import torchtext

from torchtext.legacy.data import TabularDataset, Field, BucketIterator, LabelField

In [18]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Safiuddin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
TEXT = Field(tokenize=word_tokenize)
LABEL = LabelField(dtype=torch.float)

In [21]:
datafields = [("labels", LABEL), ("text", TEXT)]

In [22]:
trn, tst = TabularDataset.splits(path='data/ham-spam/',
                                train='train.csv',
                                test='test.csv',
                                format='csv',
                                skip_header=True,
                                fields=datafields)

In [23]:
trn[:5]

[<torchtext.legacy.data.example.Example at 0x2c51981f608>,
 <torchtext.legacy.data.example.Example at 0x2c51981f748>,
 <torchtext.legacy.data.example.Example at 0x2c51981f6c8>,
 <torchtext.legacy.data.example.Example at 0x2c51981f9c8>,
 <torchtext.legacy.data.example.Example at 0x2c519805488>]

In [24]:
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

Number of training examples: 4457
Number of testing examples: 1115


In [25]:
trn[0].__dict__.keys()

dict_keys(['labels', 'text'])

In [26]:
trn[0].text

['No',
 'I',
 "'m",
 'in',
 'the',
 'same',
 'boat',
 '.',
 'Still',
 'here',
 'at',
 'my',
 'moms',
 '.',
 'Check',
 'me',
 'out',
 'on',
 'yo',
 '.',
 'I',
 "'m",
 'half',
 'naked',
 '.']

In [27]:
trn[0].labels

'ham'

In [28]:
TEXT.build_vocab(trn, max_size=10500)

In [29]:
LABEL.build_vocab(trn)

In [30]:
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 10207
Unique tokens in LABEL vocabulary: 2


In [31]:
print(TEXT.vocab.freqs.most_common(20))

[('.', 3862), ('to', 1750), ('I', 1574), (',', 1468), ('you', 1462), ('?', 1256), ('!', 1134), ('a', 1068), ('the', 946), ('...', 923), ('&', 772), ('i', 760), ('and', 673), ('in', 663), ('is', 647), (';', 641), ('u', 636), ('me', 600), (':', 570), ('..', 544)]


In [32]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [33]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [34]:
batch_size = 64
train_iterator, test_iterator = BucketIterator.splits(
    (trn, tst),
    batch_size=batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False)

In [35]:
import torch.nn as nn

In [36]:
class RNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        # text: [sentence_length, batch_size]
        embedded = self.embedding(text) # [sentence_length, batch_size, embedding_dim]
        output, hidden = self.rnn(embedded) # output: [sentence_length, batch_size, hidden_dim]
        # hidden: [1, batch_size, hidden_dim]
        hidden_1D = hidden.squeeze(0) # [batch_size, hidden_dim]
        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

In [37]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

In [38]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [39]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=1e-6)
criterion = nn.BCEWithLogitsLoss()

In [40]:
def train(model, iterator, optimizer, criterion):

    epoch_loss, epoch_acc = 0, 0
    model.train()

    for batch in iterator:

        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float()

        acc = correct.sum() / len(correct)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [41]:
EPOCHS = 5

for epoch in range(1, EPOCHS + 1):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    print(f'Epoch: {epoch} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')

Epoch: 1 | Train Loss: 0.6714 | Train Acc: 0.8562
Epoch: 2 | Train Loss: 0.6425 | Train Acc: 0.8549
Epoch: 3 | Train Loss: 0.6154 | Train Acc: 0.8567
Epoch: 4 | Train Loss: 0.5912 | Train Acc: 0.8564
Epoch: 5 | Train Loss: 0.5692 | Train Acc: 0.8567


In [42]:
epoch_loss, epoch_acc = 0, 0
model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)
        rounded_preds = torch.round(torch.sigmoid(predictions))

        correct = (rounded_preds == batch.labels).float()
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.4f}')

Test Loss: 0.647 | Test Acc: 0.7089


In [44]:
# LSTM Model
class LSTM(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):

        embedded = self.embedding(text)
        output, (hidden, _) = self.lstm(embedded)
        
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

In [45]:
model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-6)
criterion = nn.BCEWithLogitsLoss()

EPOCHS = 5

for epoch in range(1, EPOCHS + 1):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    print(f'Epoch: {epoch} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')

epoch_loss, epoch_acc = 0, 0
model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)
        rounded_preds = torch.round(torch.sigmoid(predictions))

        correct = (rounded_preds == batch.labels).float()
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.4f}')

Epoch: 1 | Train Loss: 0.6949 | Train Acc: 0.4205
Epoch: 2 | Train Loss: 0.6789 | Train Acc: 0.8621
Epoch: 3 | Train Loss: 0.6638 | Train Acc: 0.8599
Epoch: 4 | Train Loss: 0.6492 | Train Acc: 0.8607
Epoch: 5 | Train Loss: 0.6351 | Train Acc: 0.8611
Test Loss: 0.665 | Test Acc: 0.7760


In [46]:
# LSTM Model with Dropout
class LSTM(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, text):

        embedded = self.embedding(text)
        embedded_dropout = self.dropout(embedded)
        output, (hidden, _) = self.lstm(embedded_dropout)
        
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)

        return self.fc(hidden_1D)

model = LSTM(input_dim, embedding_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=1e-6)
criterion = nn.BCEWithLogitsLoss()

EPOCHS = 5

for epoch in range(1, EPOCHS + 1):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    print(f'Epoch: {epoch:02} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')

epoch_loss, epoch_acc = 0, 0
model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)
        rounded_preds = torch.round(torch.sigmoid(predictions))

        correct = (rounded_preds == batch.labels).float()
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.4f}')

Epoch: 01 | Train Loss: 0.7062 | Train Acc: 0.2443
Epoch: 02 | Train Loss: 0.6918 | Train Acc: 0.5343
Epoch: 03 | Train Loss: 0.6780 | Train Acc: 0.7822
Epoch: 04 | Train Loss: 0.6644 | Train Acc: 0.8483
Epoch: 05 | Train Loss: 0.6513 | Train Acc: 0.8540
Test Loss: 0.685 | Test Acc: 0.6181


In [47]:
import gc
gc.collect()

118