In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import spacy
import torch
from torchtext import data
import torch.nn as nn
import re

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
import re
import nltk
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
import torchtext
from torchtext import data
import torch.nn.functional as F
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split

In [41]:
from torchtext.data import Field


In [110]:
spacy_en=spacy.load('en_core_web_sm')

stop_words=spacy.lang.en.STOP_WORDS
PUNCT_TO_REMOVE = string.punctuation
lemmatizer = WordNetLemmatizer()

# First checking if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda


In [111]:
spam=pd.read_csv(r"spam.csv", engine='python')

In [112]:
spam.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True)

In [113]:
spam = spam.drop_duplicates()
print('Shape after removing duplicates: ', spam.shape)
spam = spam.dropna()
print('Shape after removing null values: ', spam.shape)

Shape after removing duplicates:  (5169, 2)
Shape after removing null values:  (5169, 2)


In [114]:
spam['text'] = spam['text'].str.lower()

# removing punctuation marks
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
spam["text"] = spam["text"].apply(lambda text: remove_punctuation(text))


# Removing stopwords
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOP_WORDS])
spam["text"] = spam["text"].apply(lambda text: remove_stopwords(text))

        
# Lemmatizing the text data
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
spam["text"] = spam["text"].apply(lambda text: lemmatize_words(text))

# to clean data
def normalise_text (text):
    text = text.str.lower() # lowercase
    text = text.str.replace(r"\#","") # replaces hashtags
    text = text.str.replace(r"http\S+","URL")  # remove URL addresses
    text = text.str.replace(r"@","")
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ")
    text = text.str.replace("\s{2,}", " ")
    return text

spam["text"] = normalise_text(spam["text"])

In [115]:
print('Shape of the preprossed dataset: ', spam.shape)

spam = spam.drop_duplicates()
print('Shape after removing duplicates from the preprossed dataset: ', spam.shape)
spam = spam[spam['text'] != ""]
print('Shape after removing datapoints with empty text: ', spam.shape)

Shape of the preprossed dataset:  (5169, 2)
Shape after removing duplicates from the preprossed dataset:  (5093, 2)
Shape after removing datapoints with empty text:  (5092, 2)


In [116]:
spam['category'] = spam['category'].map({'spam': 1, 'ham': 0})


In [117]:
def tokenizer(text): # create a tokenizer function
    """ A function for tokenization"""
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [118]:
train_df, valid_df = train_test_split(spam)


In [119]:
TEXT = data.Field(tokenize = tokenizer, include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [120]:
class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.category if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [121]:
# creating torchtext fields
fields = [('text',TEXT), ('category',LABEL)]

# Making tabular datasets
train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=valid_df)

# Lets look at a random example
print(vars(train_ds[15]))
# Check the type 
print(type(train_ds[15]))

{'text': ['pls', 'come', 'quick', 'ca', 'nt', 'bare'], 'category': 0}
<class 'torchtext.data.example.Example'>


In [122]:
MAX_VOCAB_SIZE = 20000

TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = 'glove.6B.100d',
                 unk_init = torch.Tensor.zero_)

LABEL.build_vocab(train_ds)

In [123]:
BATCH_SIZE = 64

train_iter, valid_iter = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [136]:
num_epochs = 50
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
# padding
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [137]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        embedded = self.embedding(text)
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)       
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))
            
        return output

In [138]:
model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [139]:

pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
#  to initiaise padded to zeros
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

torch.Size([7191, 100])


In [140]:
model.to(device) 

# Defining Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [141]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch or iterators
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [142]:
def train(model, iterator):
    """
    Training function
    model: The LSTM_net model defined above
    iterator: train and validation iterators with batch sizes
    """
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        text, text_lengths = batch.text
        
        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.category)
        acc = binary_accuracy(predictions, batch.category)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [143]:
def evaluate(model, iterator):
    """
    Function for evaluation of the trained model
    model: The LSTM_net model defined above
    iterator: train and validation iterators with batch sizes
    """
    
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            acc = binary_accuracy(predictions, batch.category)
            
            epoch_acc += acc.item()
        
    return epoch_acc / len(iterator)

In [144]:
t = time.time()
loss=[]
acc=[]
val_acc=[]

# Running the model and logging the results
for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iter)
    valid_acc = evaluate(model, valid_iter)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    
print(f'time:{time.time()-t:.3f}')

	Train Loss: 0.413 | Train Acc: 89.65%
	 Val. Acc: 87.42%
	Train Loss: 0.230 | Train Acc: 94.66%
	 Val. Acc: 93.33%
	Train Loss: 0.198 | Train Acc: 95.69%
	 Val. Acc: 90.71%
	Train Loss: 0.158 | Train Acc: 97.11%
	 Val. Acc: 93.83%
	Train Loss: 0.148 | Train Acc: 97.45%
	 Val. Acc: 96.49%
	Train Loss: 0.146 | Train Acc: 97.59%
	 Val. Acc: 95.42%
	Train Loss: 0.141 | Train Acc: 97.57%
	 Val. Acc: 96.55%
	Train Loss: 0.138 | Train Acc: 97.49%
	 Val. Acc: 96.64%
	Train Loss: 0.147 | Train Acc: 97.73%
	 Val. Acc: 96.50%
	Train Loss: 0.133 | Train Acc: 97.63%
	 Val. Acc: 96.50%
	Train Loss: 0.136 | Train Acc: 97.50%
	 Val. Acc: 96.57%
	Train Loss: 0.136 | Train Acc: 97.63%
	 Val. Acc: 96.57%
	Train Loss: 0.139 | Train Acc: 97.32%
	 Val. Acc: 96.57%
	Train Loss: 0.142 | Train Acc: 97.45%
	 Val. Acc: 96.57%
	Train Loss: 0.142 | Train Acc: 97.68%
	 Val. Acc: 96.57%
	Train Loss: 0.136 | Train Acc: 97.71%
	 Val. Acc: 96.57%
	Train Loss: 0.144 | Train Acc: 97.73%
	 Val. Acc: 96.57%
	Train Loss: 0