In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tqdm

import re
import string
from unidecode import unidecode
import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F



import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Data loading

In [8]:
# Path of the training CSV; a relative path, so it is resolved against the
# notebook's current working directory.
file_train = 'train_spam.csv'

# Load the full training table. Later cells read its 'text' and 'text_type' columns.
data_train_all = pd.read_csv(file_train)

In [9]:
# Raw message texts and their string labels, taken from the loaded DataFrame.
texts_train_all = data_train_all['text']
labels_all = data_train_all['text_type']
# Encode the labels as integer codes using the fixed category order
# ['ham', 'spam'], i.e. ham -> 0 and spam -> 1.
# NOTE(review): pandas gives code -1 to any value that is not listed in
# `categories` (missing labels included) — confirm the column only holds
# these two values before training on `target_all`.
target_all = pd.Categorical(labels_all, categories=['ham', 'spam']).codes

In [10]:
# WordNet lemmatizer instance used by word_lemmatizer().
lemmatizer = WordNetLemmatizer()

# NLTK tweet tokenizer; no function visible in this part of the notebook uses it yet.
tokenizer = TweetTokenizer()

# English stop words stored as a set; remove_stop_words() tests membership against it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first — confirm.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [11]:
def remove_hyperlink(words):
    """Delete every run of text that starts with ``http`` and extends to the next whitespace."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", words)

def to_lower(words):
    """Return ``words`` converted to lower case."""
    return words.lower()

def remove_punctuation(words):
    """Replace each character that is neither a word character nor whitespace with a space."""
    non_word_char = re.compile(r'[^\w\s]')
    return non_word_char.sub(' ', words)

def emoji_to_text(words):
    """Return the result of ``emoji.demojize`` applied to ``words``."""
    demojized = emoji.demojize(words)
    return demojized

def remove_whitespace(words):
    """Strip leading and trailing whitespace from ``words``."""
    return words.strip()

def replace_newline(words, replacement=' '):
    """Replace every line break in ``words`` with ``replacement``.

    A single space is the default so that the last word of one line and the
    first word of the next stay separate tokens; substituting the empty
    string would glue them together into one bogus word.

    Args:
        words: Text that may contain line breaks.
        replacement: String substituted for each line break. Pass ``''`` to
            delete line breaks without leaving a separator.

    Returns:
        ``words`` with each CRLF, CR or LF replaced by ``replacement``.
    """
    # CRLF is listed first so a Windows line ending yields one replacement, not two.
    # A callable replacement keeps backslashes in `replacement` literal.
    return re.sub(r'\r\n|\r|\n', lambda _match: replacement, words)

def remove_number(words, keyword=' NUMBER '):
    """Substitute every word-like token containing at least one digit with ``keyword``."""
    digit_token = re.compile(r'\b\w*\d\w*\b')
    return digit_token.sub(keyword, words)

def remove_currency(words, keyword=' CURRENCY '):
    """Substitute each currency symbol (£, $, €, ₹) in ``words`` with ``keyword``."""
    currency_symbol = re.compile(r'[£$€₹]')
    return currency_symbol.sub(keyword, words)

def remove_stop_words(words):
    """Return the items of ``words`` that are not in ``ENGLISH_STOP_WORDS``, order preserved."""
    kept = []
    for token in words:
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def word_lemmatizer(words):
    """Apply the module-level ``lemmatizer`` to each item of ``words`` and return a list."""
    return list(map(lemmatizer.lemmatize, words))

def join_words(words):
    """Concatenate the strings in ``words`` with a single space between neighbours."""
    separator = " "
    return separator.join(words)

In [12]:
def preprocess_pipeline(sentence):
    """Clean one raw message by running the text helpers in a fixed order.

    Steps, in order: remove_hyperlink, replace_newline, to_lower,
    emoji_to_text, remove_currency, remove_number, unidecode,
    remove_punctuation, remove_whitespace. Each step receives the output of
    the previous one; the final string is returned.
    """
    cleaned = remove_hyperlink(sentence)
    cleaned = replace_newline(cleaned)
    cleaned = to_lower(cleaned)
    cleaned = emoji_to_text(cleaned)
    # Currency symbols are masked before unidecode sees them.
    cleaned = remove_currency(cleaned)
    cleaned = remove_number(cleaned)
    cleaned = unidecode(cleaned)
    cleaned = remove_punctuation(cleaned)
    return remove_whitespace(cleaned)

In [13]:
# Run the cleaning pipeline over every raw training text, keeping the original order.
data_train_preprocess = list(map(preprocess_pipeline, texts_train_all.to_numpy()))

In [None]:
# Shuffled train/validation split of the preprocessed texts and their integer
# labels, stratified on the labels. No test_size/train_size is passed, so
# scikit-learn's default split proportion applies; random_state=42 fixes the
# shuffle so the split is the same on every run.
texts_train, texts_val, y_train, y_val = train_test_split(
                                        data_train_preprocess,
                                        target_all,
                                        shuffle=True,
                                        random_state=42, 
                                        stratify=target_all)

## Model

In [16]:
class SpamCharacterRNN(nn.Module):
    """GRU classifier that maps a sequence of token indices to two class logits.

    Pipeline: embedding -> dropout -> single-layer GRU -> dropout on the final
    hidden state -> linear layer with 2 outputs.

    Args:
        num_tokens: Size of the vocabulary (number of rows in the embedding).
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the GRU hidden state.
        drop_p: Dropout probability used after the embedding and after the GRU.
    """

    def __init__(self, num_tokens, embed_size, hidden_size, drop_p=0.5):
        super().__init__()

        self.num_tokens = num_tokens
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        # Misspelled attribute kept as an alias so existing readers of it keep working.
        self.emded_size = embed_size

        self.encoder = nn.Embedding(num_tokens, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.dropout_embed = nn.Dropout(drop_p)
        self.dropout_rnn = nn.Dropout(drop_p)
        self.out = nn.Linear(hidden_size, 2)

    def forward(self, input):
        """Compute class logits for a batch of index sequences.

        Args:
            input: Integer tensor of token indices; assumed to be
                ``(batch, seq_len)`` since the GRU is built with
                ``batch_first=True`` — TODO confirm against the data loader.

        Returns:
            Logits computed from the GRU's final hidden state. For a batched
            input this is ``(1, batch, 2)`` (the leading axis is the GRU's
            layer axis); flatten with ``view(-1, 2)`` for a per-sample loss.
        """
        # The embedding layer is registered as `encoder`; there is no
        # `embedding` attribute on this module.
        embedded = self.encoder(input)
        embedded = self.dropout_embed(embedded)
        # Only the final hidden state is needed for sequence classification.
        _, hidden = self.rnn(embedded)
        hidden = self.dropout_rnn(hidden)
        output = self.out(hidden)
        return output


In [None]:
def train(model, optimizer, criterion, n_show=50,
                train_history=None, valid_history=None, bleu_history=None):
    """Train ``model`` for one pass over the module-level ``train_loader``.

    Relies on names defined elsewhere in the notebook: ``train_loader``,
    ``val_loader``, ``DEVICE``, ``evaluate`` and ``clear_output`` (the latter
    is expected to come from ``IPython.display`` — confirm it is imported).

    Each batch is assumed to be ``(inputs, labels)`` with one integer class
    label per sample — TODO confirm against the DataLoader. The model is
    called with the inputs only and its logits are flattened to
    ``(n_samples, n_classes)`` before the loss is computed.

    Args:
        model: Module called as ``model(inputs)``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        n_show: Every ``n_show`` batches the validation loss is computed and
            the loss curves are redrawn.
        train_history: Optional per-epoch train losses from earlier epochs,
            drawn on the second panel.
        valid_history: Optional per-epoch validation losses from earlier
            epochs, drawn on the second panel.
        bleu_history: Accepted for backward compatibility; not used.

    Returns:
        Mean training loss per batch for this pass.
    """
    # None sentinels instead of mutable [] defaults, which are shared between calls.
    train_history = [] if train_history is None else train_history
    valid_history = [] if valid_history is None else valid_history

    model.train()

    epoch_loss = 0
    val_steps = []      # batch indices at which validation was run
    batch_losses = []   # training loss of every batch so far
    val_losses = []     # validation loss at each entry of val_steps
    # Wrapping the loader (not the enumerate object) lets tqdm see its length.
    for i, batch in enumerate(tqdm.tqdm(train_loader)):
        src = batch[0].to(DEVICE)
        trg = batch[1].to(DEVICE)

        optimizer.zero_grad()

        # Same call convention as evaluate(): the model only takes the inputs.
        output = model(src)
        output = output.view(-1, output.shape[-1])
        # One label per sample: flatten without dropping any column.
        trg_output = trg.reshape(-1)

        loss = criterion(output, trg_output)
        loss.backward()

        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        batch_losses.append(batch_loss)

        if (i+1) % n_show == 0:
            val_steps.append(i)
            val_losses.append(evaluate(model, val_loader, criterion))

            # evaluate() leaves the model in eval mode; switch back for training.
            model.train()

            fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

            ax[0].plot(batch_losses, label='train loss')
            ax[0].plot(val_steps, val_losses, label='val loss')
            ax[0].set_xlabel('Batch')
            ax[0].set_title('Loss')
            ax[0].legend()

            has_train_history = len(train_history) > 0
            has_valid_history = len(valid_history) > 0
            if has_train_history:
                ax[1].plot(train_history, label='Train loss')
            if has_valid_history:
                ax[1].plot(valid_history, label='Valid loss')
            # legend() warns when there are no labelled artists to show.
            if has_train_history or has_valid_history:
                ax[1].legend()
            ax[1].set_xlabel('# of epoch')
            ax[1].set_title('Loss')

            fig.tight_layout()
            clear_output(True)
            plt.show()
            # Release the figure so repeated redraws do not accumulate in memory.
            plt.close(fig)

    return epoch_loss / len(train_loader)

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    Puts the model in eval mode and leaves it there; callers that continue
    training must call ``model.train()`` afterwards. Gradients are disabled
    for the whole pass. Reads the module-level ``DEVICE``.

    Each batch is assumed to be ``(inputs, labels)`` with one integer class
    label per sample — TODO confirm against the DataLoader.

    Args:
        model: Module called as ``model(inputs)``.
        iterator: Sized iterable of batches (``len(iterator)`` must work).
        criterion: Loss called as ``criterion(logits, labels)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch[0].to(DEVICE)
            trg = batch[1].to(DEVICE)

            output = model(src)

            # Flatten logits to (n_samples, n_classes) and labels to (n_samples,).
            output = output.view(-1, output.shape[-1])
            # One label per sample: flatten without dropping any column.
            trg_output = trg.reshape(-1)

            loss = criterion(output, trg_output)

            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, each truncated toward zero.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - (minutes * 60))
    return minutes, seconds