## Import Required Packages

In [1]:
import re
import os
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

# Fetch the NLTK resources requested by this notebook. Each call is cheap when
# the package is already present locally (NLTK then only reports that it is
# up to date).
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
# English stop words, stored as a set.
stop = set(stopwords.words("english"))
# Notebook-level lemmatizer instance. Note that lemmatize_sentence() builds its
# own WordNetLemmatizer rather than using this one.
wnl = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the Dataset

In [3]:
import torch  # pytorch

# import required torchtext modules
from torchtext import data
from torch.autograd import Variable
from torchtext.vocab import GloVe

# animated progress bar
from tqdm.notebook import tqdm

# Read the raw test and train splits from the data/ directory into DataFrames.
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

# Stack train on top of test into one frame (train rows first). ignore_index is
# not passed, so each part keeps its original row labels.
df = pd.concat([train, test])

## Data Preprocessing

In [4]:
def remove_URL(text):
    """Strip web links out of a string.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.

    Args:
        text (str): Input string.

    Returns:
        str: ``text`` with every matched link deleted (surrounding whitespace
        is left untouched).
    """
    link_pattern = r"https?://\S+|www\.\S+"
    return re.sub(link_pattern, "", text)


def remove_html(text):
    """Delete HTML-style tags from a string.

    Every shortest run of characters enclosed in ``<`` ... ``>`` on a single
    line is removed; the text between tags is kept.

    Args:
        text (str): Input string.

    Returns:
        str: ``text`` without the matched tags.
    """
    tag_pattern = r"<.*?>"
    return re.sub(tag_pattern, "", text)


def remove_emoji(text):
    """Remove emojis and pictographic symbols from text.

    Only dedicated symbol/pictograph code-point ranges are matched. The single
    wide range ``U+24C2``-``U+1F251`` is deliberately not used: it spans most
    of the Basic Multilingual Plane and would also delete CJK ideographs, kana,
    Hangul and many other real letters.

    Args:
        text (str): Text to remove emojis from.

    Returns:
        str: Text with emojis removed.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"  # circled latin capital letter M
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002B05-\U00002B55"  # arrows, stars and circles used as emoji
        "\U0001F000-\U0001F251"  # game tiles/cards and enclosed characters
        "\U0000FE0F"  # emoji variation selector left behind by the above
        "]+",
        flags=re.UNICODE,
    )

    return emoji_pattern.sub(r"", text)


def remove_punct(text):
    """Drop every ASCII punctuation character from a string.

    The characters removed are exactly those in ``string.punctuation``.

    Args:
        text (str): Input string.

    Returns:
        str: ``text`` with all punctuation characters deleted.
    """
    # Mapping a code point to None tells str.translate to delete it.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletions)


def lemmatize_sentence(sentence):
    """Lower-case a sentence and reduce each word to its verb lemma.

    The sentence is split on single spaces; ``#`` characters are stripped from
    each piece before it is lower-cased and lemmatized as a verb.

    Args:
        sentence (str): Sentence to lemmatize.

    Returns:
        str: The lemmatized words re-joined with single spaces, with leading
        and trailing whitespace removed.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token.replace("#", "").lower(), wordnet.VERB)
        for token in sentence.split(" ")
    ]
    return " ".join(lemmas).strip()

In [5]:
# Cleaning steps, applied to the 'text' column in this order: strip URLs, HTML
# tags and emojis, then punctuation, and finally lower-case + lemmatize.
CLEANING_STEPS = (
    remove_URL,
    remove_html,
    remove_emoji,
    remove_punct,
    lemmatize_sentence,
)

# get_dataset() writes its CSV files from `train` and `test`, not from `df`, so
# the cleaning has to be applied to those frames as well; cleaning only the
# concatenated copy would leave the model training on raw text.
for frame in (train, test, df):
    for step in CLEANING_STEPS:
        frame["text"] = frame["text"].apply(step)

In [6]:
def prepare_csv(df_train, df_test, seed=27, val_ratio=0.3):
    """
    Write train / validation / test CSV files into the ``cache`` directory.

    The rows of ``df_train`` are shuffled with NumPy's global RNG (seeded with
    ``seed``); the first ``int(n_rows * val_ratio)`` shuffled rows become the
    validation split and the remainder the training split.

    Args:
    - df_train (DataFrame): Labelled data; must contain "id", "target", "text".
    - df_test (DataFrame): Unlabelled data; must contain "id" and "text".
    - seed (int): Seed passed to ``np.random.seed`` before shuffling.
    - val_ratio (float): Fraction of ``df_train`` rows used for validation.

    Returns:
    None.
    """
    # Seeded permutation of the row positions of the training frame.
    order = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(order)

    # Head of the permutation -> validation rows, tail -> training rows.
    n_val = int(len(order) * val_ratio)
    val_rows, train_rows = order[:n_val], order[n_val:]

    # Make sure the output directory is there.
    if not os.path.exists("cache"):
        os.makedirs("cache")

    labelled_columns = ["id", "target", "text"]

    train_part = df_train.iloc[train_rows, :][labelled_columns]
    train_part.to_csv("cache/dataset_train.csv", index=False)

    val_part = df_train.iloc[val_rows, :][labelled_columns]
    val_part.to_csv("cache/dataset_val.csv", index=False)

    test_part = df_test[["id", "text"]]
    test_part.to_csv("cache/dataset_test.csv", index=False)


def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):
    """
    Wrap a dataset in a torchtext ``Iterator`` placed on the available device.

    Batches are produced on ``cuda:0`` when CUDA is available and on the CPU
    otherwise. Sorting of examples is always disabled.

    Args:
    - dataset: The dataset to iterate over.
    - batch_size (int): Number of examples per batch.
    - train (bool): Forwarded to the iterator's ``train`` flag.
    - shuffle (bool): Forwarded to the iterator's ``shuffle`` flag.
    - repeat (bool): Forwarded to the iterator's ``repeat`` flag.

    Returns:
    A torchtext iterator over the dataset.
    """
    target_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    iterator_options = {
        "batch_size": batch_size,
        "device": torch.device(target_name),
        "train": train,
        "shuffle": shuffle,
        "repeat": repeat,
        "sort": False,
    }
    return data.Iterator(dataset, **iterator_options)


def get_dataset(fix_length=100, lower=False, vectors=None):
    """
    Load and prepare the dataset for training and testing the model.

    Writes the train/validation/test CSV files from the notebook-level
    ``train`` and ``test`` DataFrames (via ``prepare_csv``), reads them back
    as torchtext datasets, builds the vocabularies and returns batch iterators.

    Args:
        fix_length (int): every text is padded/truncated to this many tokens.
        lower (bool): whether to lowercase the text. Forced to True when
            ``vectors`` is supplied.
        vectors: pretrained word embeddings used to build the text vocabulary.
            When None, GloVe 6B with 300 dimensions is loaded.

    Returns:
        tuple: TEXT (data.Field), vocab_size (int), word_embeddings (torch.Tensor),
            train_iter (torchtext.data.Iterator), val_iter (torchtext.data.Iterator),
            test_iter (torchtext.data.Iterator)
    """
    if vectors is not None:
        lower = True
    else:
        # Fall back to the embeddings this function always used before the
        # `vectors` argument was honoured.
        vectors = GloVe(name="6B", dim=300)

    # Prepare the CSV files (reads the notebook-level `train` / `test` frames).
    prepare_csv(train, test)

    # Define the fields
    TEXT = data.Field(
        sequential=True,
        lower=lower,
        include_lengths=True,
        batch_first=True,
        fix_length=fix_length,
    )
    LABEL = data.Field(use_vocab=True, sequential=False, dtype=torch.float16)
    # Ids are integers; float16 cannot represent integers above 2048 exactly,
    # so an integer dtype is used to keep them intact.
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.long)

    # Load the datasets
    train_temp, val_temp = data.TabularDataset.splits(
        path="cache/",
        format="csv",
        skip_header=True,
        train="dataset_train.csv",
        validation="dataset_val.csv",
        fields=[("id", ID), ("target", LABEL), ("text", TEXT)],
    )

    test_temp = data.TabularDataset(
        path="cache/dataset_test.csv",
        format="csv",
        skip_header=True,
        fields=[("id", ID), ("text", TEXT)],
    )

    # Build the vocabulary over all three splits, attaching the chosen
    # pretrained vectors (previously the `vectors` argument was ignored here).
    TEXT.build_vocab(
        train_temp,
        val_temp,
        test_temp,
        max_size=20000,
        min_freq=10,
        vectors=vectors,
    )

    LABEL.build_vocab(train_temp)

    ID.build_vocab(train_temp, val_temp, test_temp)

    # Get the iterators for training, validation, and testing
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)
    train_iter = get_iterator(
        train_temp, batch_size=32, train=True, shuffle=True, repeat=False
    )
    val_iter = get_iterator(
        val_temp, batch_size=32, train=True, shuffle=True, repeat=False
    )
    test_iter = get_iterator(
        test_temp, batch_size=32, train=False, shuffle=False, repeat=False
    )
    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter

In [7]:
# Build the text field, vocabulary size, pretrained embedding matrix and the
# train / validation / test iterators using get_dataset()'s default arguments.
TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter = get_dataset()

## Long Short-Term Memory

In [8]:
class LSTMClassifier(torch.nn.Module):
    """LSTM text classifier on top of frozen pre-trained word embeddings.

    Pipeline: embedding -> dropout -> (multi-layer) LSTM -> last timestep ->
    dropout -> linear -> sigmoid.
    """

    def __init__(
        self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, weights
    ):
        """
        Initialize the LSTMClassifier model.

        Args:
        - vocab_size (int): The size of the vocabulary.
        - output_size (int): The size of the output layer.
        - embedding_dim (int): The dimensionality of the word embeddings.
        - hidden_dim (int): The number of units in the hidden layer.
        - n_layers (int): The number of LSTM layers.
        - weights (torch.Tensor): The pre-trained word embeddings.

        Returns:
        - None
        """
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        # Replace the random initialisation with the pre-trained vectors and
        # freeze them so the optimizer never updates the embeddings.
        self.word_embeddings.weight = torch.nn.Parameter(weights, requires_grad=False)
        self.dropout_1 = torch.nn.Dropout(0.3)
        self.lstm = torch.nn.LSTM(
            embedding_dim, hidden_dim, n_layers, dropout=0.3, batch_first=True
        )
        self.dropout_2 = torch.nn.Dropout(0.3)
        self.label_layer = torch.nn.Linear(hidden_dim, output_size)
        self.act = torch.nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of the LSTMClassifier model.

        Args:
        - x (torch.Tensor): The input tensor of shape (batch_size, seq_len).
        - hidden (tuple(torch.Tensor) or None): The hidden state and cell state
          of the LSTM; passed straight to ``torch.nn.LSTM``.

        Returns:
        - out (torch.Tensor): The output tensor of shape (batch_size, output_size),
          after the sigmoid activation.
        - hidden (tuple(torch.Tensor)): The hidden state and cell state of the LSTM.
        """
        x = self.word_embeddings(x)
        x = self.dropout_1(x)
        lstm_out, hidden = self.lstm(x, hidden)
        # Only the final timestep feeds the classifier, so select it before the
        # linear layer instead of projecting every timestep and discarding the
        # rest.
        last_step = lstm_out[:, -1, :]
        out = self.dropout_2(last_step)
        out = self.label_layer(out)
        out = self.act(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """
        Initialize the hidden state and cell state of the LSTM.

        The zero tensors are created from one of the model's own parameters, so
        they share its device and dtype without relying on any global variable.

        Args:
        - batch_size (int): The size of the batch.

        Returns:
        - hidden (tuple(torch.Tensor)): The hidden state and cell state of the
          LSTM, each of shape (n_layers, batch_size, hidden_dim).
        """
        reference = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        return reference.new_zeros(state_shape), reference.new_zeros(state_shape)

## Model Training

In [9]:
def train_model(model, train_iter, val_iter, optim, loss, num_epochs, batch_size=32):
    """
    Train the LSTM classifier model.

    Batches whose size differs from ``batch_size`` are skipped, because the
    LSTM state carried between batches has a fixed batch dimension.

    Args:
        model: The LSTMClassifier model to train.
        train_iter: The training data iterator.
        val_iter: The validation data iterator.
        optim: The optimizer to use for training.
        loss: The loss function to use for training.
        num_epochs: The number of epochs to train for.
        batch_size: The batch size to use for training. Default is 32.

    Returns:
        total_train_epoch_loss: A list containing the training loss for each epoch.
        total_train_epoch_acc: A list containing the training accuracy for each epoch.
        total_val_epoch_loss: A list containing the validation loss for each epoch.
        total_val_epoch_acc: A list containing the validation accuracy for each epoch.
    """
    clip = 5  # maximum gradient norm
    val_loss_min = np.inf
    total_train_epoch_loss = []
    total_train_epoch_acc = []
    total_val_epoch_loss = []
    total_val_epoch_acc = []
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Training LSTM state, carried (detached) from batch to batch.
    # NOTE(review): the state is carried across shuffled batches and epochs,
    # while the prediction cell starts from a zero state - confirm this is
    # intended.
    h = model.init_hidden(batch_size)

    for epoch in range(num_epochs):
        model.train()
        train_epoch_loss = []
        train_epoch_acc = []
        val_epoch_loss = []
        val_epoch_acc = []

        for batch in tqdm(train_iter):
            text = batch.text[0].to(device)

            # Value comparison: `is not` on ints tests object identity and only
            # happens to work for small cached integers.
            if text.size(0) != batch_size:
                continue

            # Shift the label indices down by one so they start at 0, and cast
            # without a round-trip through the CPU.
            target = (batch.target - 1).long().to(device)

            # Detach the carried state so gradients stop at the batch boundary.
            h = tuple(e.data for e in h)

            optim.zero_grad()
            prediction, h = model(text, h)
            loss_train = loss(prediction.squeeze(), target)
            loss_train.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optim.step()

            num_corrects = (prediction.argmax(dim=1) == target).float().sum()
            acc = 100.0 * num_corrects / len(batch)
            train_epoch_loss.append(loss_train.item())
            train_epoch_acc.append(acc.item())

        print(
            f"Train Epoch: {epoch}, Training Loss: {np.mean(train_epoch_loss):.4f}, Training Accuracy: {np.mean(train_epoch_acc): .2f}%"
        )

        model.eval()

        # Validation uses its own state so that evaluating never overwrites the
        # training state `h`.
        val_h = model.init_hidden(batch_size)

        with torch.no_grad():
            for batch in tqdm(val_iter):
                text = batch.text[0].to(device)

                if text.size(0) != batch_size:
                    continue

                target = (batch.target - 1).long().to(device)

                val_h = tuple(e.data for e in val_h)
                prediction, val_h = model(text, val_h)
                loss_val = loss(prediction.squeeze(), target)

                num_corrects = (prediction.argmax(dim=1) == target).float().sum()
                acc = 100.0 * num_corrects / len(batch)
                val_epoch_loss.append(loss_val.item())
                val_epoch_acc.append(acc.item())

        print(
            f"Validation Epoch: {epoch}, Validation Loss: {np.mean(val_epoch_loss):.4f}, Validation Accuracy: {np.mean(val_epoch_acc): .2f}%"
        )

        val_loss_mean = np.mean(val_epoch_loss)
        if val_loss_mean <= val_loss_min:
            print(
                "Validation loss decreased ({:.6f} --> {:.6f})".format(
                    val_loss_min, val_loss_mean
                )
            )
            val_loss_min = val_loss_mean

        total_train_epoch_loss.append(np.mean(train_epoch_loss))
        total_train_epoch_acc.append(np.mean(train_epoch_acc))
        total_val_epoch_loss.append(val_loss_mean)
        total_val_epoch_acc.append(np.mean(val_epoch_acc))

    return (
        total_train_epoch_loss,
        total_train_epoch_acc,
        total_val_epoch_loss,
        total_val_epoch_acc,
    )

In [None]:
# Hyperparameters for this run.
lr = 1e-4
batch_size = 32
output_size = 2
hidden_size = 128
embedding_length = 300

# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Two-layer LSTM classifier initialised with the pre-trained embeddings,
# moved to the selected device (Module.to returns the module itself).
model = LSTMClassifier(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_length,
    hidden_dim=hidden_size,
    n_layers=2,
    weights=word_embeddings,
).to(device)

optim = torch.optim.Adam(model.parameters(), lr=lr)
loss = torch.nn.CrossEntropyLoss()

# Run 20 epochs and keep the per-epoch loss / accuracy histories.
train_loss, train_acc, val_loss, val_acc = train_model(
    model,
    train_iter,
    val_iter,
    optim,
    loss,
    num_epochs=20,
    batch_size=batch_size,
)

## Predictions

In [13]:
# Make sure dropout is disabled for inference, whatever mode the model was
# left in by earlier cells.
model.eval()

results_target = []
with torch.no_grad():
    for batch in tqdm(test_iter):
        # Score the whole batch in one call; hidden=None makes the LSTM start
        # from a zero state.
        scores, _ = model(batch.text[0], hidden=None)
        # Pick the class with the highest score - the same rule train_model
        # uses to compute accuracy. The result is the 0-based class index used
        # during training (vocabulary index minus one).
        # NOTE(review): which raw label each index stands for depends on the
        # LABEL vocabulary order - verify before writing a submission.
        results_target.extend(scores.argmax(dim=1).cpu().tolist())

  0%|          | 0/102 [00:00<?, ?it/s]