# Import Libraries

In [None]:
import collections
import pickle
import random
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

#!pip install datasets
#!pip install torchtext
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm
import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Set random seeds for reproducibility.
# Every generator used by the notebook is seeded here: Python's `random`,
# NumPy's global state and PyTorch (CPU and, when present, CUDA). If any of
# these calls fails, the ones after it never run, so the cell must complete.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # documented as silently ignored without CUDA

# Load Data

In [None]:
# Candidate dataset locations; only `drive_path` is read from in this cell.
local_path = "../data/processed/"
drive_path = "drive/MyDrive/ColabNotebooks/Dataset"

# The CSV stores its former index in an "Unnamed: 0" column; reuse it as index.
df = pd.read_csv(
    f'{drive_path}/combined_df.csv',
    index_col="Unnamed: 0",
)
df.head()

Unnamed: 0,text,label
0,"Its like that, if you want or not.“ ME: I have...",0
1,I man the front desk and my title is HR Custom...,0
2,We'd be saving so much money with this new hou...,1
3,"My ex used to shoot back with ""Do you want me ...",1
4,I haven’t said anything to him yet because I’m...,0


# Text Preprocessing

In [None]:
# Negation-type stop words that are candidates for being exempted from
# stop-word removal. One entry per line, each followed by a comma: adjacent
# string literals without a comma are silently concatenated by Python
# ("weren't" 'wasn' would become the single entry "weren'twasn").
STOPWORDS_TO_KEEP = {
    'weren',
    "weren't",
    'wasn',
    "wasn't",
    'no',
    'nor',
    'not',
    'isn',
    "isn't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
}

In [None]:
# English stop-word list from NLTK, as a set for O(1) membership tests.
STOPWORDS = {*stopwords.words('english')}
# Characters stripped from every text (ASCII punctuation only).
PUNCT_TO_REMOVE = string.punctuation
# Disabled: keeping the negation words listed in STOPWORDS_TO_KEEP.
#STOPWORDS = STOPWORDS - STOPWORDS_TO_KEEP
# Typographic quotes are mapped to their ASCII forms first so that they are
# stripped exactly like the straight quotes contained in string.punctuation
# (otherwise tokens such as "haven’t" or "not“" keep their quote characters).
_TYPOGRAPHIC_QUOTES = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',
})


def preprocessing(text, stop_words=None, punct_to_remove=None):
  """Lower-case a text, strip punctuation and drop stop words.

  Args:
    text: Raw input string.
    stop_words: Optional collection of words to drop. Defaults to the
      module-level STOPWORDS.
    punct_to_remove: Optional string of characters to delete. Defaults to the
      module-level PUNCT_TO_REMOVE.

  Returns:
    The remaining words joined by single spaces ("" if nothing remains).

  Note:
    Punctuation is removed before the stop-word filter runs, so a contraction
    such as "haven't" is compared as "havent".
  """
  if stop_words is None:
    stop_words = STOPWORDS
  if punct_to_remove is None:
    punct_to_remove = PUNCT_TO_REMOVE
  text = text.lower().translate(_TYPOGRAPHIC_QUOTES)
  text = text.translate(str.maketrans('', '', punct_to_remove))
  return " ".join(word for word in text.split() if word not in stop_words)

# Add the cleaned version of every text as a new column.
df['clean_text'] = df['text'].apply(preprocessing)
df.head()

Unnamed: 0,text,label,clean_text
0,"Its like that, if you want or not.“ ME: I have...",0,like want not“ problem takes longer asked frie...
1,I man the front desk and my title is HR Custom...,0,man front desk title hr customer service repre...
2,We'd be saving so much money with this new hou...,1,wed saving much money new housrits expensive c...
3,"My ex used to shoot back with ""Do you want me ...",1,ex used shoot back want go time matter almost ...
4,I haven’t said anything to him yet because I’m...,0,haven’t said anything yet i’m sure someone wou...


In [None]:
# Number of whitespace-separated words left in each cleaned text.
df['word_amount'] = [len(cleaned.split()) for cleaned in df['clean_text']]
df.head()

Unnamed: 0,text,label,clean_text,word_amount
0,"Its like that, if you want or not.“ ME: I have...",0,like want not“ problem takes longer asked frie...,26
1,I man the front desk and my title is HR Custom...,0,man front desk title hr customer service repre...,42
2,We'd be saving so much money with this new hou...,1,wed saving much money new housrits expensive c...,66
3,"My ex used to shoot back with ""Do you want me ...",1,ex used shoot back want go time matter almost ...,41
4,I haven’t said anything to him yet because I’m...,0,haven’t said anything yet i’m sure someone wou...,41


In [None]:
# Hold out 20% of the rows as the test set (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-attach the labels to the texts so each split is a single DataFrame.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
# Report the size of each split (each line reads its own frame).
print(f"train size: {len(train_df)}")
print(f"test size: {len(test_df)}")

train size: 3021
valid size: 3021


In [None]:
# Convert both splits to Hugging Face Datasets. The DataFrame index is carried
# along as an extra `__index_level_0__` feature (see the displayed output).
train_data = datasets.Dataset.from_pandas(train_df)
test_data = datasets.Dataset.from_pandas(test_df)
test_data

Dataset({
    features: ['clean_text', 'label', '__index_level_0__'],
    num_rows: 3021
})

# Tokenization

In [None]:
# torchtext's "basic_english" tokenizer; used for training and inference alike.
tokenizer = get_tokenizer("basic_english")

def tokenize_example(text, tokenizer, max_length):
    """Tokenise one dataset row and truncate it.

    Args:
        text: Mapping for a single example; its "clean_text" entry is tokenised.
        tokenizer: Callable turning a string into a list of tokens.
        max_length: Maximum number of tokens to keep.

    Returns:
        Dict with "tokens" (the truncated token list) and "length" (its size).
    """
    truncated = tokenizer(text["clean_text"])[:max_length]
    return {"tokens": truncated, "length": len(truncated)}


# Upper bound on tokens kept per example (longer texts are truncated).
max_length = 100

# Add "tokens" and "length" columns to both splits.
train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

Map:   0%|          | 0/12080 [00:00<?, ? examples/s]

Map:   0%|          | 0/3021 [00:00<?, ? examples/s]

In [None]:
# Carve a validation set out of the training split: 25% of the training rows.
test_size = 0.25

# The split is seeded explicitly with the notebook-wide `seed`. The vocabulary
# is built from the resulting training rows, so an unseeded split changes
# len(vocab) between runs and makes saved checkpoints unloadable.
train_valid_data = train_data.train_test_split(test_size=test_size, seed=seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

# Feature Extraction

In [None]:
# Frequency threshold handed to the vocabulary builder
# (presumably rarer tokens are left out and later map to <unk> — confirm in
# the torchtext docs).
min_freq = 5
# Special tokens: unknown-word placeholder and padding symbol.
special_tokens = ["<unk>", "<pad>"]

# The vocabulary is built from the training split only.
vocab = build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)


In [None]:
# Integer ids of the two special tokens; pad_index is reused for batching and
# as the embedding layer's padding_idx.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [None]:
# Make <unk> the default index of the vocabulary (intended for tokens that are
# not in it).
vocab.set_default_index(unk_index)

import pickle

# Persist the vocabulary next to the dataset so it can be reused at inference.
with open(f'{drive_path}/vocab.pkl', "wb") as f:
    pickle.dump(vocab, f)

In [None]:
# Vocabulary size, special tokens included; must match the checkpoint's
# embedding rows when a saved model is reloaded.
len(vocab)

5715

In [None]:
def numericalize_example(example, vocab):
    """Convert the tokens of one example into vocabulary indices.

    Args:
        example: Mapping with a "tokens" entry (list of token strings).
        vocab: Object exposing `lookup_indices(tokens)`.

    Returns:
        Dict with a single key "ids" holding the looked-up indices.
    """
    return {"ids": vocab.lookup_indices(example["tokens"])}

In [None]:
# Add an "ids" column to every split, using the training vocabulary throughout.
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

Map:   0%|          | 0/9060 [00:00<?, ? examples/s]

Map:   0%|          | 0/3020 [00:00<?, ? examples/s]

Map:   0%|          | 0/3021 [00:00<?, ? examples/s]

In [None]:
# Expose only the columns the model needs, returned as torch tensors
# (see the sample row displayed in the next cell).
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

In [None]:
# Sanity check: inspect one formatted training example.
train_data[0]

{'label': tensor(1),
 'length': tensor(53),
 'ids': tensor([  16,   12,  238,  143,    0, 2080,    4,    3,    0, 1308,    0,  534,
          164, 1328,  112,  451,  212, 1923,   20,  115,    0,  411,   63, 1282,
            0,   23,  115, 1991,  172,   18,  175,   55,   71,  768,  413,    0,
           89,  120,  124, 2802, 3567,  499,    6,   86,    6,   81,  262,  110,
         3835,  202,   36,   43,  815])}

In [None]:
def get_collate_fn(pad_index):
    """Create a collate function that pads variable-length examples.

    Args:
        pad_index: Value used to fill "ids" up to the longest sequence in a batch.

    Returns:
        A function mapping a list of examples (each with tensor entries "ids",
        "length" and "label") to a dict of batched tensors with the same keys.
    """

    def collate_fn(batch):
        # Right-pad every id sequence to the batch maximum -> [batch, seq len].
        padded_ids = nn.utils.rnn.pad_sequence(
            [example["ids"] for example in batch],
            padding_value=pad_index,
            batch_first=True,
        )
        lengths = torch.stack([example["length"] for example in batch])
        labels = torch.stack([example["label"] for example in batch])
        return {"ids": padded_ids, "length": lengths, "label": labels}

    return collate_fn

In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap a dataset in a DataLoader that pads each batch with `pad_index`.

    Args:
        dataset: Dataset whose items provide "ids", "length" and "label".
        batch_size: Number of examples per batch.
        pad_index: Padding value forwarded to the collate function.
        shuffle: Whether the loader reshuffles the data each epoch.

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [None]:
# Mini-batch size shared by all loaders.
batch_size = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [None]:
class LSTM(nn.Module):
    """Text classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    The prediction is computed from the final hidden state of the last LSTM
    layer (both directions concatenated when bidirectional).

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_dim: Number of output classes.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout_rate: Dropout probability for embeddings, stacked LSTM layers
            and the final hidden state.
        pad_index: Vocabulary index of the padding token.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists between stacked layers; a non-zero
            # value with a single layer has no effect and only raises a warning.
            dropout=dropout_rate if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Return class scores for a padded batch.

        Args:
            ids: Token ids, shape [batch size, seq len].
            length: True (unpadded) length of each sequence, shape [batch size];
                every length must be at least 1.

        Returns:
            Tensor of shape [batch size, output dim].
        """
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]
        # Packing requires the lengths on the CPU, whatever device `ids` is on.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            torch.as_tensor(length).cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        # Only the final hidden state is used, so the per-step outputs are not
        # unpacked.
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]
        if self.lstm.bidirectional:
            # Last layer, both directions. The concatenation order is part of
            # what trained checkpoints expect, so it must not be swapped.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]
        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

In [None]:
# Model hyper-parameters.
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 300
# One output unit per distinct label in the training split.
output_dim = len(train_data.unique("label"))
n_layers = 2
bidirectional = True
dropout_rate = 0.5

model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)

print(model)

5715
LSTM(
  (embedding): Embedding(5715, 300, padding_idx=1)
  (lstm): LSTM(300, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=600, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total


# Report the model size with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

In [None]:
def initialize_weights(m):
    """Initialise one sub-module in place; meant for `model.apply`.

    Linear layers get Xavier-normal weights and zero biases. LSTM layers get
    orthogonal weight matrices and zero biases. Other modules are untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
        return
    if not isinstance(m, nn.LSTM):
        return
    for param_name, tensor in m.named_parameters():
        if "bias" in param_name:
            nn.init.zeros_(tensor)
        elif "weight" in param_name:
            nn.init.orthogonal_(tensor)

In [None]:

# Run the custom initialisation on every sub-module of the model.
model.apply(initialize_weights)

In [None]:
# Pretrained GloVe vectors with the library's default settings
# (NOTE(review): the default vector size must equal embedding_dim — confirm).
vectors = torchtext.vocab.GloVe()

# One vector per vocabulary entry, in vocabulary index order.
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())

# Copy into the existing parameter instead of rebinding `.data`: copy_ raises
# if the shapes disagree, whereas assignment would silently install a tensor of
# the wrong size and fail later inside the LSTM.
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embedding)

In [None]:
# Learning rate for Adam.
lr = 0.0001

# All model parameters are optimised, the embedding table included.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Cross-entropy over the model's raw class scores and integer labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

In [None]:
# Select the GPU only when one is available. An unconditional "cuda" device
# makes every later `.to(device)` call fail on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

In [None]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(dataloader, model, criterion, optimizer, device):
    """Run one training epoch.

    Args:
        dataloader: Iterable of batches with "ids", "length" and "label".
        model: Module called as `model(ids, length)`.
        criterion: Loss called as `criterion(prediction, label)`.
        optimizer: Optimiser updating the model's parameters.
        device: Device the ids and labels are moved to ("length" is left as is).

    Returns:
        Tuple (mean batch loss, mean batch accuracy) over the epoch.
    """
    model.train()
    losses, accuracies = [], []
    progress = tqdm.tqdm(dataloader, desc="training...")
    for batch in progress:
        ids = batch["ids"].to(device)
        length = batch["length"]
        label = batch["label"].to(device)

        # Forward pass and metrics.
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        accuracies.append(accuracy.item())
    return np.mean(losses), np.mean(accuracies)

In [None]:
def evaluate(dataloader, model, criterion, device):
    """Evaluate the model on a dataloader without updating it.

    Args:
        dataloader: Iterable of batches with "ids", "length" and "label".
        model: Module called as `model(ids, length)`.
        criterion: Loss called as `criterion(prediction, label)`.
        device: Device the ids and labels are moved to ("length" is left as is).

    Returns:
        Tuple (mean batch loss, mean batch accuracy).
    """
    model.eval()
    losses, accuracies = [], []
    progress = tqdm.tqdm(dataloader, desc="evaluating...")
    with torch.no_grad():
        for batch in progress:
            ids = batch["ids"].to(device)
            length = batch["length"]
            label = batch["label"].to(device)

            prediction = model(ids, length)
            losses.append(criterion(prediction, label).item())
            accuracies.append(get_accuracy(prediction, label).item())
    return np.mean(losses), np.mean(accuracies)

In [None]:
def get_accuracy(prediction, label):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        prediction: 2-D tensor of class scores, one row per example.
        label: Tensor of class indices, one per example.

    Returns:
        0-dim tensor holding the accuracy of the batch.
    """
    n_examples, _n_classes = prediction.shape
    hits = (prediction.argmax(dim=-1) == label).sum()
    return hits / n_examples

In [None]:
# Number of passes over the training data.
n_epochs = 10
# Lowest validation loss seen so far; decides when to checkpoint.
best_valid_loss = float("inf")

# Per-epoch history: train/valid losses and accuracies.
metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    # Keep the weights of the epoch with the best validation loss in "lstm.pt";
    # `model` itself continues to hold the latest weights.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "lstm.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

In [None]:
# Per-epoch training accuracies recorded by the loop above.
metrics['train_accs']

In [None]:
import matplotlib.pyplot as plt


# Plot training vs. validation accuracy per epoch.
plt.figure(figsize=(12, 5))

# Only the left panel of a 1x2 grid is drawn in this cell.
plt.subplot(1, 2, 1)
plt.plot(range(n_epochs), metrics['train_accs'], label='Training Accuracy')
plt.plot(range(n_epochs), metrics['valid_accs'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
import matplotlib.pyplot as plt


# Plot training vs. validation loss per epoch.
plt.figure(figsize=(12, 5))

# Only the left panel of a 1x2 grid is drawn in this cell.
plt.subplot(1, 2, 1)
plt.plot(range(n_epochs), metrics['train_losses'], label='Training Losses')
plt.plot(range(n_epochs), metrics['valid_losses'], label='Validation Losses')
plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Losses')
plt.legend()

In [None]:
# Export the best checkpoint rather than the last-epoch weights: the training
# loop writes "lstm.pt" whenever the validation loss improves, while `model`
# holds whatever the final epoch produced. Tensors are mapped to the CPU so
# the exported file also loads on machines without a GPU.
best_state_dict = torch.load("lstm.pt", map_location="cpu")
torch.save(best_state_dict, "drive/MyDrive/ColabNotebooks/Models/lstm.pt")

In [None]:
# Rebuild the architecture with the same hyper-parameters, then load weights.
# The checkpoint must come from a run with the same vocabulary: a different
# len(vocab) produces a size-mismatch error for embedding.weight.
lstm_model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)
# map_location="cpu" lets a checkpoint saved from a GPU load on any host;
# lstm_model is left on the CPU.
lstm_model.load_state_dict(
    torch.load("drive/MyDrive/ColabNotebooks/Models/lstm.pt", map_location="cpu")
)
lstm_model.eval()
print(lstm_model)

RuntimeError: Error(s) in loading state_dict for LSTM:
	size mismatch for embedding.weight: copying a param with shape torch.Size([5786, 300]) from checkpoint, the shape in current model is torch.Size([5715, 300]).

In [None]:
# Sample sentence for a manual inference walk-through.
test_text = "I am really happy and I love you"

# The frame is named `input_df` so the builtin `input()` is not shadowed for
# the rest of the notebook session.
input_df = pd.DataFrame([{'text': test_text}])
input_df['clean_text'] = input_df['text'].apply(preprocessing)
input_data = datasets.Dataset.from_pandas(input_df)
input_data

In [None]:
# Tokenise the sample with the same tokenizer and length cap as training.
input_data = input_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [None]:
# Map the sample's tokens to ids with the training vocabulary.
input_data = input_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

In [None]:
# Return "ids" and "length" as torch tensors (there is no label at inference).
input_data = input_data.with_format(type="torch", columns=["ids", "length"])

In [None]:
# NOTE(review): the loader is never iterated; the next cell only reads
# `input_data_loader.dataset[0]`, so shuffling and collation are not exercised.
input_data_loader = get_data_loader(input_data, batch_size, pad_index, shuffle=True)

In [None]:
# Take the single example straight from the underlying dataset.
final_input = input_data_loader.dataset[0]
final_input

In [None]:
# Build a batch of one on the CPU: ids gets a leading batch dimension and
# length becomes a 1-element int64 tensor.
ids = final_input["ids"].to('cpu')
length = final_input["length"].to('cpu')
length = torch.tensor([length], dtype=torch.int64)
ids = ids.unsqueeze(0)
length

In [None]:
# Raw class scores for the sample (one row, one column per class).
prediction = lstm_model(ids, length)

prediction

In [None]:
def get_label(prediction):
    """Return the index of the highest score as a plain Python int.

    The argmax is taken over the last dimension and must leave exactly one
    element (e.g. a batch of one), because `.item()` is called on the result.
    """
    return prediction.argmax(dim=-1).item()

# Predicted class index for the sample sentence.
get_label(prediction)

In [None]:
def predict_stress(model, text):
  """Predict the class index for one raw text.

  The text goes through the same steps as the training data: `preprocessing`,
  `tokenize_example` (truncated to `max_length`) and `numericalize_example`
  with the module-level `tokenizer` and `vocab`.

  Args:
    model: Trained module called as `model(ids, length)`.
    text: Raw input string.

  Returns:
    The predicted class index as an int.

  Note:
    A text with no tokens left after preprocessing has length 0, which the
    sequence-packing step inside the model does not accept.
  """
  example = {"clean_text": preprocessing(text)}
  example.update(tokenize_example(example, tokenizer, max_length))
  example.update(numericalize_example(example, vocab))

  # Build the batch on the device the model actually lives on, so the call
  # works whether or not the model was moved to the global `device`.
  model_device = next(model.parameters()).device
  ids = torch.tensor(example["ids"], dtype=torch.int64, device=model_device)
  ids = ids.unsqueeze(0)
  length = torch.tensor([example["length"]], dtype=torch.int64)

  # Inference only: disable dropout and gradient tracking, then restore the
  # caller's train/eval mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      prediction = model(ids, length)
  finally:
    model.train(was_training)

  return get_label(prediction)

In [None]:
# End-to-end check of the helper on the sample sentence.
predict_stress(lstm_model, test_text)

In [None]:
# Persist the preprocessing artefacts needed to serve the model elsewhere.
# The vocabulary is written to the same path as earlier in the notebook.
with open(f'{drive_path}/vocab.pkl', "wb") as f:
    pickle.dump(vocab, f)

# NOTE(review): unpickling the tokenizer presumably requires torchtext to be
# importable in the loading environment — confirm.
with open(f'{drive_path}/tokenizer.pkl', "wb") as f:
    pickle.dump(tokenizer, f)