# STEP 2: Convert Text → Numbers (Tokenization & Vocabulary)

---


Neural networks cannot understand raw text; they only understand numbers.

In [1]:
import pandas as pd
import numpy as np

# Load the training and validation splits (paths are relative to the working directory).
# Later cells read the "text" and "label" columns from both frames.
train_df = pd.read_csv("data/training.csv")
val_df = pd.read_csv("data/validation.csv")

## Define a Simple Tokenizer
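The function below lowercases the text, removes every character that is not a letter or whitespace, and splits the result on whitespace:
`I am Tirth Patel` → `['i', 'am', 'tirth', 'patel']`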

In [None]:
import re
def tokenize(text):
    """Split *text* into lowercase, letters-only word tokens.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (deleted, not replaced by a space, so "don't"
    becomes "dont"), and what remains is split on runs of whitespace.

    Returns:
        list[str]: The tokens in their original order; empty for text that
        contains no letters.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    return letters_only.split()

# Quick demo of the tokenizer; prints ['i', 'am', 'tirth', 'patel'].
print(tokenize("I am Tirth Patel"))

## Build Vocabulary from Training Data

📌 Why special tokens?
- `<PAD>` : for padding
- `<UNK>` : unseen words in test data

In [None]:
from collections import Counter

# Count token occurrences over the training texts only; validation text is
# never read here, so it cannot contribute words to the vocabulary.
word_counter = Counter()

for sentence in train_df["text"]:
    tokens = tokenize(sentence)
    word_counter.update(tokens)


# Special tokens
PAD_TOKEN = "<PAD>"  # filler used to bring every sequence to the same length
UNK_TOKEN = "<UNK>"  # stand-in for words that are not in the vocabulary

# Ids 0 and 1 are reserved for the special tokens; real words start at 2.
vocab = {
    PAD_TOKEN: 0,
    UNK_TOKEN: 1,
}

# Only the words are needed (not their counts), so iterate the Counter's keys
# directly. Each new word gets the next free id, in first-seen order.
for word in word_counter:
    vocab[word] = len(vocab)

print("Vocabulary size:", len(vocab))

## Convert Text to Numerical Sequences

In [None]:
def encode_sentence(sentance, vocab):
    """Convert a raw sentence into a list of vocabulary ids.

    The sentence is split with ``tokenize``; each token is looked up in
    ``vocab`` and tokens that are missing from it are replaced by the id
    stored under ``UNK_TOKEN``.

    Args:
        sentance: Raw input text (parameter name kept as-is for callers).
        vocab: Mapping from token to integer id; must contain ``UNK_TOKEN``
            whenever the sentence produces at least one token.

    Returns:
        list: One id per token, in sentence order.
    """
    ids = []
    for token in tokenize(sentance):
        ids.append(vocab.get(token, vocab[UNK_TOKEN]))
    return ids

# Demo: prints the id sequence for a short sentence; any word missing from
# the vocabulary shows up as the <UNK> id.
print(encode_sentence("I feel happy", vocab))

## Padding (VERY IMPORTANT)
Neural networks need same-length inputs.

In [None]:
# Every encoded sentence is padded or truncated to exactly this many ids.
MAX_LEN = 50

def pad_suquence(seq, max_len, pad_value=None):
    """Return a copy of ``seq`` that is exactly ``max_len`` items long.

    Shorter sequences are right-padded; longer ones are cut off after the
    first ``max_len`` items. The input list is never modified.
    (The misspelled function name is kept because other cells call it.)

    Args:
        seq: List of token ids.
        max_len: Target length.
        pad_value: Id appended as padding. When omitted, the module-level
            ``vocab[PAD_TOKEN]`` is used, which is looked up only if padding
            is actually needed.

    Returns:
        list: A new list of length ``max_len``.
    """
    missing = max_len - len(seq)
    if missing <= 0:
        # Already long enough: keep only the first max_len ids.
        return seq[:max_len]
    if pad_value is None:
        pad_value = vocab[PAD_TOKEN]
    return seq + [pad_value] * missing

## Final Encoded Dataset (Training Only)

In [None]:
# Encode every training sentence to ids, then pad/truncate it to MAX_LEN so
# all rows have the same length.
X_train = [
     pad_suquence(encode_sentence(text , vocab) , MAX_LEN)
     for text in train_df['text']
]

# Labels as a NumPy array, in the same row order as X_train.
y_train = train_df['label'].values
# Spot-check the first encoded row and its length (should equal MAX_LEN).
print(X_train[0])
print(len(X_train[0]))

## What You’ve Achieved
Implemented custom tokenization, vocabulary construction, sequence encoding, and padding for NLP tasks.

# STEP 3: PyTorch Dataset & DataLoader (Industry Standard)
❗ Goal of this step
> Convert your processed data into a format that PyTorch models can train on.
---
## Why Dataset & DataLoader?
Instead of loading everything at once, PyTorch:
- Loads data in batches
- Shuffles training data
- Works efficiently on CPU / GPU

## Convert Data to PyTorch Tensors

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

## Create a Custom Dataset Class


In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing encoded sentences with integer labels.

    Both inputs are converted to ``torch.long`` tensors once, up front, so
    indexing is a cheap tensor lookup.

    Args:
        texts: Sequence of equal-length id sequences (must be rectangular so
            that ``torch.tensor`` can build a single tensor from it).
        labels: One integer class id per entry in ``texts``.

    Raises:
        ValueError: If ``texts`` and ``labels`` contain a different number of
            samples.
    """

    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

        # __len__ follows the labels, so a mismatch would otherwise surface
        # later as an IndexError or as silently ignored samples.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same number of samples, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` tensor pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]

## Create Training & Validation Datasets

In [None]:
# Wrap the padded training sequences and their labels as a PyTorch dataset.
train_dataset = EmotionDataset(X_train, y_train)

In [None]:
# Encode the validation sentences with the vocabulary built from the training
# data; words not in that vocabulary are mapped to the <UNK> id.
X_val = [
    pad_suquence(encode_sentence(text,vocab), MAX_LEN)
    for text in val_df["text"]
]

# Labels in the same row order as X_val.
y_val = val_df["label"].values
val_dataset = EmotionDataset(X_val, y_val)

## Create DataLoaders

In [None]:
# Number of samples per batch for both loaders.
BATCH_SIZE = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Validation batches keep the dataset order; shuffling is not needed for evaluation.
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

## Sanity Check (IMPORTANT)

In [None]:
# Pull a single batch and print its tensor shapes, then stop.
# With a full batch this should show (BATCH_SIZE, MAX_LEN) for the texts and
# (BATCH_SIZE,) for the labels.
for batch_texts, batch_labels in train_loader:
    print(batch_texts.shape)
    print(batch_labels.shape)
    break

# STEP 4: Build the Deep Neural Network (Embedding + LSTM)

## Key Hyperparameters

In [None]:
VOCAB_SIZE = len(vocab)  # one embedding row per vocabulary entry, special tokens included
EMBED_DIM = 128  # size of each word-embedding vector
HIDDEN_DIM = 128  # size of the LSTM hidden state
NUM_CLASSES = 6  # NOTE(review): assumes the labels are the six integers 0-5 — confirm against the data

## Define the Model (PyTorch)

In [None]:
import torch.nn as nn

class EmotionLSTM(nn.Module):
    """Sentence classifier: embedding -> single-layer LSTM -> linear head.

    Padding positions are excluded from the recurrence, so the prediction
    for a sentence does not depend on how much padding follows it.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output classes.
        pad_idx: Token id used for padding (0 by default). Its embedding is
            fixed at zero and receives no gradient.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, pad_idx=0):
        super().__init__()

        self.pad_idx = pad_idx
        self.emb = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(batch, num_classes)``.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``, where
                padding (if any) sits at the end of each row.
        """
        emb = self.emb(x)

        # True length of each row = number of non-padding ids. Rows that are
        # entirely padding are treated as length 1 because packing rejects
        # zero-length sequences. Packing requires the lengths on the CPU.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths, batch_first=True, enforce_sorted=False
        )

        # With packed input the final hidden state is the state after each
        # row's last real token, returned in the original batch order.
        _, (hidden, _) = self.lstm(packed)
        out = self.fc(hidden[-1])
        return out

# Instantiate the classifier with the hyperparameters defined above.
model = EmotionLSTM(
    VOCAB_SIZE,
    EMBED_DIM,
    HIDDEN_DIM,
    NUM_CLASSES
)


## Sanity Check (CRITICAL)

In [None]:
# Run one batch through the untrained model and print the output shape, then
# stop. The expected shape is (batch size, NUM_CLASSES).
for texts, labels in train_loader:
    outputs = model(texts)
    print(outputs.shape)
    break


# 🟢 STEP 5: Training the Model (Learning Happens Here)

In [None]:
import torch.optim as optim

# Cross-entropy over the raw class scores; Adam with a fixed learning rate.
loss_fn = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=0.001)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EPOCHS = 15
for i in range(EPOCHS) :
    # Put the model in training mode at the start of every epoch.
    model.train()
    total_loss = 0

    for texts,labels in train_loader :
        # Each batch must be on the same device as the model.
        texts = texts.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        opt.zero_grad()

        output = model(texts)
        loss = loss_fn(output , labels)
        loss.backward()
        opt.step()

        # .item() extracts a plain Python float, so no graph is kept alive.
        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {i+1}/{EPOCHS}, Loss: {total_loss/len(train_loader)}")

## Validation Loop (VERY IMPORTANT)

In [None]:
# Measure accuracy on the validation set: evaluation mode, no gradient tracking.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for texts, labels in val_loader:
        texts, labels = texts.to(device), labels.to(device)

        output = model(texts)
        # The predicted class is the index of the highest score in each row.
        predictions = output.argmax(dim=1)

        correct += (predictions == labels).sum().item()
        total += labels.size(0)

# Fraction of validation samples whose predicted class matches the label.
accuracy = correct / total
print(f"Validation Accuracy: {accuracy * 100} %")

# 2️⃣ Test on Real Human Sentences (MOST IMPORTANT)

In [None]:
def predict_emotion(text, model, vocab):
    """Predict the emotion class index for a single raw sentence.

    The sentence is encoded with ``encode_sentence``, padded/truncated to
    ``MAX_LEN`` with ``pad_suquence``, and passed through ``model`` as a
    batch of one.

    Args:
        text: Raw input sentence.
        model: Trained classifier that maps a ``(1, MAX_LEN)`` long tensor
            to per-class scores.
        vocab: Token-to-id mapping used to encode ``text``.

    Returns:
        int: Index of the highest-scoring class.

    Note:
        The model is switched to evaluation mode and left that way.
    """
    model.eval()

    encoded = encode_sentence(text, vocab)
    padded = pad_suquence(encoded, MAX_LEN)

    # Place the input on the device that holds the model's weights instead of
    # relying on a module-level ``device`` that may not match this model.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = torch.device("cpu")

    # Extra list level adds the batch dimension: shape (1, MAX_LEN).
    input_tensor = torch.tensor([padded], dtype=torch.long, device=target_device)

    with torch.no_grad():
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1).item()

    return predicted_class

# Try the trained model on hand-written sentences. Each call prints an integer
# class index; the index-to-emotion-name mapping is not defined in this
# notebook — TODO confirm against the dataset's label encoding.
print(predict_emotion("I feel lonely and exhausted", model, vocab))
# print(predict_emotion("Today is the best day of my life", model, vocab))
print(predict_emotion("I feel like a dog", model, vocab))
print(predict_emotion("I am feeling very happy today and everything is going great.", model, vocab))
print(predict_emotion("This is the best moment of my life, I cannot stop smiling.", model, vocab))
print(predict_emotion("I feel empty and tired, nothing seems to matter anymore.", model, vocab))
print(predict_emotion("This is so unfair, I cannot tolerate this anymore.", model, vocab))
