In [None]:
pip install torch torchvision torchtext

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import re
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
file_path = 'Shuffled_Dataset.xlsx'
df = pd.read_excel(file_path)


In [3]:
df.head()

Unnamed: 0,Sentence,Emotion
0,ನಾನು ಒಳ್ಳೆಯದನ್ನು ಮೆಚ್ಚುತ್ತೇನೆ,Joy
1,ಶಾಲಾ ಬಸ್ ನಿಲ್ದಾಣದಲ್ಲಿ ಮಕ್ಕಳು ಬಸ್‌ಗಾಗಿ ನಿರೀಕ್ಷಿ...,Neutral
2,ನನ್ನ ಹಳೆಯ ಮಿತ್ರನು ನನ್ನ ಕೈ ಹಿಡಿದಾಗ ನನಗೆ ಅನುಭವವಾ...,Surprise
3,ನಾನು ಅದನ್ನು ಅನುಭವಿಸುತ್ತೇನೆ ಮತ್ತು ನಾನು ಅತೃಪ್ತಿ ...,Sad
4,ಈ ಹೋಟೆಲ್‌ನ ಆಹಾರದಲ್ಲಿ ಕೀಟಗಳು ಕಂಡುಬಂದಿವೆ.,Disgust


In [4]:
# Manually defined list of common Kannada stopwords
stop_words = set([
    'ನಾನು', 'ಅದು', 'ಅವರು', 'ಮತ್ತು', 'ಈ', 'ಇದು', 'ಎಂದು', 'ಆ', 'ಅದೇ', 'ಇದನ್ನು',
    'ನಾವು', 'ಅದನ್ನು', 'ನಿನ್ನ', 'ನನಗೆ', 'ಅವನು', 'ಅವಳು', 'ನ', 'ನಿಮ್ಮ', 'ಅವಳ', 'ಅವನ',
    'ನನ್ನ', 'ಮಾಡಲು', 'ಮಾಡಿದ', 'ಮತ್ತು', 'ಅದರ'
])

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^\w\s\u0C80-\u0CFF]', '', text)  # Retain Kannada characters
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text


# Apply preprocessing to the text column
df['Sentence'] = df['Sentence'].apply(preprocess_text)
df.head()

Unnamed: 0,Sentence,Emotion
0,ಒಳ್ಳೆಯದನ್ನು ಮೆಚ್ಚುತ್ತೇನೆ,Joy
1,ಶಾಲಾ ಬಸ್ ನಿಲ್ದಾಣದಲ್ಲಿ ಮಕ್ಕಳು ಬಸ್ಗಾಗಿ ನಿರೀಕ್ಷಿಸ...,Neutral
2,ಹಳೆಯ ಮಿತ್ರನು ಕೈ ಹಿಡಿದಾಗ ಅನುಭವವಾಯಿತು,Surprise
3,ಅನುಭವಿಸುತ್ತೇನೆ ಅತೃಪ್ತಿ ಹೊಂದಿದ್ದೇನೆ,Sad
4,ಹೋಟೆಲ್ನ ಆಹಾರದಲ್ಲಿ ಕೀಟಗಳು ಕಂಡುಬಂದಿವೆ,Disgust


In [None]:
# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Emotion'])

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Custom Dataset class
class KannadaDataset(Dataset):
    def __init__(self, df):
        self.texts = df['Sentence'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label

In [None]:
# Define hyperparameters
VOCAB_SIZE = 20000
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
OUTPUT_DIM = len(df['label'].unique())
BATCH_SIZE = 16
EPOCHS = 10

# Tokenization and padding
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

tokenizer = get_tokenizer("basic_english")



In [None]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

train_texts = train_df['Sentence'].tolist()
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

def text_pipeline(x):
    return vocab(tokenizer(x))

def collate_batch(batch):
    text_list, label_list = [], []
    for _text, _label in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(torch.tensor(_label, dtype=torch.int64))
    text_list = pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return text_list, torch.tensor(label_list, dtype=torch.int64)

train_dataset = KannadaDataset(train_df)
val_dataset = KannadaDataset(val_df)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        hidden = lstm_out[:, -1, :]
        output = self.fc(hidden)
        return output

In [None]:
# Initialize the model, loss function, and optimizer
model = LSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMModel(
  (embedding): Embedding(20000, 128)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=7, bias=True)
)

In [None]:
def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for text, labels in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}', leave=False):
            text, labels = text.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(text)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch+1}, Training loss: {avg_train_loss}')

        model.eval()
        eval_loss = 0
        correct = 0

        with torch.no_grad():
            for text, labels in val_dataloader:
                text, labels = text.to(device), labels.to(device)
                outputs = model(text)
                loss = criterion(outputs, labels)
                eval_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                correct += (preds == labels).sum().item()

        avg_val_loss = eval_loss / len(val_dataloader)
        val_accuracy = correct / len(val_dataloader.dataset)
        print(f'Validation loss: {avg_val_loss}, Validation accuracy: {val_accuracy}')

# Train the model
train_model(model, train_dataloader, val_dataloader, criterion, optimizer, epochs=EPOCHS)



Epoch 1, Training loss: 1.8297066601839933
Validation loss: 1.5977769759764153, Validation accuracy: 0.29309035687167806




Epoch 2, Training loss: 1.3788450685414402
Validation loss: 1.1620666140533356, Validation accuracy: 0.5603644646924829




Epoch 3, Training loss: 0.8820079630071467
Validation loss: 1.0040655969137169, Validation accuracy: 0.6416097190584662




Epoch 4, Training loss: 0.5478330430207831
Validation loss: 1.0189934678106423, Validation accuracy: 0.6492027334851936




Epoch 5, Training loss: 0.3177739474470868
Validation loss: 1.1395107633378132, Validation accuracy: 0.6484434320425209




Epoch 6, Training loss: 0.16929214933153355
Validation loss: 1.2932757392346141, Validation accuracy: 0.6279422930903569




Epoch 7, Training loss: 0.08913660313736535
Validation loss: 1.5917679730309062, Validation accuracy: 0.6400911161731208




Epoch 8, Training loss: 0.05092725032679454
Validation loss: 1.6357535798865628, Validation accuracy: 0.6438876233864844




Epoch 9, Training loss: 0.028253485100118048
Validation loss: 1.7859411097793694, Validation accuracy: 0.6522399392558846




Epoch 10, Training loss: 0.022569836874026805
Validation loss: 1.5939096406281712, Validation accuracy: 0.6651480637813212
