In [380]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(rc={"figure.figsize":(10, 6)})

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [381]:
# Null data has been handled in previous version
data_file_path = "/content/drive/MyDrive/Colab Notebooks/NLP/Classification/Nepali Sentimental Classification/not_null_sentimental_data.csv"
data = pd.read_csv(data_file_path)

In [382]:
data.sample(3)

Unnamed: 0.1,Unnamed: 0,Data,Label
1674,1675,हाम्रो घरमा पनि टिभि छ,2
444,445,राम्रो भन्नी ठान्नु,1
203,204,ठूलो गल्ती छ,2


In [383]:
data.drop(labels = ["Unnamed: 0"], axis = 1, inplace = True)

In [384]:
data.head()

Unnamed: 0,Data,Label
0,यो समान राम्रो रहेछ,1
1,समान राम्रो रहेछ,1
2,राम्रो रहेछ,1
3,यो घडी मलाइ साँच्चिकै सुहाउछ । म यसलाई खरीद गर...,1
4,साँच्चिकै सुहाउछ,1


In [385]:
data[data.isnull().any(axis=1)]

Unnamed: 0,Data,Label


In [386]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2186 entries, 0 to 2185
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2186 non-null   object
 1   Label   2186 non-null   object
dtypes: object(2)
memory usage: 34.3+ KB


In [387]:
data["Label"] = data["Label"].astype(int)

In [388]:
data.Label.value_counts()

2    770
0    717
1    699
Name: Label, dtype: int64

# Text Preprocessing

In [389]:
!pip install nepalitokenizer
!pip install nepali-stemmer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [390]:
from nepali_stemmer.stemmer import NepStemmer
from nepalitokenizer import NepaliTokenizer
nepstem = NepStemmer()
tokenize = NepaliTokenizer()

def text_preprocessing(text, nepali_stopwords = stopwords.words('nepali')):
  # Tokenize the reviews
  text = tokenize.tokenizer(text)
  # Remove the nepali stopwords
  text = [word for word in text if word not in nepali_stopwords]
  text = ' '.join(text)
  # Stemming the nepali words
  text = nepstem.stem(text)
  # Remove the leading and trailing spaces
  text = text.split()
  text = ' '.join(text)  
  return text

In [391]:
X = data["Data"].apply(text_preprocessing)
y = data["Label"] 

### Problem : ramro is removed during tokenization in preprocessing

In [392]:
X[:5]

0                                              समान
1                                              समान
2                                                  
3    घडी मलाइ साँच्चि कै सुहाउछ यस लाई खरीद गर्नेछु
4                                 साँच्चि कै सुहाउछ
Name: Data, dtype: object

# Train, Validation and Test Split

In [393]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 101)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 101)

In [394]:
print(f"Train size =  {len(X_train)}")
print(f"Validation size =  {len(X_val)}")
print(f"Test size =  {len(X_test)}")

Train size =  1748
Validation size =  219
Test size =  219


In [395]:
print(len(y_train), len(y_val), len(y_test))

1748 219 219


# One Hot Encoding

In [396]:
UNIQUE_WORD_COUNT = 3179
MAX_PAD_LENGTH = 10

In [397]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = UNIQUE_WORD_COUNT)
tokenizer.fit_on_texts(X_train)

In [398]:
train_sequences = tokenizer.texts_to_sequences(X_train)
print(train_sequences[0])
train_padded = pad_sequences(train_sequences, maxlen = MAX_PAD_LENGTH, padding = 'post', truncating = 'post')
print(train_padded[0])

[2028, 7, 542, 252, 543]
[2028    7  542  252  543    0    0    0    0    0]


In [399]:
val_sequences = tokenizer.texts_to_sequences(X_val)
print(val_sequences[0])
val_padded = pad_sequences(val_sequences, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
print(val_padded[0])

[2516, 66, 40]
[2516   66   40    0    0    0    0    0    0    0]


In [400]:
test_sequences = tokenizer.texts_to_sequences(X_test)
print(test_sequences[0])
test_padded = pad_sequences(test_sequences, maxlen = MAX_PAD_LENGTH, padding = "post", truncating = "post")
print(test_padded[0])

[20, 2517, 407, 794, 2518]
[  20 2517  407  794 2518    0    0    0    0    0]


In [401]:
y_train = pd.get_dummies(y_train).values
y_val = pd.get_dummies(y_val).values
y_test = pd.get_dummies(y_test).values

In [402]:
y_train

array([[0, 0, 1],
       [0, 1, 0],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]], dtype=uint8)

# Data Loader

In [403]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

torch.__version__

'1.13.0+cu116'

In [404]:
train_data = TensorDataset(torch.LongTensor(train_padded), torch.FloatTensor(y_train))
valid_data = TensorDataset(torch.LongTensor(val_padded), torch.FloatTensor(y_val))
test_data = TensorDataset(torch.LongTensor(test_padded), torch.FloatTensor(y_test))

BATCH_SIZE = 32
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)
valid_loader = DataLoader(valid_data, batch_size = BATCH_SIZE, shuffle = False, drop_last = True)
test_loader = DataLoader(test_data, batch_size = BATCH_SIZE, shuffle = False, drop_last = True)

In [405]:
(next(iter(train_loader)))

[tensor([[1554, 2519,   64,    0,    0,    0,    0,    0,    0,    0],
         [  29,    0,    0,    0,    0,    0,    0,    0,    0,    0],
         [1655,    4,    0,    0,    0,    0,    0,    0,    0,    0],
         [ 508,    0,    0,    0,    0,    0,    0,    0,    0,    0],
         [1376, 1377, 1378, 1958, 1959,    0,    0,    0,    0,    0],
         [ 369, 3166, 3167, 3168,  378,   45, 1020, 1021,    0,    0],
         [  99,   55, 1982, 1983,  887,    1,   57, 3038,  575,    0],
         [1353,   34,  472,  473,    0,    0,    0,    0,    0,    0],
         [ 863,   29, 1801,   15,  864,   17,    2, 1004,    0,    0],
         [ 755, 1701,   28,  144, 2637,  145,   78,    0,    0,    0],
         [ 403,  777,   17,    0,    0,    0,    0,    0,    0,    0],
         [ 253, 2406,    0,    0,    0,    0,    0,    0,    0,    0],
         [  63, 2805,    1,  929,  554,    0,    0,    0,    0,    0],
         [ 366,    1,  262,  263,    0,    0,    0,    0,    0,    0],
      

# Model

In [406]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [482]:
class SentimentalModelV3(nn.Module):
  def __init__(self, output_size, vocab_size, embedding_dim, hidden_dim, n_layers = 1, drop_prob = 0.3, bidirectional = False, batch_size = 32, padded_seq_len = 10):
    super().__init__()
    self.batch_size = batch_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.padded_seq_len = padded_seq_len

    self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
    self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_dim, num_layers = n_layers, dropout = drop_prob, batch_first = True, bidirectional = bidirectional)

    self.dropout = nn.Dropout(0.3)

    #Linear and activation layer
    self.fc1=nn.Linear(self.hidden_dim * self.padded_seq_len, 64)
    self.fc2=nn.Linear(64, 16)
    self.fc3=nn.Linear(16,output_size)
    self.Relu = nn.ReLU()

  def forward(self, one_hot, hn, cn):
    embed = self.embedding(one_hot)
    lstm_out, hidden = self.lstm(embed)

    #stack up the lstm output
    lstm_out = lstm_out.reshape(shape = (self.batch_size, lstm_out.shape[1] * lstm_out.shape[2]))
    # dropout and fully connected layers
    out = self.dropout(lstm_out)
    out = self.Relu(out)
    out = self.Relu(self.fc1(out))
    out = self.Relu(self.fc2(out))
    out = self.fc3(out)

    return out

  def initCellState(self):
    h =  torch.zeros(self.n_layers , self.batch_size , self.hidden_dim).to(device)
    c =  torch.zeros(self.n_layers , self.batch_size , self.hidden_dim).to(device)
    return h, c

In [483]:
model_v3 = SentimentalModelV3(output_size = 3, 
                              vocab_size = UNIQUE_WORD_COUNT, 
                              embedding_dim = 64, 
                              hidden_dim = 32, 
                              n_layers = 5, 
                              drop_prob = 0.3, 
                              padded_seq_len = 10).to(device)

In [484]:
model_v3
hn, cn = model_v3.initCellState()

In [485]:
hn.shape

torch.Size([5, 32, 32])

In [486]:
cn.shape

torch.Size([5, 32, 32])

In [487]:
X.shape

torch.Size([32, 10])

In [488]:
X, y = next(iter(train_loader))
model_v3.eval()
with torch.inference_mode():
  pred = model_v3(X, hn, cn)

pred.shape

torch.Size([32, 3])

In [489]:
embedding = nn.Embedding(num_embeddings = UNIQUE_WORD_COUNT, embedding_dim = 64)
embed = embedding(X)
print(f"From Embedding layer output: {embed.shape}")

lstm = nn.LSTM(input_size = 64, hidden_size = 32, num_layers = 2, dropout = 0.3, batch_first = True, bidirectional = False)
lstm_out, (hn, cn) = lstm(embed)
print(f"lstm output shape : {lstm_out.shape}")
lstm_out = lstm_out.reshape(shape = (32, 32 * 10))
print(lstm_out.shape)

fc1=nn.Linear(in_features = 320, out_features = 64)
fc1(lstm_out).shape

From Embedding layer output: torch.Size([32, 10, 64])
lstm output shape : torch.Size([32, 10, 32])
torch.Size([32, 320])


torch.Size([32, 64])

# Training and Testing loop

In [490]:
def train_step(model: torch.nn.Module, 
               dataloader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module, 
               optimizer: torch.optim.Optimizer):
  hn, cn = model.initCellState()
  # put the model in train mode
  model.train()

  # setup the train loss and train accuracy values
  train_loss, train_acc  = 0, 0

  # Loop through the data loader data batches
  for batch, (X, y) in enumerate(dataloader):
    # send data to target device
    X, y = X.to(device), y.to(device)

    # Forward pass
    y_pred = model(X, hn, cn)

    # calculate and accumulate loss
    loss = loss_fn(y_pred, y)
    train_loss += loss.item()

    # optimzer zero grad
    optimizer.zero_grad()

    # Loss backward
    loss.backward()

    # optimizer step
    optimizer.step()

    # calculate the accumulate accuracy metric across all batches
    y_pred_class = torch.argmax(torch.softmax(y_pred, dim = 1), dim = 1)
    y_true_class = torch.argmax(torch.softmax(y, dim = 1), dim = 1)
    train_acc += (y_pred_class == y_true_class).sum().item() / len(y_pred)
  
  train_loss = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader)

  return train_loss, train_acc


In [491]:
def test_step(model: torch.nn.Module, 
              dataloader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module):
  hn, cn = model.initCellState()
  model.eval()
  test_loss, test_acc = 0, 0
  with torch.inference_mode():
    for batch, (X, y) in enumerate(dataloader):
      X, y = X.to(device), y.to(device)

      # forward pass
      test_pred_logits = model(X, hn, cn)

      # calculate and accumulate loss
      loss = loss_fn(test_pred_logits, y)
      test_loss += loss.item()

      # calculate and accumulate accuracy
      y_pred_class = torch.argmax(torch.softmax(test_pred_logits, dim = 1), dim = 1)
      y_true_class = torch.argmax(torch.softmax(y, dim = 1), dim = 1)
      test_acc += (y_pred_class == y_true_class).sum().item() / len(test_pred_logits)

  test_loss = test_loss / len(dataloader)
  test_acc = test_acc / len(dataloader)
  return test_loss, test_acc


In [492]:
from tqdm.auto import tqdm

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer, 
          loss_fn: torch.nn.Module, 
          epochs: int = 5):
  results = {
      "train_loss" : [],
      "train_acc" : [], 
      "test_loss" : [], 
      "test_acc" : []
  }

  for epoch in tqdm(range(epochs)):
    train_loss, train_acc = train_step(model = model, 
                                       dataloader = train_dataloader, 
                                       loss_fn = loss_fn, 
                                       optimizer = optimizer)
    
    test_loss, test_acc = test_step(model = model, 
                                    dataloader = test_dataloader, 
                                    loss_fn = loss_fn)
    
    print(
        f"Epoch: {epoch+1} | "
        f"train_loss: {train_loss:.4f} | "
        f"train_acc: {train_acc:.4f} | "
        f"test_loss: {test_loss:.4f} | "
        f"test_acc: {test_acc:.4f}"
    )

    results["train_loss"].append(train_loss)
    results["train_acc"].append(train_acc)
    results["test_loss"].append(test_loss)
    results["test_acc"].append(test_acc)
  return results

In [493]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

NUM_EPOCHS = 5

model_v3 = SentimentalModelV3(output_size = 3, 
                              vocab_size = UNIQUE_WORD_COUNT, 
                              embedding_dim = 64, 
                              hidden_dim = 32, 
                              n_layers = 5, 
                              drop_prob = 0.3, 
                              padded_seq_len = 10).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model_v3.parameters(), lr=0.001)

In [494]:
model_v3_results = train(model = model_v3, train_dataloader = train_loader, test_dataloader = test_loader, optimizer = optimizer, loss_fn = loss_fn, epochs = 15)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.0988 | train_acc: 0.3490 | test_loss: 1.0957 | test_acc: 0.3438
Epoch: 2 | train_loss: 1.0977 | train_acc: 0.3524 | test_loss: 1.0974 | test_acc: 0.3438
Epoch: 3 | train_loss: 1.0810 | train_acc: 0.3854 | test_loss: 1.1136 | test_acc: 0.3385
Epoch: 4 | train_loss: 1.0186 | train_acc: 0.4664 | test_loss: 1.1109 | test_acc: 0.4271
Epoch: 5 | train_loss: 0.9313 | train_acc: 0.5174 | test_loss: 1.1209 | test_acc: 0.4115
Epoch: 6 | train_loss: 0.8797 | train_acc: 0.5463 | test_loss: 1.1629 | test_acc: 0.4167
Epoch: 7 | train_loss: 0.7855 | train_acc: 0.6105 | test_loss: 1.2699 | test_acc: 0.4323
Epoch: 8 | train_loss: 0.7172 | train_acc: 0.6291 | test_loss: 1.2820 | test_acc: 0.4844
Epoch: 9 | train_loss: 0.6819 | train_acc: 0.6476 | test_loss: 1.2999 | test_acc: 0.4583
Epoch: 10 | train_loss: 0.6383 | train_acc: 0.6696 | test_loss: 1.4460 | test_acc: 0.4844
Epoch: 11 | train_loss: 0.5966 | train_acc: 0.7043 | test_loss: 1.4596 | test_acc: 0.5156
Epoch: 12 | train_l