## Prediction with CNN and Bidirectional LSTM Models

In [2]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tqdm import tqdm
import nltk

In [4]:
# Merge every split of the Kirk dataset into a single training frame with a
# fresh 0..n-1 index; the annotated comments serve as the test set.
kirk_paths = (
    "../Dataset/Kirk_Dataset/train.csv",
    "../Dataset/Kirk_Dataset/validation.csv",
    "../Dataset/Kirk_Dataset/test.csv",
)
df_train = pd.concat([pd.read_csv(path) for path in kirk_paths], ignore_index=True)
df_test = pd.read_csv("../Dataset/Annotate_Dataset/comments_annotated.csv")

In [5]:
# Pull the training inputs (raw text) and gold labels out of the Kirk dataset
X_train, Y_train = df_train["text"], df_train["label_gold"]

In [6]:
# Encode the test-set tags as class indices: "Hated" -> 1, any other tag -> 0
X_test = df_test["Comment"]
Y_test = [1 if tag == "Hated" else 0 for tag in df_test["Tag_Nalin"]]

In [7]:
# Fetch the Punkt tokenizer data ahead of the word_tokenize calls below
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/srunnalin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Tokenize with nltk and map every lower-cased token to an integer id
import collections
from nltk import word_tokenize

# A token seen for the first time is given the next free id; ids start at 1.
token2int = collections.defaultdict(lambda: len(token2int) + 1)


def _encode(tokenized_texts):
    """Replace each token by its id, registering unseen tokens in token2int."""
    encoded = []
    for tokens in tokenized_texts:
        encoded.append([token2int[token.lower()] for token in tokens])
    return encoded


tokenized_X_train = list(map(word_tokenize, X_train))
tokenized_X_test = list(map(word_tokenize, X_test))

# Order matters: training texts are encoded first, so their tokens take the
# lowest ids and test-only tokens are appended afterwards.
int_X_train = _encode(tokenized_X_train)
int_X_test = _encode(tokenized_X_test)

In [9]:
# Number of distinct lower-cased tokens registered in token2int so far
len(token2int)

6220

In [10]:
# Reverse lookup table: integer id -> token
int2token = {index: token for token, index in token2int.items()}

## Vocabulary Size:
print("The Vocabulary size is:", len(token2int))

## Maximum text length
# Token counts per text; only the training lengths feed the reported maximum.
X_train_lengths = list(map(len, tokenized_X_train))
X_test_lengths = list(map(len, tokenized_X_test))
print("The maximum text length:", max(X_train_lengths))

The Vocabulary size is: 6220
The maximum text length: 55


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# convert dataset to tensor
max_len = 55


def _pad_to_tensor(int_texts, max_len, pad_id):
    """Stack id sequences into a (len(int_texts), max_len) LongTensor.

    Sequences longer than ``max_len`` are truncated; shorter ones are
    right-padded with ``pad_id``.
    """
    padded = torch.zeros(len(int_texts), max_len, dtype=torch.long)
    for row, int_text in enumerate(int_texts):
        clipped = int_text[:max_len]
        padded[row] = torch.LongTensor(clipped + [pad_id] * (max_len - len(clipped)))
    return padded


# NOTE(review): token ids run from 1 to len(token2int), so this pad id is also
# the id of the last token that was registered. It is kept as-is because the
# saved checkpoints were presumably trained with the same padding value --
# confirm against the training notebook before changing it.
pad_id = len(token2int)

# Row counts come from the encoded lists, so re-running this cell after
# X_train / X_test have already been replaced by tensors still works.
X_train = _pad_to_tensor(int_X_train, max_len, pad_id)
Y_train = torch.LongTensor(Y_train)

print(X_train.size())
print(Y_train.size())

X_test = _pad_to_tensor(int_X_test, max_len, pad_id)
Y_test = torch.LongTensor(Y_test)

print(X_test.size())
print(Y_test.size())

torch.Size([5908, 55])
torch.Size([5908])
torch.Size([200, 55])
torch.Size([200])


In [13]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [40]:
import fasttext
import fasttext.util
# Load pre-trained FastText word embeddings
pretrained_embeddings = fasttext.load_model('../Model/fasttext/cc.en.50.bin')

# Create the embedding matrix.
# token2int ids run from 1 to len(token2int) (id 0 is never assigned), so the
# matrix needs len(token2int) + 1 rows. With only len(token2int) rows, writing
# the vector of the last registered token raises an IndexError.
EMBED_DIM = 50
# Build the membership set once: testing `token in model.words` scans a list
# on every lookup.
known_words = set(pretrained_embeddings.words)

embeddings = torch.zeros(len(token2int) + 1, EMBED_DIM)
for token, idx in token2int.items():
    if token in known_words:
        embeddings[idx] = torch.tensor(pretrained_embeddings[token])
# Tokens missing from the FastText vocabulary keep an all-zero row. The former
# second pass could only ever fire for the single id that fell outside the
# undersized matrix, and it appended an unseeded random row there; with the
# matrix sized correctly it is no longer needed.
embeddings = embeddings.to(device)



In [41]:
# CNN Model
import torch.nn.functional as F
import fasttext
import fasttext.util
class CNN(nn.Module):
    """Single-layer 1-D convolutional text classifier over pretrained embeddings.

    Args:
        vocab_size: Accepted for signature compatibility; the embedding table
            size is taken from ``pretrained_embeddings`` instead.
        embed_size: Width of one embedding vector (input channels of the conv).
        hidden_size: Number of convolution filters.
        num_classes: Number of output scores produced per input row.
        pretrained_embeddings: 2-D float tensor used to initialise the
            embedding layer through ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_classes, pretrained_embeddings):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable so saved
        # checkpoints continue to load.
        self.embed = nn.Embedding.from_pretrained(pretrained_embeddings)
        self.conv1 = nn.Conv1d(embed_size, hidden_size, kernel_size=2)
        self.dropout = nn.Dropout(.3)
        self.decision = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return a (batch, num_classes) score tensor for a (batch, seq_len) id tensor."""
        # Put the ids on the same device as the embedding weights so inputs
        # created on the CPU also work when the model lives on a GPU.
        x = x.to(self.embed.weight.device)
        embed = self.embed(x)
        # Conv1d expects (batch, channels, length), hence the transpose.
        conv1 = F.relu(self.conv1(embed.transpose(1, 2)))
        # Max-pool across the whole sequence: one value per filter.
        pool = F.max_pool1d(conv1, conv1.size(2))
        drop = self.dropout(pool)
        return self.decision(drop.view(x.size(0), -1))
    
# NOTE(review): num_classes is derived from X_train (the token-id tensor), not
# from Y_train, so the output layer gets one unit per distinct id value in the
# inputs. This looks like a typo for Y_train, but the checkpoint loaded further
# down was saved with this width (the recorded model summary shows
# out_features=5854), so changing it here would make load_state_dict fail.
# Fix it together with a retrained checkpoint.
cnn_model = CNN(vocab_size=len(token2int), embed_size=50,
                hidden_size=32, num_classes=len(X_train.unique()), pretrained_embeddings=embeddings)
# Keep every layer on the same device as the embedding tensor (mirrors lstm_model)
cnn_model.to(device)
# Set the embedding layer to non-trainable
cnn_model.embed.weight.requires_grad = False

In [22]:
# LSTM Model
class RNN(nn.Module):
    """Two-layer bidirectional LSTM text classifier over pretrained embeddings.

    Args:
        vocab_size: Accepted for signature compatibility; the embedding table
            size is taken from ``pretrained_embeddings`` instead.
        embed_size: Width of one embedding vector (LSTM input size).
        hidden_size: Hidden size of each LSTM direction.
        num_classes: Number of output scores produced per input row.
        pretrained_embeddings: 2-D float tensor used to initialise the
            embedding layer through ``nn.Embedding.from_pretrained``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_classes,  pretrained_embeddings):
        super().__init__()

        # Attribute names double as state_dict keys; keep them stable so saved
        # checkpoints continue to load.
        self.embed = nn.Embedding.from_pretrained(pretrained_embeddings)

        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers=2, bidirectional=True, batch_first=True)

        self.dropout = nn.Dropout(0.3)

        # The head takes hidden_size (not 2 * hidden_size) features because
        # forward() feeds it a single direction's final state.
        self.decision = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return a (batch, num_classes) score tensor for a (batch, seq_len) id tensor."""
        # Use the module's own device instead of a notebook-level global so the
        # class works wherever the model has been placed.
        x = x.to(self.embed.weight.device)
        embed = self.embed(x)

        # The LSTM returns (output, (h_n, c_n)); only h_n is used.
        output, (hidden, _) = self.rnn(embed)

        # hidden has one slice per layer and direction; the last slice is the
        # top layer's reverse-direction final state, not both directions.
        drop = self.dropout(hidden[-1])

        return self.decision(drop.view(x.size(0), -1))


# The LSTM was built with 32-dimensional embeddings (embed_size=32; the
# recorded model summary shows Embedding(6221, 32)), whereas the `embeddings`
# matrix built from FastText in this notebook is 50-dimensional. Passing that
# matrix would give the LSTM inputs of the wrong width and make load_state_dict
# fail on embed.weight. A zero placeholder of the right shape is used instead;
# its values are overwritten when the checkpoint is loaded below.
LSTM_EMBED_SIZE = 32
lstm_embeddings = torch.zeros(len(token2int) + 1, LSTM_EMBED_SIZE)

lstm_model = RNN(vocab_size=len(token2int) + 1, embed_size=LSTM_EMBED_SIZE, hidden_size=64, num_classes=len(Y_train.unique()),  pretrained_embeddings=lstm_embeddings)
lstm_model.to(device)
# Set the embedding layer to non-trainable
lstm_model.embed.weight.requires_grad = False

## Load Model and Make Prediction

In [42]:
## CNN model
import torch

# Load model
# map_location remaps the saved tensors onto the device in use, so a checkpoint
# written on a GPU machine also loads on a CPU-only one (the LSTM cell already
# passes map_location).
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
state_dict = torch.load("../Model/deep_model/cnn_1lay_50emb_32hidd_63acc.pt", map_location=device)

# Load the state into the model
cnn_model.load_state_dict(state_dict)

# Set the model to evaluation mode (disables dropout); the repr is the cell output
cnn_model.eval()

CNN(
  (embed): Embedding(6221, 50)
  (conv1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=32, out_features=5854, bias=True)
)

In [26]:
## LSTM model
import torch

# Read the saved weights, mapping every tensor onto the CPU
LSTM_CHECKPOINT = "../Model/deep_model/lstm_2lay_32emb_64hidd_65acc.pt"
state_dict = torch.load(LSTM_CHECKPOINT, map_location=torch.device('cpu'))

# Copy the weights into the network
lstm_model.load_state_dict(state_dict)

# Switch to evaluation mode (disables dropout); the repr is the cell output
lstm_model.eval()

RNN(
  (embed): Embedding(6221, 32)
  (rnn): LSTM(32, 64, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=64, out_features=2, bias=True)
)

In [43]:
from nltk import word_tokenize
sentence = "When they REALLY want you to advertise"

# Split the new sentence into word tokens
tokenized_new_sentence = word_tokenize(sentence)

# Map each lower-cased token to its existing id. .get() never registers a new
# entry in the defaultdict, so out-of-vocabulary tokens simply become 0.
int_tokens = [token2int.get(token.lower(), 0) for token in tokenized_new_sentence]
int_X_eval = [int_tokens]

# Show the tokens as the cell output
tokenized_new_sentence

['When', 'they', 'REALLY', 'want', 'you', 'to', 'advertise']

In [44]:
# Turn the encoded sentence into a fixed-width id tensor
max_len = 55
pad_id = len(token2int)

X_eval = torch.zeros(1, max_len, dtype=torch.long)

for row, ids in enumerate(int_X_eval):
    # Right-pad short sequences with pad_id, then cut everything to max_len.
    padding = [pad_id] * max(0, max_len - len(ids))
    X_eval[row] = torch.LongTensor((ids + padding)[:max_len])


In [45]:
## Test CNN model
print("The Sentence is :", sentence)
# Inference only: no gradients are needed
with torch.no_grad():
    y_scores = cnn_model(X_eval)

    # Index of the highest score in each row
    y_pred = torch.max(y_scores, dim=1).indices
    print("The Prediction is :", y_pred)

The Sentence is : When they REALLY want you to advertise
The Prediction is : tensor([0])


In [39]:
## Test LSTM model
print("The Sentence is :", sentence)
# Inference only: no gradients are needed
with torch.no_grad():
    y_scores = lstm_model(X_eval)

    # Index of the highest score in each row
    y_pred = torch.max(y_scores, dim=1).indices
    print("The Prediction is :", y_pred)

The Sentence is : When they REALLY want you to advertise
The Prediction is : tensor([0])


In [185]:
## Test CNN model
print("The Sentence is :", sentence)
with torch.no_grad():
    y_scores = cnn_model(X_eval)

    y_pred = torch.max(y_scores, 1)[1]
    print("The Prediction is :", y_pred)

    # NOTE(review): this is the position of the largest entry in the model's
    # output scores, i.e. an output-unit index. It is looked up in token2int
    # below as if it were a token id, which the model does not guarantee --
    # confirm what "token with the highest weight" is meant to report.
    max_weight_index = torch.argmax(y_scores)

# Compare plain ints in a single pass over the vocabulary instead of building
# two lists and calling list.index with a tensor.
top_index = max_weight_index.item()
max_weight_token = next(
    (token for token, index in token2int.items() if index == top_index),
    "Unknown Token",
)

# Print the token with the highest weight
print("Token with the highest weight:", max_weight_token)

The Sentence is : When they REALLY don't want you to advertise 😂😂😂
The Prediction is : tensor([0])
Token with the highest weight: Unknown Token
