In [None]:
import os
import time
import torch
import pickle
import datetime
import numpy as np
import warnings

import pandas as pd
# Newer pandas releases expose SettingWithCopyWarning from pandas.errors and no
# longer from pandas.core.common, so try the public location first and fall
# back to the legacy one for older installations.
try:
  from pandas.errors import SettingWithCopyWarning
except ImportError:
  try:
    from pandas.core.common import SettingWithCopyWarning
  except ImportError:
    # No such warning class in this pandas build: nothing to silence.
    SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
  warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Print a short banner, but only when a GPU is actually going to be used.
if device == 'cuda':
  print("=================================", "GPU found", sep="\n")
  print("Using GPU at cuda:",torch.cuda.current_device())
  print("=================================", " ", sep="\n")

In [None]:
# torch.load opens the given path verbatim, so the leading "~" must be expanded
# explicitly. The directory is resolved once and reused for every artefact.
hindi_dir = os.path.expanduser("~/NNTI-WS2021-NLP-Project/Project_files/Hindi")

# SECURITY: torch.load and pd.read_pickle both unpickle their input, which can
# execute arbitrary code. Only point them at files you produced yourself.
# map_location keeps the checkpoint loadable on a machine without a GPU.
model1 = torch.load(os.path.join(hindi_dir, "epoch_300.pt"), map_location=device)

# The checkpoint is indexed by parameter name. w1 is transposed so that its
# rows can be looked up by word index further down the notebook.
w1 = model1["w1.weight"].T
w2 = model1["w2.weight"]

# Cleaned corpus and vocabulary lookups saved by an earlier stage.
cleandata = pd.read_pickle(os.path.join(hindi_dir, "hindi_corpus_cleaned.pkl"))
word_index = pd.read_pickle(os.path.join(hindi_dir, "word_index.pkl"))
index_word = pd.read_pickle(os.path.join(hindi_dir, "index_word.pkl"))
V = pd.read_pickle(os.path.join(hindi_dir, "vocab.pkl"))

In [None]:
# Raw labelled tweets (tab-separated) fetched from the project repository.
data = pd.read_csv('https://raw.githubusercontent.com/SouravDutta91/NNTI-WS2021-NLP-Project/main/data/hindi_hatespeech.tsv',sep='\t')

# Work on an explicit copy: assigning columns onto a slice of `data` is what
# triggers pandas' SettingWithCopyWarning, and the copy removes the ambiguity
# instead of relying on the warning being silenced.
text = data[['text','task_1']].copy()

# Replace the raw text with the whitespace-tokenised cleaned corpus
# (rows are aligned on the index of `cleandata`).
text['text'] = cleandata['text'].apply(lambda x: x.split())

# Binary target: 1 for 'HOF', 0 for every other value of task_1.
text['label'] = text['task_1'].apply(lambda x: 1 if x == 'HOF' else 0)

# Longest token list; later used as the padding length.
max_len = text.text.str.len().max()

In [None]:
def tag_count(input):
  """Count how many tags equal 1 and how many do not.

  Args:
    input: iterable of tags.

  Returns:
    Tuple ``(hcount, ncount)``: the number of tags equal to 1, followed by
    the number of all remaining tags.
  """
  # Slot 0 collects tags equal to 1, slot 1 collects everything else.
  tallies = [0, 0]
  for tag in input:
    tallies[0 if tag == 1 else 1] += 1
  return tallies[0], tallies[1]

# Register the padding token in both lookup directions, using len(V) as its index.
# NOTE(review): assumes the existing word indices occupy 0..len(V)-1 so that
# len(V) is still free — confirm against how word_index.pkl was built.
word_index['<pad>'] = len(V)
index_word[len(V)] = '<pad>'


def get_word_embedding(input):
  """Return the row of ``w1`` stored at the vocabulary index of ``input``.

  The word is looked up in ``word_index``; a word missing from that mapping
  makes the lookup itself fail, nothing is caught here.
  """
  return w1[word_index[input]]

def encode(corpus, pad_to=None, vocab=None):
  """Turn tokenised sentences into a rectangular array of word indices.

  Every sentence is right-padded with ``'<pad>'`` up to the target length
  (and cut to that length if it is longer) and then mapped through the
  vocabulary. The sentences in ``corpus`` are left untouched; the previous
  implementation padded the caller's lists in place.

  Args:
    corpus: iterable of token sequences.
    pad_to: target sentence length; defaults to the notebook-level ``max_len``.
    vocab: word -> index mapping that contains ``'<pad>'``; defaults to the
      notebook-level ``word_index``.

  Returns:
    ``np.ndarray`` with one row of indices per sentence.

  Raises:
    KeyError: if a token is missing from a dict vocabulary.
  """
  target_len = max_len if pad_to is None else pad_to
  lookup = word_index if vocab is None else vocab

  sent_idx = []
  for sentence in corpus:
    # Copy before padding so the input sentence is never mutated.
    padded = list(sentence)[:target_len]
    padded += ['<pad>'] * (target_len - len(padded))
    sent_idx.append([lookup[word] for word in padded])
  return np.array(sent_idx)


from itertools import islice
def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list (fewer if it runs out)."""
    head = islice(iterable, n)
    return [item for item in head]

In [None]:
# Embedding width is the second dimension of w1; the two-name unpacking also
# requires w1 to be exactly two-dimensional.
_ , emb_size = w1.shape

def matrix_embeddings():
  """Assemble the embedding table used to initialise the model.

  Rows start as uniform noise in [-0.25, 0.25), the ``'<pad>'`` row is set to
  zeros, and the rows belonging to the first ``len(V)`` entries of
  ``word_index`` are overwritten with their vectors from ``w1``.

  Returns:
    ``np.ndarray`` of shape ``(len(word_index), emb_size)``.
  """
  n_rows = len(word_index)
  table = np.random.uniform(-0.25, 0.25, (n_rows, emb_size))
  table[word_index['<pad>']] = np.zeros((emb_size,))

  # Only the first len(V) (word, index) pairs are copied over.
  vocab_items = take(len(V), word_index.items())
  for word, row in vocab_items:
    vector = get_word_embedding(word)
    if vector is None:
      continue
    table[row] = vector.cpu()

  return table

# Embedding table as a tensor; the model later casts it to float32.
embeds = torch.tensor(matrix_embeddings())

# Padded index matrix for every sentence, plus the matching 0/1 targets.
encoded_text = encode(text['text'])
labels = np.array(text['label'])

In [None]:
# One row per sentence: its encoded index vector next to its label.
df = pd.DataFrame({
    'encoded': [row for row in encoded_text],
    'label': labels,
})

In [None]:
# Hold out 10% of the encoded sentences for testing; the fixed seed keeps the
# shuffled split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    encoded_text,
    labels,
    test_size=0.1,
    random_state=45,
    shuffle=True,
)

In [None]:
# Mini-batch size handed to the data loaders and used when the training loop
# initialises the LSTM hidden state.
batch_size = 10

from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,SequentialSampler)

def get_dataloader(traindata,trainlabels,testdata,testlabels,batchsize):
   """Wrap the train and test splits in randomly sampled DataLoaders.

   Args:
      traindata, testdata: array-likes of encoded sentences (converted to float tensors).
      trainlabels, testlabels: array-likes of targets (converted with ``torch.tensor``).
      batchsize: number of samples per batch.

   Returns:
      ``(train_dataload, test_dataload)``. Both loaders sample at random and
      drop the final batch when it is smaller than ``batchsize``.
   """
   def _as_loader(features, targets):
      # Features become float32; targets keep the dtype torch.tensor infers.
      dataset = TensorDataset(torch.tensor(features).float(), torch.tensor(targets))
      return DataLoader(
         dataset,
         sampler=RandomSampler(dataset),
         batch_size=batchsize,
         drop_last=True,
      )

   test_dataload = _as_loader(testdata, testlabels)
   train_dataload = _as_loader(traindata, trainlabels)
   return train_dataload, test_dataload

# Build both loaders from the split arrays using the shared batch size.
train_dataload, test_dataload = get_dataloader(
    traindata=xtrain,
    trainlabels=ytrain,
    testdata=xtest,
    testlabels=ytest,
    batchsize=batch_size,
)

In [None]:
class lstmmodel(nn.Module):
  """LSTM sentence classifier on top of frozen pretrained word embeddings.

  Token ids are embedded and run through an LSTM; every time step then goes
  through dropout, a linear layer and a sigmoid. ``forward`` returns, for each
  sequence, the last sigmoid value of the last time step.
  """

  def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5,
               pretrained_embeddings=None):
    """Build the layers.

    Args:
      vocab_size: accepted for interface compatibility; the embedding table
        size is taken from the pretrained matrix.
      output_size: number of units of the final linear layer.
      embedding_dim: width of the embedding vectors (LSTM input size).
      hidden_dim: LSTM hidden size.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout between stacked LSTM layers (only used when
        ``n_layers > 1``).
      pretrained_embeddings: 2-D tensor of embedding vectors; defaults to the
        notebook-level ``embeds``.
    """
    super().__init__()

    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    # Pretrained embeddings, frozen so that training does not update them.
    if pretrained_embeddings is None:
      pretrained_embeddings = embeds
    self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings.type(torch.float32), freeze=True)

    # nn.LSTM applies dropout only between stacked layers; passing a non-zero
    # value with a single layer has no effect and just emits a warning.
    lstm_dropout = drop_prob if n_layers > 1 else 0.0
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=lstm_dropout, batch_first=True)

    # Dropout applied to the LSTM outputs before the classifier.
    self.dropout = nn.Dropout(0.3)

    # Fully connected layer followed by a sigmoid.
    self.fc = nn.Linear(hidden_dim, output_size)
    self.sig = nn.Sigmoid()

  def forward(self, x, hidden):
    """Run a batch of token-id sequences through the network.

    Args:
      x: tensor of token ids, batch first; cast to ``long`` before the lookup.
      hidden: ``(h, c)`` tuple as produced by ``init_hidden``.

    Returns:
      ``(sig_out, hidden)``: one sigmoid value per sequence and the updated
      LSTM state.
    """
    batch_size = x.size(0)
    x = x.long()
    embed = self.embedding(x)
    lstm_out, hidden = self.lstm(embed, hidden)

    # Flatten batch and time so the classifier is applied to every time step.
    lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
    out = self.dropout(lstm_out)
    out = self.fc(out)
    sig_out = self.sig(out)

    # Regroup per sequence and keep only the very last value, i.e. the last
    # output unit of the last time step.
    sig_out = sig_out.view(batch_size, -1)
    sig_out = sig_out[:, -1]

    return sig_out, hidden

  def init_hidden(self, batch_size):
    """Return zero-filled ``(h, c)`` LSTM states for ``batch_size`` sequences.

    The states are created with the dtype and on the device of the model's
    own parameters, so this works on both CPU and GPU.
    """
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.hidden_dim)
    return (weight.new_zeros(shape), weight.new_zeros(shape))

In [None]:
# Table size and vector width are read straight off the embedding matrix.
vocab_size, embedding_dim = embeds.shape

# Remaining architecture settings.
output_size = 2
hidden_dim = 256
n_layers = 1

model = lstmmodel(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)

print(model)

In [None]:
# loss and optimization functions
# Binary cross-entropy on the sigmoid output, optimised with Adam.
lr = 0.05
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.BCELoss()

In [None]:
# Move the model and the loss module to the device chosen at the top of the notebook.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
epochs = 10
counter = 0
print_every = 50
clip = 5  # maximum gradient norm

# Follow the device picked at the top of the notebook instead of assuming CUDA.
model.to(device)

model.train()

# Per-step losses for the whole run. Resetting this list inside the epoch loop
# would leave only the final epoch in the plot below.
tot = []
for e in range(epochs):
  # Fresh zero state at the start of every epoch.
  h = model.init_hidden(batch_size)
  # The batch variable is called `targets` so the notebook-level `labels`
  # array (used by the split cell) is not overwritten by the last batch.
  for inputs, targets in train_dataload:

    counter += 1

    inputs, targets = inputs.to(device), targets.to(device)

    # Detach the carried state so back-propagation stops at the batch boundary.
    h = tuple([each.detach() for each in h])

    model.zero_grad()

    output, h = model(inputs, h)
    loss = criterion(output.squeeze(), targets.float())
    loss.backward()

    # In-place variant; the un-suffixed clip_grad_norm is deprecated.
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    tot.append(loss.item())
    if counter % print_every == 0:
      print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()))
plt.plot(tot)

In [None]:
test_losses = []
num_correct = 0
num_seen = 0  # samples actually evaluated (the loader may drop a partial batch)

# Evaluate on whichever device the model currently lives on.
eval_device = next(model.parameters()).device

# Size the hidden state from the loader itself rather than a literal 10.
h = model.init_hidden(test_dataload.batch_size)
model.eval()

# No gradients are needed for evaluation.
with torch.no_grad():
  # `targets` (not `labels`) so the notebook-level label array is left intact.
  for inputs, targets in test_dataload:
    # NOTE(review): the LSTM state is carried from one test batch to the next,
    # mirroring the training loop — confirm this is intended.
    h = tuple([each.detach() for each in h])

    inputs, targets = inputs.to(eval_device), targets.to(eval_device)

    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), targets.float())
    test_losses.append(test_loss.item())

    # Threshold the sigmoid output at 0.5 and compare with the targets.
    pred = torch.round(output.squeeze())
    correct_tensor = pred.eq(targets.float().view_as(pred))
    num_correct += int(correct_tensor.sum().item())
    num_seen += targets.size(0)

mean_test_loss = np.mean(test_losses) if test_losses else float('nan')
print("Test loss: {:.3f}".format(mean_test_loss))

# Divide by the samples that were really scored; len(test_dataload.dataset)
# also counts samples discarded by drop_last and would understate accuracy.
test_acc = (num_correct / num_seen * 100) if num_seen else float('nan')
print("Test accuracy: {:.3f}".format(test_acc))