In [None]:
!pip install transformers



In [None]:
# imports 
import re
import numpy as np
import torch
import pandas as pd

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = "cpu"
print(device)

cpu


In [None]:
import os
import pandas as pd

directory = '/content/hindi_coref_data'

datasets = []
for file in os.listdir(directory):
    if file.endswith(".csv"):
      df = pd.read_csv(os.path.join(directory, file))
      datasets.append(df)

print(len(datasets))

275


In [None]:
data = datasets[10]
data.columns = ["word", "cref", "crefHead", "acrefmod", "acrefmodHead", "crefmod", "creftype", "Chainhead"]
print(data)

            word     cref        crefHead acrefmod acrefmodHead crefmod  \
0             के        _               _        _            _       _   
1    मुख्यमंत्री  i2%1:t2  मुख्यमंत्री:i2        _            _   m1:i2   
2        नरेंद्र  i3%0:t2               _        _            _       _   
3           मोदी  i3%1:t2         मोदी:i3        _            _       _   
4             के        _               _        _            _       _   
..           ...      ...             ...      ...          ...     ...   
344           का        _               _        _            _       _   
345       नुकसान        _               _        _            _       _   
346            न        _               _        _            _       _   
347       पहुंचे        _               _        _            _       _   
348            ।        _               _        _            _       _   

                          creftype Chainhead  
0                                _         _  
1    

In [None]:
from transformers import AutoModel, AutoTokenizer

# load the murilbert model
path = 'google/muril-base-cased'

tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path,
                                  output_hidden_states=True # Whether the model returns all hidden-states.
                                  ) 
model.to(device)

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(197285, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
def prepare_data(data, start):
  # data is taken sentence wise, and stored in the texts array
  texts = []
  text = ""
  tags = []
  input_tokens = []

  for i in range(start, len(data)):
    if str(data["word"][i]) == "।" :
      text += str(data["word"][i])
      texts.append(text)
      input_tokens += (tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
      text = ""
    else:
      text += str(data["word"][i]) + " "
      tags.append(data["cref"][i])

  return texts, tags, input_tokens

def get_word_vectors(texts):
  " function to get hindi word vectors from murilbert "

  outputs = []

  for text in texts:
    # encoded input with input ids, token type ids and attention mask
    input_encoded = tokenizer.encode_plus(text, return_tensors="pt")
    # input_encoded.to(device)

    # obtain and take the sum of all 13 states of BERT output
    with torch.no_grad():
            states = model(**input_encoded).hidden_states

    output = torch.stack([states[i] for i in range(len(states))]).sum(dim = 0)
    output = output.squeeze()
    outputs.append(output)

  return torch.cat(outputs, dim = 0)

In [None]:
# do this file wise
def map_mentions(data, start):
  " function to make a list of mentions, along with a list of their corresponding cluster ids"
  mentions = []
  mention_ids = [1]
  mention = ""
  count = 1
  for i in range(start, len(data)):
    tag = data["cref"][i]
    if (tag[0] == "i"):
      idx = (re.search('i(\d*)%', tag)).group(1)
      mention_idx = (re.search('t(.*)', tag)).group(1) 
      if int(idx) == count:
        mention += str(data["word"][i]) + " "
      else:
        mentions.append(mention)
        mention_ids.append(mention_idx)
        mention = str(data["word"][i]) + " "
        count += 1

  return mentions, mention_ids

In [None]:
def getvec(output, mention):
  " function to get the vector of a mention, it takes the average of all word vectors in the mention "
  vec = torch.zeros(768)
  tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(mention))
  count = 1
  for i in range(1, len(tokens) - 1):
    try: 
      idx = input_tokens.index(tokens[i])
    except:
      idx = 1
    # print(idx)
    vec = vec.to(device)
    vec = torch.add(vec, output[idx].to(device))
    count += 1
  
  return torch.div(vec, count).to(device)

In [None]:
def make_mention_pairs(embeds, mentions, mention_ids):
  " function to make a list of mention pairs and their true/false values "
  x_train = torch.empty(0).to(device)
  y_train = torch.empty(0).to(device)

  for i in range(len(mentions)):
    for j in range(i + 1, len(mentions)):
      x_train = torch.cat((x_train, (getvec(embeds, mentions[i]) + getvec(embeds, mentions[j]))) , 0)
      if mention_ids[i] == mention_ids[j]:
        y_train = torch.cat((y_train, torch.tensor([1]).to(device)))
      else:
        y_train = torch.cat((y_train, torch.tensor([0]).to(device)))

  x_train = x_train.reshape(-1, 768)

  return x_train, y_train

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class Lstm(nn.Module):
    def __init__(self):
        super(Lstm, self).__init__()
        self.embedding_dim = 768
        self.num_layers = 2

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.embedding_dim,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        
    def forward(self, x, prev_state):
        embed = x
        output, state = self.lstm(embed, prev_state)
        return output, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers,1, self.embedding_dim),
                torch.zeros(self.num_layers,1, self.embedding_dim))

In [None]:
#defining the network
from torch import nn
from torch.nn import functional as F

class BC_Model(nn.Module):
  def __init__(self,input_shape):
    super(BC_Model,self).__init__()
    self.fc1 = nn.Linear(input_shape,32)
    self.fc2 = nn.Linear(32,64)
    self.fc3 = nn.Linear(64,1)  
  
  def forward(self,x): 
    # print(x.shape)
    x = torch.relu(self.fc1(x))
    x = torch.relu(self.fc2(x))
    x = torch.sigmoid(self.fc3(x))
    return x

In [None]:
#defining dataset class
from torch.utils.data import Dataset, DataLoader
class dataset(Dataset):
  def __init__(self,x,y):
    self.x = torch.tensor(x,dtype=torch.float32)
    self.y = torch.tensor(y,dtype=torch.float32)
    self.length = self.x.shape[0]
 
  def __getitem__(self,idx):
    return self.x[idx],self.y[idx]  

  def __len__(self):
    return self.length

In [None]:
#hyper parameters
learning_rate = 0.01
epochs = 2

# Models , Optimizer, Loss
lstm = Lstm().to(device)
bc_model = BC_Model(input_shape=768).to(device)
optimizer1 = torch.optim.SGD(bc_model.parameters(), lr=learning_rate)
optimizer2 = optim.Adam(lstm.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

In [None]:

  # mentions, mention_ids = map_mentions(data, 0)
  # x, y = make_mention_pairs(bert_embeddings, mentions, mention_ids)
  # print(x.shape, y.shape)
  # trainset = dataset(x,y)
  # trainloader = DataLoader(trainset,batch_size=64,shuffle=False)

In [None]:
def train(data, embeds, epochs):
  lstm.train()
  bc_model.train()
  
  mentions, mention_ids = map_mentions(data, 0)
  running_loss = 0
  prev_loss = 0
  for epoch in range(epochs):
    optimizer1.zero_grad()
    optimizer2.zero_grad()
    # get the lstm embeddings 
    state_h, state_c = lstm.init_state(0)
    outputs = []
    # loop over all the embeddings and pass them through the lstm
    for i in range(len(embeds)):
      # print(i, embeds[i][:2])
      y_pred, (state_h, state_c) = lstm(embeds[i].view(1, 1, -1), (state_h, state_c))
      outputs.append(state_h[0][0])
  
    # outputs contains the lstm embeds, use these to get the new embeds and pass them through the binary classifier
    running_loss = 0

    # train loader should have output embeds from lstm
    
    x, y = make_mention_pairs(outputs, mentions, mention_ids)
    trainset = dataset(x,y)
    trainloader = DataLoader(trainset,batch_size=64,shuffle=False)

    for j,(x_train,y_train) in enumerate(trainloader):
    
      #calculate output
      output = bc_model(x_train)
  
      #calculate loss
      loss = loss_fn(output,y_train.reshape(-1,1))

      #backprop
      loss.backward()
      optimizer1.step()
      optimizer2.step()

      running_loss += loss.item()

    avg_loss = running_loss/ len(trainloader)
    print("epoch {}\tloss : {}".format(epoch,avg_loss))

In [None]:
for data in datasets[1:2]:
  data.columns = ["word", "cref", "crefHead", "acrefmod", "acrefmodHead", "crefmod", "creftype", "Chainhead"]
  texts, tags, input_tokens = prepare_data(data, 0)
  # print(len(tags))
  bert_embeddings = get_word_vectors(texts)
  print(bert_embeddings.shape)
  train(data, bert_embeddings, 5)

torch.Size([524, 768])


  """
  


epoch 0	loss : 0.17533620144240558
epoch 1	loss : 0.16641665160256838
epoch 2	loss : 0.16221322411937372
epoch 3	loss : 0.1494382724132655
epoch 4	loss : 0.16034334113854648


## Testing

In [None]:
def lstm_output(embeds, lstm, data):
  lstm.eval()

  state_h, state_c = lstm.init_state(0)
  outputs = []
  # loop over all the embeddings and pass them through the lstm
  for i in range(len(embeds)):
    y_pred, (state_h, state_c) = lstm(embeds[i].view(1, 1, -1), (state_h, state_c))
    outputs.append(state_h[0][0])
    
  return outputs

In [None]:
x_test = torch.empty(0).to(device)
y_test = torch.empty(0).to(device)

for data2 in datasets[223:224]:
  try:
    data2.columns = ["word", "cref", "crefHead", "acrefmod", "acrefmodHead", "crefmod", "creftype", "Chainhead"]

    texts, tags, input_tokens = prepare_data(data2, 0)

    # print(len(texts), texts)
    embeds = get_word_vectors(texts)
    print(embeds.shape)

    mentions, mention_ids = map_mentions(data2, 0)
    outputs = lstm_output(embeds, lstm, data2)
    print(len(outputs))
    x, y = make_mention_pairs(outputs, mentions, mention_ids)
    print(x.shape, y.shape)

    x_test = torch.cat((x_test, x.to(device)))
    y_test = torch.cat((y_test, y.to(device)))
  except:
    continue

x_test = x_test.reshape(-1, 768)

torch.Size([423, 768])
423
torch.Size([1891, 768]) torch.Size([1891])


In [None]:
testset = dataset(x_test,y_test)
testloader = DataLoader(testset,batch_size=1,shuffle=False)

  """
  


In [None]:
predicted_vals = []
actual_vals = []
bc_model.eval()

for i,(x_test,y_test) in enumerate(testloader):
  
  #calculate output
  output = bc_model(x_test)
  print(output)
  if output >= 0.028:
    predicted_vals.append(1)
  else:
    predicted_vals.append(0)
  actual_vals.append( int(y_test.item()) )

predicted_vals = np.array(predicted_vals)
actual_vals = np.array(actual_vals)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# predicted_vals
# actual_vals
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(actual_vals, predicted_vals)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(actual_vals, predicted_vals, average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(actual_vals, predicted_vals, average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(actual_vals, predicted_vals, average='weighted')
print('F1 score: %f' % f1)
 
# confusion matrix
matrix = confusion_matrix(actual_vals, predicted_vals)
print(matrix)   

Accuracy: 0.899524
Precision: 0.809144
Recall: 0.899524
F1 score: 0.851943
[[1701    0]
 [ 190    0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(roc_auc_score(actual_vals, predicted_vals))

0.5


In [None]:
torch.save(bc_model, "hin_coref_lstm")