In [None]:
!pip install scikit-learn numpy

In [15]:
import json
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [16]:
def data_reading(filepath):
  max_len = 0
  data_chunk, data_postags = [],[]
  with open(filepath, 'r') as file:
      for item in file:
          sent = json.loads(item)
          max_len = max(len(sent['tokens']),max_len)
          data_chunk.append(sent['chunk_tags'])
          data_postags.append(sent['pos_tags'])
  return data_chunk,  data_postags, max_len

In [17]:
def data_processing(data_chunk,  data_postags, m_sen_len):
  data_postags_final = []
  data_postags_final_pad = []
  data_postags_pad = []
  data_chunk_pad = []
  for i in range(len(data_postags)):
      sentence = []
      sentence_pad = []
      prev_word_oh = [1,0,0,0,0]
      data_postags_pad.append(data_postags[i]+ [4]*(m_sen_len - len(data_postags[i])))
      data_chunk_pad.append(data_chunk[i] + [1]*(m_sen_len - len(data_chunk[i])))
      for j in range(len(data_postags_pad[i])):
          word_oh = [0]*5
          word_oh[data_postags_pad[i][j]] = 1
          sentence.append(prev_word_oh + word_oh[1:])
          prev_word_oh = word_oh


      for j in range(len(data_postags[i])):
          word_oh = [0]*5
          word_oh[data_postags[i][j]] = 1
          sentence_pad.append(prev_word_oh + word_oh[1:])
          prev_word_oh = word_oh

      data_postags_final_pad.append(sentence)
      data_postags_final.append(sentence_pad)


  data_postags_final_pad = np.array(data_postags_final_pad)
  data_chunk_pad = np.array(data_chunk_pad)

  return data_postags_final, data_postags_final_pad, data_chunk_pad

In [18]:
def paramterinit():
  np.random.seed(1)
  W = np.random.uniform(-1,1,(1,9))
  bias = np.random.uniform(-1,1,(1,1))
  W_feedb = np.random.uniform(-1,1,(1,1))
  return W, bias, W_feedb

In [19]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [20]:
def sigmoid_derivative(z):
    return sigmoid(z)*(1 - sigmoid(z))

In [21]:
def forward(W, bias, W_feedb, X, max_sen_len):

   yhat = np.zeros((len(X), max_sen_len))
   sent_dim = len(X[0,:,0])

   for i in range(sent_dim):
      if i == 0:
         yhat_prev = np.zeros((len(X),1))
         yhat = (sigmoid(np.dot(W,X[:,i,:].T) + np.dot(W_feedb,yhat_prev.T) + bias)).T
      else:
         yhat_prev = sigmoid( np.dot( W , X[:,i,:].T ) + np.dot(W_feedb,yhat_prev.T) + bias).T
         yhat = np.concatenate((yhat,yhat_prev), axis =1)

   return yhat

In [22]:
def parameterupdate(W, W_feedb, bias, dW, dW_feedb, dbias, N, lr):

  W -= lr*dW/N
  W_feedb -= lr*dW_feedb/N
  bias -= lr*dbias/N

  return W, W_feedb, bias

In [23]:
def BPTT(W, bias, W_feedb, X, y, lr, max_sen_len):

  N = len(X)
  dW = np.zeros_like(W)
  dW_feedb = np.zeros_like(W_feedb)
  dbias = np.zeros_like(bias)

  yhat = forward(W, bias, W_feedb,X,max_sen_len)

  yhat_prev = np.zeros_like(yhat)
  yhat_prev[:,1:] = yhat[:,:-1]
  y = np.array(y)

  for i in range(max_sen_len):
      if i == 0:
          delta_W = X[:,i,:]
          delta_W_feedb = np.zeros((N,1))
          delta_bias = np.ones((N,1))
      else:
          prev = yhat_prev[:,i].reshape((len(X),1))
          delta_W = X[:,i,:] + np.dot(W_feedb, sigmoid_derivative(prev).T).T * delta_W
          delta_bias = np.ones((len(X),1)) + W_feedb* sigmoid_derivative(prev)*delta_bias
          delta_W_feedb = prev + W_feedb* sigmoid_derivative(prev) * delta_W_feedb


      dl = -(y[:,i] - yhat[:,i])
      dW += np.dot(delta_W.T, dl)
      dW_feedb += np.dot(delta_W_feedb.T,dl)
      dbias += np.dot(delta_bias.T, dl)

  W, W_feedb, bias = parameterupdate(W, W_feedb, bias, dW, dW_feedb, dbias, N, lr)
  return W, W_feedb, bias

In [24]:
def predict(W, bias, W_feedb, sent_data, max_sen_len):
    pred = []
    for i in range(len(sent_data)):
        input_sent = np.array([sent_data[i]])
        y_pred = forward(W, W_feedb, bias, input_sent, max_sen_len )
        y_pred = np.where(y_pred <= 0.5, 0, 1).tolist()[0]
        pred.append(y_pred)
    return pred


In [25]:
def compute_metric(output, predicted):
  accuracy = accuracy_score(output, predicted)*100
  precision = precision_score(output, predicted)
  recall = recall_score(output, predicted)
  f1 = f1_score(output, predicted)
  return accuracy, precision, recall, f1


In [26]:
def testing(chunks_test, postags_test, max_sen_len, param):

  test_postags_final, test_postags_final_pad, test_chunk_pad = data_processing(chunks_test, postags_test, max_sen_len)
  W_final = param[str(1)][0]
  W_feedb_final = param[str(1)][1]
  bias_final = param[str(1)][2]
  test_prediction = predict(W_final, W_feedb_final, bias_final, test_postags_final, max_sen_len)

  test_prediction = np.array([one_token for one_sent in test_prediction for one_token in one_sent])
  y_test_observed = np.array([one_token for one_sent in chunks_test for one_token in one_sent])
  accuracy, precision, recall, f1 = compute_metric(y_test_observed, test_prediction)
  print("Test set metric: \n")
  print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n')


In [27]:
def model(train_path,test_path, lr, epoch, batch_size, num_splits):

  chunks_train, postags_train, max_len_train = data_reading(train_path)
  chunks_test, postags_test, max_len_test = data_reading(test_path)

  max_sen_len = max(max_len_train,max_len_test)

  train_postags_final, train_postags_final_pad, train_chunk_pad = data_processing(chunks_train, postags_train, max_sen_len)


  kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

  param = {}

  for fold, (train_indices, test_indices) in enumerate(kf.split(train_postags_final, train_chunk_pad)):

      print(f"\nFold: {fold + 1}/{num_splits}")

      X_train, X_test = train_postags_final_pad[train_indices], train_postags_final_pad[test_indices]
      y_train, y_test = train_chunk_pad[train_indices], train_chunk_pad[test_indices]
      X_train_act, train_groundtruth = [], []
      X_test_act, val_groundtruth = [], []

      for ti,tj in zip(train_indices,test_indices):
          X_train_act.append(train_postags_final[ti])
          X_test_act.append(train_postags_final[tj])
          train_groundtruth.append(chunks_train[ti])
          val_groundtruth.append(chunks_train[tj])

      num_batches = len(X_train//batch_size)


      W, bias, W_feedb = paramterinit()
      for i in range(epoch):

        for batch_num in range(num_batches):

            start_index = batch_num * batch_size
            end_index = (batch_num + 1) * batch_size
            batch_data_X = X_train[start_index:end_index]
            batch_data_y = y_train[start_index:end_index]
            if len(batch_data_X) == 0:
                break
            W, W_feedb, bias = BPTT(W, bias, W_feedb, batch_data_X, batch_data_y, lr, max_sen_len)

            #storing all parameter
            param[str(fold+1)] = [W, W_feedb, bias]

        if i % 10 == 0:

              #Train set metric calculation
              train_prediction = predict(W, W_feedb, bias, X_train_act, max_sen_len)

              train_prediction = np.array([one_token for one_sent in train_prediction for one_token in one_sent])
              y_train_observed = np.array([one_token for one_sent in train_groundtruth for one_token in one_sent])
              accuracy, precision, recall, f1 = compute_metric(y_train_observed, train_prediction)
              print("Training metric: \n")
              print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n')

              #Validation set calculation
              val_prediction = predict(W, W_feedb, bias,X_test_act, max_sen_len)

              val_prediction = np.array([one_token for one_sent in val_prediction for one_token in one_sent])
              y_test_observed = np.array([one_token for one_sent in val_groundtruth for one_token in one_sent])
              accuracy, precision, recall, f1 = compute_metric(y_test_observed, val_prediction)
              print("Validation metric: \n")
              print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\n')
              print("***********************************************")

  testing(chunks_test, postags_test, max_sen_len, param)
  return param


In [28]:
param = model("train.jsonl","test.jsonl", 0.001, 5, batch_size=25,num_splits = 5 )


Fold: 1/5
Training metric: 

Accuracy: 83.3161, Precision: 0.8074, Recall: 0.9971, F1 Score: 0.8923

Validation metric: 

Accuracy: 81.8306, Precision: 0.7888, Recall: 0.9977, F1 Score: 0.8811

***********************************************

Fold: 2/5
Training metric: 

Accuracy: 83.7274, Precision: 0.8116, Recall: 0.9976, F1 Score: 0.8951

Validation metric: 

Accuracy: 81.3492, Precision: 0.7840, Recall: 0.9971, F1 Score: 0.8778

***********************************************

Fold: 3/5
Training metric: 

Accuracy: 83.5965, Precision: 0.8098, Recall: 0.9975, F1 Score: 0.8939

Validation metric: 

Accuracy: 81.3315, Precision: 0.7851, Recall: 0.9968, F1 Score: 0.8784

***********************************************

Fold: 4/5
Training metric: 

Accuracy: 83.5898, Precision: 0.8100, Recall: 0.9975, F1 Score: 0.8940

Validation metric: 

Accuracy: 81.1484, Precision: 0.7827, Recall: 0.9967, F1 Score: 0.8768

***********************************************

Fold: 5/5
Training metric: 

In [29]:
param

{'1': [array([[ 0.64471707, -0.83021671, -1.56965126, -0.9094121 ,  1.67792237,
          -1.92840973, -0.40959369, -0.79055402,  2.01067215]]),
  array([[0.31102746]]),
  array([[0.91789416]])],
 '2': [array([[ 0.64256646, -0.82086346, -1.57656429, -0.9121565 ,  1.68059044,
          -1.92611296, -0.40344594, -0.80099216,  2.01287906]]),
  array([[0.3122486]]),
  array([[0.91810745]])],
 '3': [array([[ 0.64455461, -0.82061118, -1.58041056, -0.91404192,  1.68297348,
          -1.92969613, -0.40329982, -0.80254685,  2.01676258]]),
  array([[0.31434993]]),
  array([[0.91699923]])],
 '4': [array([[ 0.6429678 , -0.82679645, -1.57488024, -0.90904204,  1.67871484,
          -1.93362447, -0.40328621, -0.80110701,  2.01773694]]),
  array([[0.31354616]]),
  array([[0.9154987]])],
 '5': [array([[ 0.64263403, -0.8185414 , -1.57328427, -0.9097815 ,  1.6762571 ,
          -1.92425472, -0.40621619, -0.80378743,  2.02029765]]),
  array([[0.31111112]]),
  array([[0.92181876]])]}