In [1]:
# @title Install Transformers
# `clear_output` is imported in this cell (ahead of the main import cell)
# because it is used right below to hide pip's install log.
from IPython.display import clear_output
# NOTE(review): presumably pinned because later cells use 2.x-era arguments
# such as `pad_to_max_length=True` — confirm before bumping the version.
!pip install transformers==2.11

clear_output()

In [2]:
# @title Import Requirements
import os
import numpy as np
import pandas as pd
from transformers import *
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from scipy import cluster as clst
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA


In [3]:
# @title Loading BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Start from the checkpoint's own configuration instead of a default-constructed
# BertConfig, and use BERT's names for the dropout settings
# (`hidden_dropout_prob` / `attention_probs_dropout_prob`). The previous
# `dropout=` / `attention_dropout=` keywords are DistilBERT's names and were
# stored on the config without ever being read by the BERT layers.
# Dropout is only active in training mode, so the frozen, inference-only use of
# the encoder in this notebook is unaffected.
config = BertConfig.from_pretrained(
    casing,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    output_hidden_states=True,
)

# The encoder is used purely as a frozen feature extractor.
model = TFBertModel.from_pretrained(casing, config=config)
model.trainable = False

clear_output()

In [None]:
# @title Loading Dataset

# STS benchmark (tab-separated files). Only the dev split is loaded; the
# train/test reads are kept commented out.
# NOTE(review): `error_bad_lines=False` makes pandas skip rows it cannot parse
# instead of raising, so `df_dev` may contain fewer pairs than the file —
# confirm the row count. The keyword was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and removed in 2.0; revisit if pandas is upgraded.
#df_train = pd.read_csv('train.tsv', delimiter='\t' , error_bad_lines=False)
df_dev = pd.read_csv('dev.tsv', delimiter='\t' , error_bad_lines=False)
#df_test = pd.read_csv('test.tsv', delimiter='\t' , error_bad_lines=False)

In [4]:
# @title Required functions

def sper_corrcoef(targets, predictions):
    """Spearman rank correlation between two score sequences, times 100.

    Args:
        targets: gold scores, one value per item.
        predictions: predicted scores, same length as ``targets``.

    Returns:
        100 * Spearman's rho, i.e. a value between -100 and 100.
    """
    # Import the submodule explicitly: `import scipy as sc` on its own does not
    # guarantee that `sc.stats` has been loaded.
    from scipy import stats

    return 100 * stats.spearmanr(targets, predictions)[0]


def mean_pooling(inp_representations, representation_dev):
    """Average token vectors into one vector per sentence.

    ``inp_representations`` holds the token vectors of all sentences stacked
    back to back. ``representation_dev`` is consulted only for the number of
    tokens in each sentence (``len`` of each entry), which marks where one
    sentence ends and the next begins.

    Returns:
        A list with one averaged vector per entry of ``representation_dev``.
    """
    pooled = []
    start = 0
    for sentence_tokens in representation_dev:
        stop = start + len(sentence_tokens)
        pooled.append(np.mean(inp_representations[start:stop], axis=0))
        start = stop
    return pooled


def similarity(sent_rep):
    """Cosine similarity of consecutive sentence pairs.

    Sentence vectors are expected in pair order: items 0 and 1 form the first
    pair, items 2 and 3 the second, and so on. A trailing unpaired vector is
    ignored.

    Args:
        sent_rep: sequence of 1-D sentence vectors. Any vector size is
            accepted; both members of a pair must have the same size.

    Returns:
        A list with one cosine-similarity score per pair.
    """
    score = []
    for first in range(0, len(sent_rep) - 1, 2):
        # cosine_similarity expects 2-D (n_samples, n_features) inputs.
        left = np.reshape(sent_rep[first], (1, -1))
        right = np.reshape(sent_rep[first + 1], (1, -1))
        score.append(cosine_similarity(left, right)[0][0])

    return score


def isotropy(representations):
    """Calculating isotropy of embedding space based on Eq.2

    For every eigenvector ``u`` of ``R^T R`` (``R`` being the representation
    matrix) the value ``F(u) = sum_i exp(r_i . u)`` is computed; the result is
    ``min F / max F``.

    Args:
        representations: array-like of shape (n_samples, n_dimensions).

    Returns:
        The ratio between the smallest and the largest ``F(u)``.
    """
    representations = np.asarray(representations)

    # R^T R is symmetric, so use the symmetric solver: unlike np.linalg.eig it
    # always returns real eigenvectors. Eigenvector signs are arbitrary in
    # either solver, so the value can differ slightly from an eig-based run.
    _, eig_vectors = np.linalg.eigh(np.matmul(np.transpose(representations),
                                              representations))

    min_f = mt.inf
    max_f = -mt.inf

    # One direction at a time keeps the intermediate at n_samples values.
    for direction in eig_vectors.T:
        f = np.sum(np.exp(np.matmul(representations, direction)))
        min_f = min(min_f, f)
        max_f = max(max_f, f)

    return min_f / max_f


In [5]:
# @title Cluster-based Isotropy Enhancement

def cluster_based(representations, n_cluster: int, n_pc: int):
  """Improve the isotropy of the input representations with the cluster-based method.

  The vectors are grouped with k-means. Within each cluster the cluster mean
  is subtracted, and the projections onto the cluster's first ``n_pc``
  principal components are removed.

  Args:
      representations: numpy array of shape (n_samples, n_dimension).
      n_cluster: the number of clusters.
      n_pc: the number of principal directions to discard in each cluster.
          If a cluster yields fewer components than ``n_pc``, all of them
          are removed.

  Returns:
      Isotropic representations, float64 array of shape
      (n_samples, n_dimension), rows in the same order as the input.

  Note:
      k-means is initialised from randomly chosen points, so the clustering
      (and therefore the output) can vary between runs.
      The function calls ``clear_output()`` before returning, which wipes the
      current cell's output (including any k-means warnings).
  """
  representations = np.asarray(representations)

  _, label = clst.vq.kmeans2(representations, n_cluster, minit='points',
                             missing='warn', check_finite=True)

  post_rep = np.zeros(representations.shape)
  pca = PCA()

  for cluster_id in range(n_cluster):
    members = np.nonzero(label == cluster_id)[0]
    if members.size == 0:
      # kmeans2(missing='warn') may leave a cluster empty; there is nothing
      # to transform and PCA cannot be fitted on zero samples.
      continue

    # Work in float64 so the mean and projections are accumulated at full
    # precision whatever dtype the encoder produced.
    cluster_rep = representations[members].astype(np.float64)
    centered = cluster_rep - cluster_rep.mean(axis=0)

    pca.fit(centered)
    dominant = pca.components_[:n_pc]  # (<= n_pc, n_dimension)

    # Subtract the projection onto each dominant direction in one step:
    # (centered @ dominant.T) holds the coefficients, @ dominant maps back.
    post_rep[members] = centered - (centered @ dominant.T) @ dominant

  clear_output()

  return post_rep


In [None]:
# @title Getting representations

def _token_representations(sentence):
    """Return the last-layer BERT vectors of ``sentence`` without [CLS]/[SEP].

    The result is a 2-D array with one row per remaining token.
    """
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    token_ids = np.asarray(token_ids, dtype='int32').reshape((1, -1))

    # getting the representation of the last layer
    last_layer = np.asarray(model(token_ids)[0])
    last_layer = last_layer.reshape((-1, last_layer.shape[-1]))

    # Removing CLS and SEP tokens (first and last position)
    return last_layer[1:-1]


# Sentences are stored interleaved: [pair0_s1, pair0_s2, pair1_s1, pair1_s2, ...]
representation_dev = []
for i in range(len(df_dev)):
    print(i)
    representation_dev.append(_token_representations(df_dev['sentence1'].iloc[i]))
    representation_dev.append(_token_representations(df_dev['sentence2'].iloc[i]))

# Flat list of every token vector, in the same order as `representation_dev`.
representation_list_dev = [token
                           for sentence in representation_dev
                           for token in sentence]

clear_output()

In [None]:
# making the representations isotropic
# n_cluster: number of k-means clusters; n_pc: principal directions removed
# from each cluster (see `cluster_based`).
n_cluster = 27
n_pc = 12
isotropic_representations = cluster_based(np.asarray(representation_list_dev),
                                          n_cluster, n_pc)

# calculating sentence representations
# (`representation_dev` is passed only for the per-sentence token counts)
sentence_rep = mean_pooling(isotropic_representations, representation_dev)

# predicting similarity scores (one cosine score per consecutive sentence pair)
score = similarity(sentence_rep)


In [None]:
# performance: Spearman correlation (x100) between the gold `score` column
# and the predicted cosine similarities
print("Spearman Correlation: ",sper_corrcoef(df_dev['score'], score))

# isotropy of space, measured on the token-level vectors after the
# cluster-based transform
print("Isotropy: ", isotropy(isotropic_representations))

Spearman Correlation:  74.8463511184579
Isotropy:  0.7506211880816394


# Classification Tasks

In [30]:
# @title Loading Data set

## BoolQ
# NOTE: `df_dev` is rebound here — from this cell on it is the BoolQ
# validation split, no longer the STS dev split loaded earlier.
df_train = pd.read_json('train.jsonl' , lines=True)
df_dev = pd.read_json('val.jsonl', lines=True)
df_test = pd.read_json('test.jsonl', lines=True)

#### training / validation set labels: False -> 0, True -> 1
# One vectorised cast replaces the row-by-row `df['label'].iloc[i] = ...`
# loop, which assigned through chained indexing (SettingWithCopyWarning, and
# not guaranteed to write back to the frame).
# The test split is left untouched, as before.
df_train['label'] = df_train['label'].astype(int)
df_dev['label'] = df_dev['label'].astype(int)

clear_output()

In [31]:
# @title Tokenizer
def tokenize(df, tokenizer, max_length=64):
    """Encode (question, passage) pairs as fixed-length model inputs.

    Args:
        df: DataFrame with 'question' and 'passage' columns.
        tokenizer: tokenizer object exposing ``encode_plus``.
        max_length: value passed as ``max_length`` together with
            ``pad_to_max_length=True``. Defaults to 64, the sequence length
            the other cells of this notebook hard-code when reshaping, so
            change it only together with those cells.

    Returns:
        Tuple ``(input_ids, attention_masks, token_type_ids)`` of int32
        tensors, each with one row per row of ``df``.
    """
    input_ids, input_masks, input_segments = [], [], []
    for question, passage in zip(df['question'], df['passage']):
        inputs = tokenizer.encode_plus(question, passage, add_special_tokens=True,
                                       return_attention_mask=True, return_token_type_ids=True,
                                       max_length=max_length, pad_to_max_length=True)

        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return (tf.cast(input_ids, tf.int32),
            tf.cast(input_masks, tf.int32),
            tf.cast(input_segments, tf.int32))

In [None]:
# @title Making the inputs ready

# Each call returns (input ids, attention masks, segment ids) for one split,
# encoded with the BERT tokenizer loaded above.
train_ids, train_masks, train_segments = tokenize(df_train, tokenizer)
dev_ids, dev_masks, dev_segments = tokenize(df_dev, tokenizer)
test_ids, test_masks, test_segments = tokenize(df_test, tokenizer)

In [11]:
# @title Getting Representations

# NOTE: `model` must still be the BERT encoder here; the "Building the MLP"
# cell further down rebinds the name to the classifier.
representation_train = []

batch_size = 1000
for i in range(0, len(df_train), batch_size):
    print(i)
    # Slices clamp at the end of the tensor, so the final (shorter) batch needs
    # no special case. The previous hard-coded `i != 9000` branch was tied to
    # one dataset size and would have processed the tail twice on a larger set.
    train_inp = [train_ids[i:i + batch_size],
                 train_masks[i:i + batch_size],
                 train_segments[i:i + batch_size]]
    output = model(train_inp)[0]
    output = np.asarray(output).reshape((-1,64,768))
    # One (64, 768) matrix per example.
    representation_train.extend(output)

clear_output()
print("done!")

done!


In [12]:
# @title Getting Representations

# NOTE: `model` must still be the BERT encoder here; the "Building the MLP"
# cell further down rebinds the name to the classifier.
representation_dev = []

batch_size = 1000
for i in range(0, len(df_dev), batch_size):
    print(i)
    # Slices clamp at the end of the tensor, so the final (shorter) batch needs
    # no special case. The previous hard-coded `i != 3000` branch was tied to
    # one dataset size and would have processed the tail twice on a larger set.
    dev_inp = [dev_ids[i:i + batch_size],
               dev_masks[i:i + batch_size],
               dev_segments[i:i + batch_size]]
    output = model(dev_inp)[0]
    output = np.asarray(output).reshape((-1,64,768))
    # One (64, 768) matrix per example.
    representation_dev.extend(output)

clear_output()
print("done!")

done!


In [14]:
# @title Getting Representations

# NOTE: `model` must still be the BERT encoder here; the "Building the MLP"
# cell further down rebinds the name to the classifier.
representation_test = []

batch_size = 1000
for i in range(0, len(df_test), batch_size):
    print(i)
    # Slices clamp at the end of the tensor, so the final (shorter) batch needs
    # no special case. The previous hard-coded `i != 3000` branch was tied to
    # one dataset size and would have processed the tail twice on a larger set.
    test_inp = [test_ids[i:i + batch_size],
                test_masks[i:i + batch_size],
                test_segments[i:i + batch_size]]
    output = model(test_inp)[0]
    output = np.asarray(output).reshape((-1,64,768))
    # One (64, 768) matrix per example.
    representation_test.extend(output)

clear_output()
print("done!")

done!


In [13]:
# @title Again Data!
# Stack the per-example (64, 768) token matrices into
# (n_examples, 64, 768) arrays.
train_data = np.reshape(representation_train, (-1, 64, 768))
dev_data = np.reshape(representation_dev, (-1, 64, 768))
# test_data = np.reshape(representation_test, (-1, 64, 768))

# Labels as plain arrays, in the same row order as the data frames.
train_label = np.asarray(list(df_train['label']))
dev_label = np.asarray(list(df_dev['label']))
# Last expression: displayed as the cell's output.
train_label.shape

In [29]:
# @title Building the MLP

# Classifier on top of the frozen token representations: the (64, 768) input
# is flattened, followed by one hidden layer and a sigmoid output unit.
# NOTE(review): this rebinds `model`, which up to here referred to the BERT
# encoder. The representation-extraction cells cannot be re-run after this
# cell without reloading BERT; consider a distinct name.
inp = tf.keras.layers.Input(shape=(64,768))
out = tf.keras.layers.Flatten()(inp)
out = tf.keras.layers.Dense(100, activation='relu')(out)
out = tf.keras.layers.Dense(1, activation='sigmoid')(out)
model = tf.keras.Model(inputs = inp , outputs = out)

optimizer = tf.keras.optimizers.Adam(3e-5)
# The metric is registered as 'acc', so its validation value is logged as
# 'val_acc' — the key the checkpoint callback monitors.
model.compile(loss='binary_crossentropy',
              optimizer= optimizer,
              metrics=['acc'])

In [25]:
# @title Checkpoint to save the best model

class ModelCheckpoint(tf.keras.callbacks.Callback):
  """Save the model weights whenever the monitored metric improves.

  After each epoch the value logged under ``monitor`` is compared with the
  best value seen so far (higher is better). Ties are broken by a lower
  ``val_loss``. On improvement the weights are written to
  ``<save_path>/<epoch number>/best_weights.h5``.

  The best score/loss are kept on the instance, so they carry over if the
  same callback object is passed to several ``fit`` calls.

  Args:
      monitor: key of the metric in the Keras ``logs`` dict, e.g. "val_acc".
      save_path: directory under which the per-epoch folders are created.
  """

  def __init__(self, monitor, save_path):
    super(ModelCheckpoint, self).__init__()
    self.monitor = monitor
    self.save_path = save_path
    # `np.inf` rather than the `np.Inf` alias, which NumPy 2 removed.
    self.bestScore = -np.inf
    self.bestLoss = np.inf

  def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    score = logs.get(self.monitor)
    if score is None:
      # Metric not reported this epoch (e.g. no validation data): there is
      # nothing to compare, so skip instead of failing on `None > float`.
      return
    loss = logs.get("val_loss")
    if loss is None:
      loss = np.inf  # a missing loss can never win a tie-break

    improved = score > self.bestScore
    tie_with_lower_loss = score == self.bestScore and loss < self.bestLoss
    if improved or tie_with_lower_loss:
      # Use the directory given to the constructor rather than a global.
      path = os.path.join(self.save_path, str(epoch+1))
      # exist_ok: the folder may be left over from an earlier fit() run.
      os.makedirs(path, exist_ok=True)
      self.model.save_weights(path+'/best_weights.h5')
      self.bestScore = score
      self.bestLoss = loss
      print("\nModel saved as the best model")

# Validation accuracy as logged by Keras for the 'acc' metric.
monitor = "val_acc"
# Root folder for the saved weights; the callback creates one sub-folder per
# saved epoch underneath it.
SAVED_MODELS_DIR = '/content/saved_models/'
checkpoint = ModelCheckpoint(monitor, SAVED_MODELS_DIR)

In [26]:
# @title Training the model
# Baseline run: train the MLP on the original (non-isotropic) BERT
# representations; `checkpoint` saves the weights whenever val_acc improves.
history = model.fit(train_data, train_label, epochs = 10, validation_data=(dev_data, dev_label), batch_size = 32, callbacks=[checkpoint] )

Epoch 1/10

Model saved as the best model
Epoch 2/10

Model saved as the best model
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# @title Making the representations isotropic
n_cluster = 27
n_pc = 12

# Flatten (n_examples, 64, 768) -> (n_tokens, 768). Train and dev tokens are
# transformed together so both splits share the same clusters.
# NOTE(review): all 64 positions of every example are included, i.e. padded
# positions take part in the clustering as well — confirm this is intended.
train_tokens = train_data.reshape(-1, 768)
dev_tokens = dev_data.reshape(-1, 768)

# Concatenate as arrays instead of building Python lists of row vectors.
representation = np.concatenate([train_tokens, dev_tokens], axis=0)
isotropic_representations = cluster_based(representation, n_cluster, n_pc)

# The first len(train_tokens) rows belong to the training set, the rest to dev.
n_train_tokens = len(train_tokens)
isotropic_train_data = isotropic_representations[:n_train_tokens].reshape(-1, 64, 768)
isotropic_dev_data = isotropic_representations[n_train_tokens:].reshape(-1, 64, 768)

In [21]:
# @title Training the model

# Train on the isotropic representations.
# NOTE(review): in file order this reuses the `model` and `checkpoint` objects
# of the previous fit() call, so training would continue from the already
# trained weights and the callback would keep its earlier best score. Rebuild
# both before this cell if the two runs are meant to be compared from scratch.
history = model.fit(isotropic_train_data, train_label, epochs = 10, 
                    validation_data=(isotropic_dev_data, dev_label), batch_size = 32, callbacks=[checkpoint] )

Epoch 1/10

Model saved as the best model
Epoch 2/10
Epoch 3/10

Model saved as the best model
Epoch 4/10

Model saved as the best model
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# @title Retrieving the best model

list_of_dirs = os.listdir('/content/saved_models/')

# Checkpoints live in sub-folders named after the epoch number; skip anything
# else (hidden folders, stray files) instead of failing in int().
final_list = [int(name) for name in list_of_dirs if name.isdigit()]
if not final_list:
  raise FileNotFoundError("No saved checkpoints found in /content/saved_models/")

# The callback only writes when the score improves, so within one training run
# the highest epoch number is the latest best.
# NOTE(review): if the folder holds checkpoints from several fit() runs the
# epoch numbers are mixed — confirm the folder was clean.
best_model = max(final_list)

model_path ="/content/saved_models/"+ str(best_model) + '/best_weights.h5'
model.load_weights(model_path)