In [1]:
# @title Install Transformers
# clear_output is imported here and reused by later cells to wipe noisy output.
from IPython.display import clear_output
# Install the pinned transformers release through a shell escape.
!pip install transformers==2.11

# Drop pip's installation log from the cell output.
clear_output()

In [2]:
# @title Import Requirements
import numpy as np
import pandas as pd
from transformers import *
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from scipy import cluster as clst
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA


In [3]:
# @title Loading BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Build the configuration from the checkpoint itself so it always matches the
# weights selected by `casing` (a bare BertConfig() only carries library
# defaults). The previous `dropout` / `attention_dropout` keyword arguments
# were dropped: BertConfig names its dropout settings `hidden_dropout_prob`
# and `attention_probs_dropout_prob`, so those two keywords did not configure
# BERT's dropout, and the model is frozen and only used for feature extraction.
config = BertConfig.from_pretrained(casing, output_hidden_states=True)

model = TFBertModel.from_pretrained(casing, config=config)
# Frozen encoder: the notebook only reads representations, it never fine-tunes.
model.trainable = False

# Hide download / loading logs.
clear_output()

In [4]:
# @title Loading Dataset

def _read_sts_split(path):
    """Read one tab-separated STS benchmark split, skipping malformed rows.

    Malformed rows are skipped with a warning, which is what
    ``error_bad_lines=False`` (with pandas' default ``warn_bad_lines=True``)
    did. ``error_bad_lines`` no longer exists in pandas 2.x, so the newer
    ``on_bad_lines`` spelling is tried first and the legacy keyword is only
    used when the installed pandas does not know the new one.
    """
    try:
        return pd.read_csv(path, delimiter='\t', on_bad_lines='warn')
    except TypeError:
        # Older pandas: `on_bad_lines` is an unknown keyword there.
        return pd.read_csv(path, delimiter='\t', error_bad_lines=False)


# STS benchmark
#df_train = _read_sts_split('train.tsv')
df_dev = _read_sts_split('dev.tsv')
#df_test = _read_sts_split('test.tsv')

In [5]:
# @title Required functions

def sper_corrcoef(targets, predictions):
    """Spearman rank correlation between targets and predictions, times 100.

    Args:
        targets: sequence of reference scores.
        predictions: sequence of predicted scores, same length as ``targets``.

    Returns:
        The Spearman correlation coefficient scaled by 100.
    """
    # Import the submodule explicitly: `import scipy as sc` alone does not
    # guarantee that `sc.stats` is loaded unless another package imported it.
    from scipy import stats

    return 100 * stats.spearmanr(targets, predictions)[0]


def mean_pooling(inp_representations, representation_dev):
    """Average token vectors into one vector per sentence.

    Args:
        inp_representations: token vectors of all sentences stacked in order;
            it is sliced along its first axis.
        representation_dev: one entry per sentence; only ``len(entry)`` is
            used, as the number of consecutive token vectors that belong to
            that sentence.

    Returns:
        A list with one mean vector (``np.mean(..., axis=0)``) per sentence.
    """
    pooled = []
    start = 0
    for sentence_tokens in representation_dev:
        stop = start + len(sentence_tokens)
        pooled.append(np.mean(inp_representations[start:stop], axis=0))
        # The next sentence begins where this one ended.
        start = stop

    return pooled


def similarity(sent_rep):
    """Cosine similarity of consecutive sentence pairs.

    ``sent_rep`` is read as ``[pair0_a, pair0_b, pair1_a, pair1_b, ...]``:
    entries 0 and 1 form the first pair, 2 and 3 the second, and so on. A
    trailing unpaired entry is ignored.

    Args:
        sent_rep: sequence of sentence vectors; the two vectors of a pair must
            have the same number of elements (any dimensionality, not only 768).

    Returns:
        A list with one cosine-similarity score per pair.
    """
    score = []
    for first in range(0, len(sent_rep) - 1, 2):
        # cosine_similarity expects 2-D inputs, so present each vector as a
        # single row; -1 lets NumPy infer the embedding width.
        left = np.reshape(sent_rep[first], (1, -1))
        right = np.reshape(sent_rep[first + 1], (1, -1))
        score.append(cosine_similarity(left, right)[0][0])

    return score


def isotropy(representations):
    """Isotropy score of an embedding space (the "Eq.2" measure cited by the notebook).

    For every eigenvector ``v`` of ``representations.T @ representations`` the
    quantity ``sum(exp(representations @ v))`` is evaluated; the score is the
    smallest of these values divided by the largest.

    Args:
        representations: array of shape (n_samples, n_dimensions).

    Returns:
        The min/max ratio described above.
    """
    gram = np.matmul(np.transpose(representations), representations)
    # NOTE(review): the Gram matrix is symmetric, so np.linalg.eigh looks like
    # a candidate here; eig is kept so the reported numbers stay unchanged.
    _, directions = np.linalg.eig(gram)

    smallest, largest = mt.inf, -mt.inf
    # Rows of the transposed matrix are the eigenvectors (columns of `directions`).
    for direction in np.transpose(directions):
        projections = np.matmul(representations, direction[:, np.newaxis])
        partition = np.sum(np.exp(projections))

        smallest = min(smallest, partition)
        largest = max(largest, partition)

    return smallest / largest


In [7]:
# @title Cluster-based Isotropy Enhancement

def cluster_based(representations, n_cluster: int, n_pc: int, seed=None):
  """ Improving Isotropy of input representations using cluster-based method

      The representations are grouped with k-means. Inside every cluster the
      cluster mean is subtracted, PCA is fitted on the centred vectors, and
      the projection onto the first ``n_pc`` principal components is removed.

      Args:
            inputs:
                  representations:
                    input representations numpy array(n_samples, n_dimension);
                    any n_dimension is accepted
                  n_cluster:
                    the number of clusters
                  n_pc:
                    the number of directions to be discarded; if a cluster
                    yields fewer principal components than n_pc, all of its
                    components are removed
                  seed:
                    optional seed forwarded to scipy's kmeans2 for
                    reproducible clustering. It is only passed when it is not
                    None, so SciPy versions without this keyword keep working
                    with the default.
            output:
                  isotropic representations (n_samples, n_dimension), float64.
                  Rows keep the order of the input.

            """
  representations = np.asarray(representations)

  kmeans_kwargs = {} if seed is None else {'seed': seed}
  _, label = clst.vq.kmeans2(representations, n_cluster, minit='points',
                             missing='warn', check_finite=True,
                             **kmeans_kwargs)

  pca = PCA()
  post_rep = np.zeros(representations.shape)

  for cluster_id in range(n_cluster):
    member_idx = np.nonzero(label == cluster_id)[0]
    if member_idx.size == 0:
      # missing='warn' lets k-means return empty clusters; nothing to process.
      continue

    members = representations[member_idx]
    # Centre in float64 so the result does not depend on the input dtype.
    centered = members - members.mean(axis=0, dtype=np.float64)

    if member_idx.size < 2:
      # A single centred vector is all zeros: there is no direction to remove.
      post_rep[member_idx] = centered
      continue

    pca.fit(centered)
    # Rows of components_ are the principal directions, strongest first.
    top = pca.components_[:n_pc]

    # Subtract the projection of every vector onto the dominant directions.
    post_rep[member_idx] = centered - centered @ top.T @ top

  # Kept from the original implementation: wipes the cell output, which also
  # hides any warnings emitted above.
  clear_output()

  return post_rep


In [6]:
# @title Getting representations

def _token_representations(sentence):
    """Return the last-layer BERT vectors of one sentence, one row per token.

    The sentence is encoded with the special tokens added, passed through the
    model as a batch of one, and the first and last rows (the CLS and SEP
    positions) are dropped from the result.
    """
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    token_ids = np.asarray(token_ids, dtype='int32').reshape((1, -1))

    # getting the representation of the last layer
    hidden = np.asarray(model(token_ids)[0])
    # Collapse the batch axis; the width is taken from the model output
    # instead of being hard-coded.
    hidden = hidden.reshape((-1, hidden.shape[-1]))

    # Removing CLS and SEP tokens
    return hidden[1:-1]


# Per-sentence token matrices, stored as sentence1, sentence2 of row 0,
# then sentence1, sentence2 of row 1, and so on.
representation_dev = []
for i in range(len(df_dev)):
    print(i)
    for column in ('sentence1', 'sentence2'):
        representation_dev.append(_token_representations(df_dev[column].iloc[i]))

# Flat list of every token vector, in the same order as representation_dev.
representation_list_dev = [token_vector
                           for sentence_tokens in representation_dev
                           for token_vector in sentence_tokens]

clear_output()

In [8]:
# Cluster-based isotropy enhancement of all token representations.
n_cluster = 27  # number of k-means clusters
n_pc = 12       # principal directions discarded per cluster
isotropic_representations = cluster_based(
    np.asarray(representation_list_dev),
    n_cluster=n_cluster,
    n_pc=n_pc,
)

# One vector per sentence: mean over that sentence's token vectors.
sentence_rep = mean_pooling(
    inp_representations=isotropic_representations,
    representation_dev=representation_dev,
)

# Predicted similarity score for every sentence pair.
score = similarity(sentence_rep)


In [9]:
# Spearman correlation (x100) between the dataset scores and the predictions.
spearman_dev = sper_corrcoef(df_dev['score'], score)
print("Spearman Correlation: ", spearman_dev)

# Isotropy of the enhanced token-embedding space.
isotropy_dev = isotropy(isotropic_representations)
print("Isotropy: ", isotropy_dev)

Spearman Correlation:  74.8463511184579
Isotropy:  0.7506211880816394
