## NLP sentence similarity project
### Group: Sara Bonati - Irina Kokoshko

This notebook applies a state of the art Transformer vector space model (BERT) to sentence data taken from 3 different datasets:

*   A subset of news sentence pairs from the STS benchmark, which we refer to as STS1
*   A bigger subset from the STS benchmark containing sentence pairs from news/captions/internet forums from 2012 to 2017, which we refer to as STSFull
*   Semantically ambiguous sentence pairs used in a survey online, which we refer to as Survey

In this notebook the BERT model is varied in terms of: 
*  hidden layers used (from 1 to 12 layers)
*  aggregation strategy for the word embeddings derived from the hidden layers (average, concatenate or sum)

In [None]:
# general utility import
#---------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import pickle
import math
import pandas as pd
import sys,io,pprint
import re
from functools import partial
from tqdm import tqdm
tqdm = partial(tqdm, position=0, leave=True)

# nltk modules
from __future__ import division
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import wordnet as wn, stopwords
from nltk.corpus import brown, gutenberg
from nltk.stem import WordNetLemmatizer
from nltk.data import find

#BERT modules
!pip install transformers
import transformers
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 7.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 46.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95

# STS1 dataset

### Load data + sentence preprocessing + normalize score values 

In [None]:
# read data
#-------------------------------------------------------------------------------
sts1 =pd.read_pickle('sentences_shorter.pkl') # change file directory if needed

# preprocessing function
#-------------------------------------------------------------------------------
def preprocess_text(text):
    # convert to string
    text = str(text)
    #lowercase
    text = text.lower()
    # Clean the text
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"she\'s", "she is", text)
    text = re.sub(r"it\'s", "it is", text)
    text = re.sub(r"he\'s", "he is", text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " ", text) 
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\<", "", text) 
    text = re.sub(r"\>", "", text) 
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    tockens = text.split(' ')
    # NO stopwords or lemmatization
    tockens = [word for word in tockens]
    return ' '.join([t for t in tockens])

for index, row in sts1.iterrows():
    sts1.loc[index, "SentenceA"] = preprocess_text(sts1.loc[index, "SentenceA"])
    sts1.loc[index, "SentenceB"] = preprocess_text(sts1.loc[index, "SentenceB"])

# normalize Score to be in range [0,1]
sts1["Score"] = sts1["Score"]/5
sts1.head()

Unnamed: 0,SentenceA,SentenceB,Score
1,micron has declared its first quarterly profit...,micron numbers also marked the first quarterly...,0.75
2,the fines are part of failed republican effort...,perry said he backs the senate efforts includi...,0.56
4,the tech loaded nasdaq composite rose 20 96 po...,the technology laced nasdaq composite index ix...,0.48
5,amgen shares gained 93 cents or 1 45 percent t...,shares of allergan were up 14 cents at 78 40 i...,0.2666
7,chavez said investigators feel confident they ...,albuquerque mayor martin chavez said investiga...,0.76


## BERT model specification ( see also https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/) 

In [None]:
# initialize bert tokenizer
bert_tokenizer  = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
bert_model = BertModel.from_pretrained('bert-base-uncased',
                                        output_hidden_states = True)

def bert_semantic_similarity(sentence1,sentence2,embedding_method,layers,sentence_vec=False):
    
    if layers>12:
        print("Error! Maximum number of layers to use is 12")
        return None
    
    # Tokenize our sentence with the BERT tokenizer.
    sentence1       = "[CLS] " + sentence1 + " [SEP]" 
    sentence2       = "[CLS] " + sentence2 + " [SEP]"
    tokenized_text1 = bert_tokenizer.tokenize(sentence1)
    tokenized_text2 = bert_tokenizer.tokenize(sentence2)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens1 = bert_tokenizer.convert_tokens_to_ids(tokenized_text1)    
    indexed_tokens2 = bert_tokenizer.convert_tokens_to_ids(tokenized_text2)  

    # Mark each of the tokens as belonging to sentence "0" and "1".
    segments_ids1 = [0] * len(tokenized_text1) 
    segments_ids2 = [1] * len(tokenized_text2) 

    # Convert inputs to PyTorch tensors
    tokens_tensor1    = torch.tensor([indexed_tokens1])
    segments_tensors1 = torch.tensor([segments_ids1])
    tokens_tensor2    = torch.tensor([indexed_tokens2])
    segments_tensors2 = torch.tensor([segments_ids2])

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    bert_model.eval()   

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs1 = bert_model(tokens_tensor1, segments_tensors1)
        outputs2 = bert_model(tokens_tensor2, segments_tensors2)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states1 = outputs1[2]
        hidden_states2 = outputs2[2]

    token_embeddings1 = torch.stack(hidden_states1, dim=0)
    token_embeddings1 = torch.squeeze(token_embeddings1, dim=1)
    token_embeddings1 = token_embeddings1.permute(1,0,2)
    token_embeddings2 = torch.stack(hidden_states2, dim=0)
    token_embeddings2 = torch.squeeze(token_embeddings2, dim=1)
    token_embeddings2 = token_embeddings2.permute(1,0,2)

    if embedding_method == "concat":
        token_vecs_cat1   = torch.empty(len(token_embeddings1),768*layers)
        token_vecs_cat2   = torch.empty(len(token_embeddings2),768*layers)
    if embedding_method == "average" or embedding_method == "sum":
        token_vecs_cat1   = torch.empty(len(token_embeddings1),768)
        token_vecs_cat2   = torch.empty(len(token_embeddings2),768)

    # For each token in the sentence 1...
    for token in range(len(token_embeddings1)):

        if embedding_method == "concat":
            
            if layers==1:
                cat_vec1 = token_embeddings1[token][-1]
            elif layers==2:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2]), dim=0)
            elif layers==3:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3]), dim=0)
            elif layers==4:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4]), dim=0)
            elif layers==5:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5]), dim=0)
            elif layers==6:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6]), dim=0)
            elif layers==7:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7]), dim=0)
            elif layers==8:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8]), dim=0)
            elif layers==9:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9]), dim=0)
            elif layers==10:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10]), dim=0)
            elif layers==11:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10],
                                      token_embeddings1[token][-11]), dim=0)
            elif layers==12:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10],
                                      token_embeddings1[token][-11],
                                      token_embeddings1[token][-12]), dim=0)


        if embedding_method == "average":
            cat_vec1 = torch.mean(token_embeddings1[token][-layers:], dim=0)
            
        if embedding_method == "sum":
            cat_vec1 = torch.sum(token_embeddings1[token][-layers:], dim=0)
            
        # embedding for token word (single word) 
        token_vecs_cat1[token,:]=cat_vec1
    #sentence 1 embedding 
    sentence_embedding1 = torch.mean(token_vecs_cat1,dim=0)

    # For each token in the sentence 2...
    for token in range(len(token_embeddings2)):    
        if embedding_method == "concat":
            if layers==1:
                cat_vec2 = token_embeddings2[token][-1]
            elif layers==2:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2]), dim=0)
            elif layers==3:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3]), dim=0)
            elif layers==4:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4]), dim=0)
            elif layers==5:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5]), dim=0)
            elif layers==6:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6]), dim=0)
            elif layers==7:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7]), dim=0)
            elif layers==8:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8]), dim=0)
            elif layers==9:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9]), dim=0)
            elif layers==10:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10]), dim=0)
            elif layers==11:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10],
                                      token_embeddings2[token][-11]), dim=0)
            elif layers==12:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10],
                                      token_embeddings2[token][-11],
                                      token_embeddings2[token][-12]), dim=0)

            
        if embedding_method == "average":
            cat_vec2 = torch.mean(token_embeddings2[token][-layers:], dim=0)
        if embedding_method == "sum":
            cat_vec2 = torch.sum(token_embeddings2[token][-layers:], dim=0)
        
        token_vecs_cat2[token,:]=cat_vec2
    # sentence embedding
    sentence_embedding2 = torch.mean(token_vecs_cat2,dim=0)
    
    cos         = nn.CosineSimilarity(dim=0)
    sem_sim     = cos(sentence_embedding1,sentence_embedding2)
    return sem_sim.item()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
#average
#------------------------------------------------------------------------------
results_average     = np.zeros((len(sts1),12)) 
corrs_average       = np.zeros((12,1))
print("Embedding strategy: average")
for i in range(12):
    print("Start from final hidden layer and average: ",i+1)
    print('\n')
    for s in tqdm(range(len(sts1))):
            results_average[s,i] = bert_semantic_similarity(str(sts1.iloc[s,0]),
                                                    str(sts1.iloc[s,1]),
                                                    "average",
                                                    i+1,
                                                    False)
    d = pd.DataFrame({'Score_BERT': results_average[:,i]})
    corrs_average[i,0] = d["Score_BERT"].corr(sts1["Score"])

  0%|          | 0/1218 [00:00<?, ?it/s]

Embedding strategy: average
Start from final hidden layer and average:  1




100%|██████████| 1218/1218 [04:25<00:00,  4.59it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  2




100%|██████████| 1218/1218 [04:36<00:00,  4.41it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  3




100%|██████████| 1218/1218 [04:25<00:00,  4.59it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  4




100%|██████████| 1218/1218 [04:25<00:00,  4.59it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  5




100%|██████████| 1218/1218 [04:24<00:00,  4.61it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  6




100%|██████████| 1218/1218 [04:20<00:00,  4.68it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  7




100%|██████████| 1218/1218 [04:21<00:00,  4.65it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  8




100%|██████████| 1218/1218 [04:20<00:00,  4.68it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  9




100%|██████████| 1218/1218 [04:20<00:00,  4.68it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  10




100%|██████████| 1218/1218 [04:22<00:00,  4.63it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  11




100%|██████████| 1218/1218 [04:24<00:00,  4.60it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and average:  12




100%|██████████| 1218/1218 [04:23<00:00,  4.63it/s]


In [None]:
corrs_average

array([[0.20010996],
       [0.2076047 ],
       [0.20868153],
       [0.21395806],
       [0.21592632],
       [0.21560297],
       [0.21530191],
       [0.21688952],
       [0.21939925],
       [0.22202356],
       [0.22444178],
       [0.22653227]])

In [None]:
#concat
#------------------------------------------------------------------------------
results_concat     = np.zeros((len(sts1),12)) 
corrs_concat       = np.zeros((12,1))
print("Embedding strategy: concatenate")
for i in range(12):
    print("Start from final hidden layer and concatenate: ",i+1)
    print('\n')
    for s in tqdm(range(len(sts1))):
            results_concat[s,i] = bert_semantic_similarity(str(sts1.iloc[s,0]),
                                                    str(sts1.iloc[s,1]),
                                                    "concat",
                                                    i+1,
                                                    False)
    d = pd.DataFrame({'Score_BERT': results_concat[:,i]})
    corrs_concat[i,0] = d["Score_BERT"].corr(sts1["Score"])

  0%|          | 0/1218 [00:00<?, ?it/s]

Embedding strategy: concatenate


100%|██████████| 1218/1218 [04:22<00:00,  4.63it/s]
100%|██████████| 1218/1218 [04:22<00:00,  4.63it/s]
100%|██████████| 1218/1218 [04:23<00:00,  4.61it/s]
100%|██████████| 1218/1218 [04:23<00:00,  4.62it/s]
100%|██████████| 1218/1218 [04:26<00:00,  4.58it/s]
100%|██████████| 1218/1218 [04:22<00:00,  4.64it/s]
100%|██████████| 1218/1218 [04:23<00:00,  4.63it/s]
100%|██████████| 1218/1218 [04:23<00:00,  4.61it/s]
100%|██████████| 1218/1218 [04:22<00:00,  4.64it/s]
100%|██████████| 1218/1218 [04:21<00:00,  4.65it/s]
100%|██████████| 1218/1218 [04:24<00:00,  4.61it/s]
100%|██████████| 1218/1218 [04:24<00:00,  4.60it/s]


In [None]:
corrs_concat

array([[0.20010996],
       [0.20617443],
       [0.20719479],
       [0.21101616],
       [0.2125952 ],
       [0.21362439],
       [0.21418603],
       [0.21683454],
       [0.21990334],
       [0.22271046],
       [0.2255876 ],
       [0.22908105]])

In [None]:
#sum
#------------------------------------------------------------------------------
results_sum     = np.zeros((len(sts1),12)) 
corrs_sum       = np.zeros((12,1))
print("Embedding strategy: sum")
for i in range(12):
    print("Start from final hidden layer and sum: ",i+1)
    print('\n')
    for s in tqdm(range(len(sts1))):
            results_sum[s,i] = bert_semantic_similarity(str(sts1.iloc[s,0]),
                                                    str(sts1.iloc[s,1]),
                                                    "sum",
                                                    i+1,
                                                    False)
    d = pd.DataFrame({'Score_BERT': results_sum[:,i]})
    corrs_sum[i,0] = d["Score_BERT"].corr(sts1["Score"])

  0%|          | 0/1218 [00:00<?, ?it/s]

Embedding strategy: sum
Start from final hidden layer and sum:  1




100%|██████████| 1218/1218 [04:25<00:00,  4.59it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  2




100%|██████████| 1218/1218 [04:23<00:00,  4.62it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  3




100%|██████████| 1218/1218 [04:28<00:00,  4.54it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  4




100%|██████████| 1218/1218 [04:27<00:00,  4.55it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  5




100%|██████████| 1218/1218 [04:30<00:00,  4.50it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  6




100%|██████████| 1218/1218 [04:33<00:00,  4.46it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  7




100%|██████████| 1218/1218 [04:30<00:00,  4.50it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  8




100%|██████████| 1218/1218 [04:29<00:00,  4.52it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  9




100%|██████████| 1218/1218 [04:28<00:00,  4.54it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  10




100%|██████████| 1218/1218 [04:30<00:00,  4.50it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  11




100%|██████████| 1218/1218 [04:26<00:00,  4.57it/s]
  0%|          | 0/1218 [00:00<?, ?it/s]

Start from final hidden layer and sum:  12




100%|██████████| 1218/1218 [04:25<00:00,  4.59it/s]


In [None]:
corrs_sum

array([[0.20010996],
       [0.2076047 ],
       [0.20868148],
       [0.21395806],
       [0.21592632],
       [0.21560296],
       [0.21530182],
       [0.21688952],
       [0.21939919],
       [0.22202359],
       [0.22444171],
       [0.22653219]])

In [None]:
STS1survey = [corrs_average,corrs_concat,corrs_sum]

# ---------------------------------------------------------------------------------------------------------

# STS full dataset

### Load data + sentence preprocessing + normalize score values 

In [None]:
# read data
#-------------------------------------------------------------------------------
stsfull =pd.read_csv('sts.txt',engine='python',sep='\t', quotechar='"',header=0,error_bad_lines=False)

# preprocessing function (see cells above)
#-------------------------------------------------------------------------------
for index, row in stsfull.iterrows():
    stsfull.loc[index, "sent_1"] = preprocess_text(stsfull.loc[index, "sent_1"])
    stsfull.loc[index, "sent_2"] = preprocess_text(stsfull.loc[index, "sent_2"])
    #print a few examples
    if index in [4000,4500,3400,2345,3331]:
        print(stsfull.loc[index, "sent_1"])
        print(stsfull.loc[index, "sent_2"])

# normalize Score to be in range [0,1]
stsfull["sim"] = stsfull["sim"]/5
stsfull.head()

Skipping line 1525: '	' expected after '"'
Skipping line 1542: '	' expected after '"'
Skipping line 1614: '	' expected after '"'
Skipping line 2002: '	' expected after '"'
Skipping line 2003: '	' expected after '"'
Skipping line 2006: '	' expected after '"'
Skipping line 2020: '	' expected after '"'
Skipping line 2026: '	' expected after '"'
Skipping line 2030: '	' expected after '"'
Skipping line 2031: '	' expected after '"'
Skipping line 2035: '	' expected after '"'
Skipping line 2037: '	' expected after '"'
Skipping line 2048: '	' expected after '"'
Skipping line 2055: '	' expected after '"'
Skipping line 2072: '	' expected after '"'
Skipping line 2073: '	' expected after '"'
Skipping line 2076: '	' expected after '"'
Skipping line 2080: '	' expected after '"'
Skipping line 2083: '	' expected after '"'
Skipping line 2090: '	' expected after '"'
Skipping line 2094: '	' expected after '"'
Skipping line 2096: '	' expected after '"'
Skipping line 2097: '	' expected after '"'
Skipping li

do you understand why it is absurd to limit the question to us residents 
do you understand why non us residents are also relevant to the statistics 
the russian foreign ministry stated that russia has not suspended dialogue 
russian officials stated that russia will not reconsider the suspension 
illegal arms trafficking increases in kenya due to easy availability and neighboring conflicts 
illegal arms trafficking has increased in kenya and raised concern about security 
washington acts of congress sells for 9 8m
george washington copy of us constitution sells for 9 8m
philippines rebels reach wealth sharing deal
philippines and rebels reach wealth deal 


Unnamed: 0,type,subtype,year,id,sim,sent_1,sent_2
0,main-captions,MSRvid,2012test,1,1.0,a plane is taking off,an air plane is taking off
1,main-captions,MSRvid,2012test,4,0.76,a man is playing a large flute,a man is playing a flute
2,main-captions,MSRvid,2012test,5,0.76,a man is spreading shreded cheese on a pizza,a man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,0.52,three men are playing chess,two men are playing chess
4,main-captions,MSRvid,2012test,9,0.85,a man is playing the cello,a man seated is playing the cello


In [None]:
# initialize bert tokenizer
bert_tokenizer  = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
bert_model = BertModel.from_pretrained('bert-base-uncased',
                                        output_hidden_states = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
def bert_semantic_similarity(sentence1,sentence2,embedding_method,layers,sentence_vec=False):
    
    if layers>12:
        print("Error! Maximum number of layers to use is 12")
        return None
    
    # Tokenize our sentence with the BERT tokenizer.
    sentence1       = "[CLS] " + sentence1 + " [SEP]" 
    sentence2       = "[CLS] " + sentence2 + " [SEP]"
    tokenized_text1 = bert_tokenizer.tokenize(sentence1)
    tokenized_text2 = bert_tokenizer.tokenize(sentence2)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens1 = bert_tokenizer.convert_tokens_to_ids(tokenized_text1)    
    indexed_tokens2 = bert_tokenizer.convert_tokens_to_ids(tokenized_text2)  

    # Mark each of the tokens as belonging to sentence "0" and "1".
    segments_ids1 = [0] * len(tokenized_text1) 
    segments_ids2 = [1] * len(tokenized_text2) 

    # Convert inputs to PyTorch tensors
    tokens_tensor1    = torch.tensor([indexed_tokens1])
    segments_tensors1 = torch.tensor([segments_ids1])
    tokens_tensor2    = torch.tensor([indexed_tokens2])
    segments_tensors2 = torch.tensor([segments_ids2])

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    bert_model.eval()   

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():

        outputs1 = bert_model(tokens_tensor1, segments_tensors1)
        outputs2 = bert_model(tokens_tensor2, segments_tensors2)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states1 = outputs1[2]
        hidden_states2 = outputs2[2]

    token_embeddings1 = torch.stack(hidden_states1, dim=0)
    token_embeddings1 = torch.squeeze(token_embeddings1, dim=1)
    token_embeddings1 = token_embeddings1.permute(1,0,2)
    token_embeddings2 = torch.stack(hidden_states2, dim=0)
    token_embeddings2 = torch.squeeze(token_embeddings2, dim=1)
    token_embeddings2 = token_embeddings2.permute(1,0,2)

    if embedding_method == "concat":
        token_vecs_cat1   = torch.empty(len(token_embeddings1),768*layers)
        token_vecs_cat2   = torch.empty(len(token_embeddings2),768*layers)
    if embedding_method == "average" or embedding_method == "sum":
        token_vecs_cat1   = torch.empty(len(token_embeddings1),768)
        token_vecs_cat2   = torch.empty(len(token_embeddings2),768)

    # For each token in the sentence 1...
    for token in range(len(token_embeddings1)):

        if embedding_method == "concat":
            
            if layers==1:
                cat_vec1 = token_embeddings1[token][-1]
            elif layers==2:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2]), dim=0)
            elif layers==3:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3]), dim=0)
            elif layers==4:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4]), dim=0)
            elif layers==5:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5]), dim=0)
            elif layers==6:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6]), dim=0)
            elif layers==7:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7]), dim=0)
            elif layers==8:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8]), dim=0)
            elif layers==9:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9]), dim=0)
            elif layers==10:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10]), dim=0)
            elif layers==11:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10],
                                      token_embeddings1[token][-11]), dim=0)
            elif layers==12:
                cat_vec1 = torch.cat((token_embeddings1[token][-1],
                                      token_embeddings1[token][-2],
                                      token_embeddings1[token][-3],
                                      token_embeddings1[token][-4],
                                      token_embeddings1[token][-5],
                                      token_embeddings1[token][-6],
                                      token_embeddings1[token][-7],
                                      token_embeddings1[token][-8],
                                      token_embeddings1[token][-9],
                                      token_embeddings1[token][-10],
                                      token_embeddings1[token][-11],
                                      token_embeddings1[token][-12]), dim=0)


        if embedding_method == "average":
            cat_vec1 = torch.mean(token_embeddings1[token][-layers:], dim=0)
            
        if embedding_method == "sum":
            cat_vec1 = torch.sum(token_embeddings1[token][-layers:], dim=0)
            
        # embedding for token word (single word) 
        token_vecs_cat1[token,:]=cat_vec1
    #sentence 1 embedding 
    sentence_embedding1 = torch.mean(token_vecs_cat1,dim=0)

    # For each token in the sentence 2...
    for token in range(len(token_embeddings2)):    
        if embedding_method == "concat":
            if layers==1:
                cat_vec2 = token_embeddings2[token][-1]
            elif layers==2:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2]), dim=0)
            elif layers==3:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3]), dim=0)
            elif layers==4:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4]), dim=0)
            elif layers==5:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5]), dim=0)
            elif layers==6:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6]), dim=0)
            elif layers==7:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7]), dim=0)
            elif layers==8:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8]), dim=0)
            elif layers==9:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9]), dim=0)
            elif layers==10:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10]), dim=0)
            elif layers==11:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10],
                                      token_embeddings2[token][-11]), dim=0)
            elif layers==12:
                cat_vec2 = torch.cat((token_embeddings2[token][-1],
                                      token_embeddings2[token][-2],
                                      token_embeddings2[token][-3],
                                      token_embeddings2[token][-4],
                                      token_embeddings2[token][-5],
                                      token_embeddings2[token][-6],
                                      token_embeddings2[token][-7],
                                      token_embeddings2[token][-8],
                                      token_embeddings2[token][-9],
                                      token_embeddings2[token][-10],
                                      token_embeddings2[token][-11],
                                      token_embeddings2[token][-12]), dim=0)

            
        if embedding_method == "average":
            cat_vec2 = torch.mean(token_embeddings2[token][-layers:], dim=0)
        if embedding_method == "sum":
            cat_vec2 = torch.sum(token_embeddings2[token][-layers:], dim=0)
        
        token_vecs_cat2[token,:]=cat_vec2
    # sentence embedding
    sentence_embedding2 = torch.mean(token_vecs_cat2,dim=0)
    
    cos         = nn.CosineSimilarity(dim=0)
    sem_sim     = cos(sentence_embedding1,sentence_embedding2)
    return sem_sim.item()

In [None]:
#average
#-------------------------------------------------------------------------------
results_average     = np.zeros((len(stsfull),12)) 
corrs_average       = np.zeros((12,1))
print("Embedding strategy: average")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and average: ",i+1)
    print('\n')
    for s in tqdm(range(len(stsfull))):
            results_average[s,i] = bert_semantic_similarity(str(stsfull.iloc[s,5]),
                                                    str(stsfull.iloc[s,6]),
                                                    "average",
                                                    i+1,
                                                    False)
    d = pd.DataFrame({'Score_BERT': results_average[:,i]})
    corrs_average[i,0] = d["Score_BERT"].corr(stsfull["sim"])

corrs_average

  0%|          | 0/5506 [00:00<?, ?it/s]

Embedding strategy: average
-----------------------------------------------------------------------
Start from final hidden layer and average:  1




100%|██████████| 5506/5506 [20:20<00:00,  4.51it/s]
  0%|          | 1/5506 [00:00<16:57,  5.41it/s]

Start from final hidden layer and average:  2




100%|██████████| 5506/5506 [19:35<00:00,  4.69it/s]
  0%|          | 1/5506 [00:00<16:19,  5.62it/s]

Start from final hidden layer and average:  3




100%|██████████| 5506/5506 [19:34<00:00,  4.69it/s]
  0%|          | 1/5506 [00:00<16:21,  5.61it/s]

Start from final hidden layer and average:  4




100%|██████████| 5506/5506 [19:28<00:00,  4.71it/s]
  0%|          | 1/5506 [00:00<15:55,  5.76it/s]

Start from final hidden layer and average:  5




100%|██████████| 5506/5506 [19:26<00:00,  4.72it/s]
  0%|          | 1/5506 [00:00<17:01,  5.39it/s]

Start from final hidden layer and average:  6




100%|██████████| 5506/5506 [19:29<00:00,  4.71it/s]
  0%|          | 1/5506 [00:00<16:43,  5.49it/s]

Start from final hidden layer and average:  7




100%|██████████| 5506/5506 [19:35<00:00,  4.69it/s]
  0%|          | 1/5506 [00:00<17:23,  5.28it/s]

Start from final hidden layer and average:  8




100%|██████████| 5506/5506 [19:36<00:00,  4.68it/s]
  0%|          | 1/5506 [00:00<16:44,  5.48it/s]

Start from final hidden layer and average:  9




100%|██████████| 5506/5506 [19:27<00:00,  4.72it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and average:  10




100%|██████████| 5506/5506 [19:30<00:00,  4.70it/s]
  0%|          | 1/5506 [00:00<16:33,  5.54it/s]

Start from final hidden layer and average:  11




100%|██████████| 5506/5506 [19:28<00:00,  4.71it/s]
  0%|          | 1/5506 [00:00<16:56,  5.41it/s]

Start from final hidden layer and average:  12




100%|██████████| 5506/5506 [19:33<00:00,  4.69it/s]


array([[0.54571588],
       [0.55601959],
       [0.55286227],
       [0.55074705],
       [0.55759656],
       [0.56365985],
       [0.56978935],
       [0.57645434],
       [0.58311187],
       [0.59148556],
       [0.60128193],
       [0.61130921]])

In [None]:
#concat  
#-------------------------------------------------------------------------------
results_concat    = np.zeros((len(stsfull),12)) 
corrs_concat       = np.zeros((12,1))
print("Embedding strategy: concat")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and concatenate: ",i+1)
    print('\n')
    for s in tqdm(range(len(stsfull))):
        results_concat[s,i] = bert_semantic_similarity(str(stsfull.iloc[s,5]),
                                                str(stsfull.iloc[s,6]),
                                                "concat",
                                                i+1,
                                                False)
    d = pd.DataFrame({'Score_BERT': results_concat[:,i]})
    corrs_concat[i,0] = d["Score_BERT"].corr(stsfull["sim"])
corrs_concat

  0%|          | 0/5506 [00:00<?, ?it/s]

Embedding strategy: concat
-----------------------------------------------------------------------


100%|██████████| 5506/5506 [19:39<00:00,  4.67it/s]
100%|██████████| 5506/5506 [19:34<00:00,  4.69it/s]
100%|██████████| 5506/5506 [19:11<00:00,  4.78it/s]
100%|██████████| 5506/5506 [19:01<00:00,  4.82it/s]
100%|██████████| 5506/5506 [19:01<00:00,  4.82it/s]
100%|██████████| 5506/5506 [19:07<00:00,  4.80it/s]
100%|██████████| 5506/5506 [19:07<00:00,  4.80it/s]
100%|██████████| 5506/5506 [19:10<00:00,  4.78it/s]
100%|██████████| 5506/5506 [19:09<00:00,  4.79it/s]
100%|██████████| 5506/5506 [19:04<00:00,  4.81it/s]
100%|██████████| 5506/5506 [19:09<00:00,  4.79it/s]
100%|██████████| 5506/5506 [19:12<00:00,  4.78it/s]


array([[0.54571588],
       [0.55199919],
       [0.54735038],
       [0.54294713],
       [0.54668375],
       [0.55087216],
       [0.55500596],
       [0.56002039],
       [0.56489546],
       [0.57168385],
       [0.58111416],
       [0.59094402]])

In [None]:
#sum    
#-------------------------------------------------------------------------------   
results_sum     = np.zeros((len(stsfull),12)) 
corrs_sum       = np.zeros((12,1))
print("Embedding strategy: sum")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and sum: ",i+1)
    print('\n')
    for s in tqdm(range(len(stsfull))):
        results_sum[s,i] = bert_semantic_similarity(str(stsfull.iloc[s,5]),
                                                str(stsfull.iloc[s,6]),
                                                "sum",
                                                i+1,
                                                False)
    d = pd.DataFrame({'Score_BERT': results_sum[:,i]})
    corrs_sum[i,0] = d["Score_BERT"].corr(stsfull["sim"])
corrs_sum

  0%|          | 0/5506 [00:00<?, ?it/s]

Embedding strategy: sum
-----------------------------------------------------------------------
Start from final hidden layer and sum:  1




100%|██████████| 5506/5506 [20:50<00:00,  4.40it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and sum:  2




100%|██████████| 5506/5506 [20:42<00:00,  4.43it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and sum:  3




100%|██████████| 5506/5506 [20:42<00:00,  4.43it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and sum:  4




100%|██████████| 5506/5506 [20:37<00:00,  4.45it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and sum:  5




100%|██████████| 5506/5506 [20:40<00:00,  4.44it/s]
  0%|          | 1/5506 [00:00<17:52,  5.13it/s]

Start from final hidden layer and sum:  6




100%|██████████| 5506/5506 [20:34<00:00,  4.46it/s]
  0%|          | 1/5506 [00:00<17:44,  5.17it/s]

Start from final hidden layer and sum:  7




100%|██████████| 5506/5506 [20:51<00:00,  4.40it/s]
  0%|          | 1/5506 [00:00<17:33,  5.23it/s]

Start from final hidden layer and sum:  8




100%|██████████| 5506/5506 [20:50<00:00,  4.40it/s]
  0%|          | 0/5506 [00:00<?, ?it/s]

Start from final hidden layer and sum:  9




100%|██████████| 5506/5506 [20:52<00:00,  4.40it/s]
  0%|          | 1/5506 [00:00<17:54,  5.13it/s]

Start from final hidden layer and sum:  10




100%|██████████| 5506/5506 [20:56<00:00,  4.38it/s]
  0%|          | 1/5506 [00:00<18:05,  5.07it/s]

Start from final hidden layer and sum:  11




100%|██████████| 5506/5506 [21:10<00:00,  4.33it/s]
  0%|          | 1/5506 [00:00<17:43,  5.17it/s]

Start from final hidden layer and sum:  12




100%|██████████| 5506/5506 [20:43<00:00,  4.43it/s]


array([[0.54571588],
       [0.55601959],
       [0.55286223],
       [0.55074705],
       [0.55759656],
       [0.56365983],
       [0.56978939],
       [0.57645434],
       [0.58311193],
       [0.59148558],
       [0.60128192],
       [0.61130922]])

In [None]:
FULLsurvey = [corrs_average,corrs_concat,corrs_sum]

# Survey dataset

### Load data + sentence preprocessing + normalize score values 

In [None]:
#read data
#-------------------------------------------------------------------------------
survey = pd.read_pickle('survey.pkl')

# text preprocessing
#-------------------------------------------------------------------------------
for index, row in survey.iterrows():
    survey.loc[index, "sent_1"] = preprocess_text(survey.loc[index, "sent_1"])
    survey.loc[index, "sent_2"] = preprocess_text(survey.loc[index, "sent_2"])

# due to survey design similarity scores are already in range (0,1)
# so no need to normalize like for previous datasets

  1%|          | 1/100 [00:00<00:16,  6.08it/s]

Embedding strategy: average
-----------------------------------------------------------------------
Start from final hidden layer and average:  1




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:15,  6.31it/s]

Start from final hidden layer and average:  2




100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
  1%|          | 1/100 [00:00<00:15,  6.56it/s]

Start from final hidden layer and average:  3




100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
  1%|          | 1/100 [00:00<00:15,  6.51it/s]

Start from final hidden layer and average:  4




100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
  1%|          | 1/100 [00:00<00:15,  6.22it/s]

Start from final hidden layer and average:  5




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:15,  6.35it/s]

Start from final hidden layer and average:  6




100%|██████████| 100/100 [00:20<00:00,  4.91it/s]
  1%|          | 1/100 [00:00<00:15,  6.57it/s]

Start from final hidden layer and average:  7




100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
  1%|          | 1/100 [00:00<00:16,  6.09it/s]

Start from final hidden layer and average:  8




100%|██████████| 100/100 [00:19<00:00,  5.00it/s]
  1%|          | 1/100 [00:00<00:15,  6.36it/s]

Start from final hidden layer and average:  9




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:15,  6.52it/s]

Start from final hidden layer and average:  10




100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
  1%|          | 1/100 [00:00<00:15,  6.59it/s]

Start from final hidden layer and average:  11




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:15,  6.26it/s]

Start from final hidden layer and average:  12




100%|██████████| 100/100 [00:19<00:00,  5.06it/s]
  1%|          | 1/100 [00:00<00:15,  6.37it/s]

Embedding strategy: concat
-----------------------------------------------------------------------


100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
100%|██████████| 100/100 [00:20<00:00,  4.93it/s]
100%|██████████| 100/100 [00:19<00:00,  5.08it/s]
100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
100%|██████████| 100/100 [00:20<00:00,  4.94it/s]
100%|██████████| 100/100 [00:20<00:00,  4.98it/s]
100%|██████████| 100/100 [00:19<00:00,  5.02it/s]
  1%|          | 1/100 [00:00<00:15,  6.46it/s]

Embedding strategy: sum
-----------------------------------------------------------------------
Start from final hidden layer and sum:  1




100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
  1%|          | 1/100 [00:00<00:15,  6.60it/s]

Start from final hidden layer and sum:  2




100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
  1%|          | 1/100 [00:00<00:15,  6.30it/s]

Start from final hidden layer and sum:  3




100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
  1%|          | 1/100 [00:00<00:15,  6.28it/s]

Start from final hidden layer and sum:  4




100%|██████████| 100/100 [00:20<00:00,  4.95it/s]
  1%|          | 1/100 [00:00<00:16,  6.15it/s]

Start from final hidden layer and sum:  5




100%|██████████| 100/100 [00:19<00:00,  5.01it/s]
  1%|          | 1/100 [00:00<00:15,  6.27it/s]

Start from final hidden layer and sum:  6




100%|██████████| 100/100 [00:19<00:00,  5.02it/s]
  1%|          | 1/100 [00:00<00:14,  6.66it/s]

Start from final hidden layer and sum:  7




100%|██████████| 100/100 [00:19<00:00,  5.05it/s]
  1%|          | 1/100 [00:00<00:15,  6.32it/s]

Start from final hidden layer and sum:  8




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:15,  6.60it/s]

Start from final hidden layer and sum:  9




100%|██████████| 100/100 [00:19<00:00,  5.04it/s]
  1%|          | 1/100 [00:00<00:16,  5.85it/s]

Start from final hidden layer and sum:  10




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]
  1%|          | 1/100 [00:00<00:16,  6.18it/s]

Start from final hidden layer and sum:  11




100%|██████████| 100/100 [00:19<00:00,  5.00it/s]
  1%|          | 1/100 [00:00<00:16,  6.10it/s]

Start from final hidden layer and sum:  12




100%|██████████| 100/100 [00:19<00:00,  5.03it/s]


### BERT

In [None]:
# NLP MODEL
#-------------------------------------------------------------------------------
#average
results_average     = np.zeros((len(survey),12)) 
corrs_average       = np.zeros((12,1))
print("Embedding strategy: average")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and average: ",i+1)
    print('\n')
    for s in tqdm(range(len(survey))):
            results_average[s,i] = bert_semantic_similarity(str(survey.iloc[s,0]),
                                                    str(survey.iloc[s,1]),
                                                    "average",
                                                    i+1,
                                                    False)
    d = pd.DataFrame({'Score_BERT': results_average[:,i]})
    corrs_average[i,0] = d["Score_BERT"].corr(survey["score"])

#concat  
results_concat    = np.zeros((len(survey),12)) 
corrs_concat       = np.zeros((12,1))
print("Embedding strategy: concat")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and concatenate: ",i+1)
    print('\n')
    for s in tqdm(range(len(survey))):
        results_concat[s,i] = bert_semantic_similarity(str(survey.iloc[s,0]),
                                                str(survey.iloc[s,1]),
                                                "concat",
                                                i+1,
                                                False)
    d = pd.DataFrame({'Score_BERT': results_concat[:,i]})
    corrs_concat[i,0] = d["Score_BERT"].corr(survey["score"])

#sum       
results_sum     = np.zeros((len(survey),12)) 
corrs_sum       = np.zeros((12,1))
print("Embedding strategy: sum")
print("-----------------------------------------------------------------------")
for i in range(12):
    print("Start from final hidden layer and sum: ",i+1)
    print('\n')
    for s in tqdm(range(len(survey))):
        results_sum[s,i] = bert_semantic_similarity(str(survey.iloc[s,0]),
                                                str(survey.iloc[s,1]),
                                                "sum",
                                                i+1,
                                                False)
    d = pd.DataFrame({'Score_BERT': results_sum[:,i]})
    corrs_sum[i,0] = d["Score_BERT"].corr(survey["score"])

Csurvey = [corrs_average,corrs_concat,corrs_sum]

In [None]:
corrs_average

array([[0.30095393],
       [0.30423126],
       [0.30181611],
       [0.29714807],
       [0.28552133],
       [0.27008947],
       [0.25636052],
       [0.24018106],
       [0.22346263],
       [0.20668065],
       [0.19100335],
       [0.17729887]])

In [None]:
corrs_concat

array([[0.30095393],
       [0.30863485],
       [0.3060348 ],
       [0.30329809],
       [0.29734291],
       [0.28909291],
       [0.28091583],
       [0.27090136],
       [0.26094301],
       [0.25090895],
       [0.24028653],
       [0.23062581]])

In [None]:
corrs_sum

array([[0.30095393],
       [0.30423126],
       [0.30181612],
       [0.29714807],
       [0.28552127],
       [0.27008968],
       [0.25636034],
       [0.24018106],
       [0.22346223],
       [0.20668128],
       [0.19100386],
       [0.17729893]])