In [2]:
from sentence_transformers import SentenceTransformer, util
import json
import pandas as pd

In [4]:
# Load the pretrained MiniLM sentence encoder.
# No explicit device is passed: SentenceTransformer then selects a GPU when
# one is usable and falls back to CPU otherwise, so this cell no longer fails
# on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
# Sample inputs for a first smoke test of the encoder.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode the sample sentences; the result is paired with `sentences`
# one-to-one when printed in the next cell.
sentence_embeddings = model.encode(sentences)

In [6]:
# Print each sentence next to its embedding, with a blank line between entries.
for text, vector in zip(sentences, sentence_embeddings):
    print("Sentence:", text)
    # end="\n\n" terminates the line and emits the blank separator line.
    print("Embedding:", vector, end="\n\n")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173859e-02 -4.28515673e-02 -1.56286396e-02  1.40537666e-02
  3.95537689e-02  1.21796280e-01  2.94333734e-02 -3.17523964e-02
  3.54959518e-02 -7.93140158e-02  1.75878331e-02 -4.04369831e-02
  4.97259945e-02  2.54912488e-02 -7.18699917e-02  8.14968422e-02
  1.47068244e-03  4.79627140e-02 -4.50335853e-02 -9.92174894e-02
 -2.81769391e-02  6.45046011e-02  4.44670543e-02 -4.76217195e-02
 -3.52952220e-02  4.38671522e-02 -5.28565757e-02  4.33055917e-04
  1.01921491e-01  1.64072458e-02  3.26996818e-02 -3.45986970e-02
  1.21339280e-02  7.94871077e-02  4.58343420e-03  1.57778468e-02
 -9.68206860e-03  2.87626330e-02 -5.05806319e-02 -1.55793764e-02
 -2.87907086e-02 -9.62278619e-03  3.15556638e-02  2.27349475e-02
  8.71449336e-02 -3.85027565e-02 -8.84718895e-02 -8.75497889e-03
 -2.12343149e-02  2.08924133e-02 -9.02078152e-02 -5.25732450e-02
 -1.05638457e-02  2.88311075e-02 -1.61454547e-02  6.17839070e-03
 -1.23234

In [7]:
sentences = ['The cat sits outside',
             'A man is playing guitar',
             'I love pasta',
             'The new movie is awesome',
             'The cat plays in the garden',
             'A woman watches TV',
             'The new movie is so great',
             'Do you like pizza?']

#Compute embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

#Compute cosine-similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

#Convert the score matrix to nested Python floats once. Storing 0-dim tensors
#in `pairs` would make the sort below compare tensors element by element
#(a device round-trip per comparison when the tensors live on a GPU).
score_rows = cosine_scores.tolist()

#Collect every unordered pair (i < j) together with its similarity score
pairs = [
    {'index': [i, j], 'score': score_rows[i][j]}
    for i in range(len(score_rows) - 1)
    for j in range(i + 1, len(score_rows))
]

#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

#Show the ten most similar pairs
for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900


In [14]:
# Load the ChaosNLI SNLI split: a JSON Lines file, one JSON object per line.
# Lines are streamed instead of splitting the whole text on '\n' and dropping
# the final piece, which lost the last record whenever the file did not end
# with a newline. Blank lines are skipped rather than passed to json.loads.
with open('data/chaosNLI/data/chaosNLI_v1.0/chaosNLI_snli.jsonl', 'r', encoding='utf-8') as f:
    data_dict = [json.loads(line) for line in f if line.strip()]

# One row per parsed record.
df_snli_full = pd.DataFrame(data_dict)

In [15]:
# Load the ChaosNLI MNLI-matched split: a JSON Lines file, one JSON object per
# line. Lines are streamed instead of splitting the whole text on '\n' and
# dropping the final piece, which lost the last record whenever the file did
# not end with a newline. Blank lines are skipped rather than passed to
# json.loads.
with open('data/chaosNLI/data/chaosNLI_v1.0/chaosNLI_mnli_m.jsonl', 'r', encoding='utf-8') as f:
    data_dict = [json.loads(line) for line in f if line.strip()]

# One row per parsed record.
df_mnli_full = pd.DataFrame(data_dict)

In [46]:
# Stack the SNLI rows on top of the MNLI-m rows. Each frame keeps its own row
# labels, so labels repeat across the two sources in the combined frame.
df_chaos_full = pd.concat(
    objs=[df_snli_full, df_mnli_full],
    axis=0,
)

In [47]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then replace the old row labels with a fresh 0..n-1 index.
df_chaos_full_rand = (
    df_chaos_full
    .sample(frac=1, random_state=100)
    .reset_index(drop=True)
)

In [42]:
# Lift the premise and hypothesis strings out of each row's nested `example`
# dict into their own top-level columns.
# NOTE(review): in file order this cell comes after the shuffle that creates
# df_chaos_full_rand, so that frame does not receive these columns - confirm
# whether this cell should run before the shuffle.
for field in ('premise', 'hypothesis'):
    df_chaos_full[field] = [example[field] for example in df_chaos_full['example']]

In [49]:
# Display the shuffled frame as this cell's output.
df_chaos_full_rand

Unnamed: 0,uid,label_counter,majority_label,label_dist,label_count,entropy,example,old_label,old_labels
0,111680e,"{'e': 91, 'n': 7, 'c': 2}",e,"[0.91, 0.07, 0.02]","[91, 7, 2]",0.505248,"{'uid': '111680e', 'premise': 'He dismounted a...",e,"[entailment, entailment, entailment, neutral, ..."
1,8221794333.jpg#0r1n,"{'c': 14, 'e': 22, 'n': 64}",n,"[0.22, 0.64, 0.14]","[22, 64, 14]",1.289752,"{'uid': '8221794333.jpg#0r1n', 'premise': 'A y...",e,"[neutral, entailment, entailment, entailment, ..."
2,4637931300.jpg#2r1n,"{'c': 10, 'n': 90}",n,"[0.0, 0.9, 0.1]","[0, 90, 10]",0.468996,"{'uid': '4637931300.jpg#2r1n', 'premise': 'A m...",n,"[neutral, neutral, neutral, contradiction, con..."
3,36715n,"{'n': 80, 'c': 18, 'e': 2}",n,"[0.02, 0.8, 0.18]","[2, 80, 18]",0.815727,"{'uid': '36715n', 'premise': 'Jon twisted the ...",n,"[neutral, neutral, contradiction, neutral, con..."
4,4116163419.jpg#0r2n,"{'e': 2, 'n': 95, 'c': 3}",n,"[0.02, 0.95, 0.03]","[2, 95, 3]",0.334944,"{'uid': '4116163419.jpg#0r2n', 'premise': 'A m...",e,"[neutral, neutral, entailment, entailment, ent..."
...,...,...,...,...,...,...,...,...,...
3108,105769e,"{'n': 25, 'e': 71, 'c': 4}",e,"[0.71, 0.25, 0.04]","[71, 25, 4]",1.036572,"{'uid': '105769e', 'premise': 'yeah yeah i i w...",n,"[entailment, neutral, neutral, entailment, neu..."
3109,2814952319.jpg#0r1n,"{'n': 95, 'c': 5}",n,"[0.0, 0.95, 0.05]","[0, 95, 5]",0.286397,"{'uid': '2814952319.jpg#0r1n', 'premise': 'Nin...",n,"[neutral, neutral, contradiction, contradictio..."
3110,21297n,"{'n': 74, 'c': 2, 'e': 24}",n,"[0.24, 0.74, 0.02]","[24, 74, 2]",0.928470,"{'uid': '21297n', 'premise': 'He was crying li...",n,"[neutral, neutral, entailment, neutral, entail..."
3111,11971n,"{'e': 52, 'n': 44, 'c': 4}",e,"[0.52, 0.44, 0.04]","[52, 44, 4]",1.197478,"{'uid': '11971n', 'premise': 'In a six-year st...",e,"[neutral, neutral, entailment, entailment, ent..."


In [51]:
# Look up one uid in the unshuffled frame; the result is this cell's output.
df_chaos_full.loc[df_chaos_full['uid'].eq('8221794333.jpg#0r1n')]

Unnamed: 0,uid,label_counter,majority_label,label_dist,label_count,entropy,example,old_label,old_labels
221,8221794333.jpg#0r1n,"{'c': 14, 'e': 22, 'n': 64}",n,"[0.22, 0.64, 0.14]","[22, 64, 14]",1.289752,"{'uid': '8221794333.jpg#0r1n', 'premise': 'A y...",e,"[neutral, entailment, entailment, entailment, ..."
