In [None]:
import os
import re
import json
import ast
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import ast
import pandas as pd


In [None]:
# Directory holding the CSV files with the generated questions.
path = '../self_contra/wino_bias_ques_generate_gpt3.5/'
# os.listdir returns entries in arbitrary order; sort so that files[0] and the
# processing order are the same on every machine and every run.
files = sorted(os.listdir(path))
files

In [None]:
# Drop the macOS Finder metadata entry if it is present. The membership check
# keeps this cell safe to re-run and safe on machines where the file does not
# exist (a bare list.remove would raise ValueError in both cases).
if '.DS_Store' in files:
    files.remove('.DS_Store')
len(files)

In [None]:
# Fetch the tokenizer and the encoder for the same checkpoint from the
# HuggingFace Hub; the checkpoint id is named once so the two cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [None]:
# Show the first file name in the listing (the file the next cell loads as a sample).
files[0]

In [None]:
# Read the first listed file as a sample and display its column names.
sample_csv = path + files[0]
data = pd.read_csv(sample_csv)
data.columns

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask tensor; after a trailing axis is added it is
            expanded to the size of the token embeddings, so positions with
            value 0 contribute nothing to the average.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask total over
        axis 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

def get_emb(question_dict):
    """Embed a batch of sentences with the notebook-level tokenizer and model.

    Args:
        question_dict: The text(s) handed to the tokenizer. Despite the name,
            the visible caller passes a list of sentence strings.

    Returns:
        A tensor of mean-pooled embeddings, L2-normalized along dim 1, so a
        dot product between two rows equals their cosine similarity.
    """
    # truncation=True caps over-long inputs at the tokenizer's maximum length
    # instead of letting them reach the model and fail; inputs within the limit
    # are encoded the same as before.
    encoded_input = tokenizer(question_dict, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Average token embeddings, ignoring padding positions.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Scale every embedding to unit L2 norm.
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings


In [None]:
# Minimum cosine similarity for a generated question to count as a match.
SIMILARITY_THRESHOLD = 0.5
# Keys of the generated-question dict, in the order used for `question_num`
# (0 -> 'question1', 1 -> 'question2', 2 -> 'question3').
QUESTION_KEYS = ('question1', 'question2', 'question3')
# Directory that receives one annotated CSV per input file.
OUTPUT_DIR = '../self_contra/wino_bias_question_similarity/'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for j, file_name in enumerate(files):
    print(j, file_name)
    data = pd.read_csv(path + file_name)
    similarity_value = []
    question_num = []
    for _, row in data.iterrows():
        gene_ques = row['generated_questions']
        ques = row['question']
        # Takes the text between the first and second '.' of the 'question' cell.
        # NOTE(review): raises IndexError if the cell contains no '.' -- confirm
        # that every row has the expected "<context>. <question>" layout.
        question = ques.split('.')[1]
        # The cell stores a Python-literal dict of generated questions.
        question_dict = ast.literal_eval(gene_ques)

        # Compare the original question only against the generated question
        # TEXTS. Iterating the dict itself would also add its keys
        # ('question1', ...) as candidates and let question_num point past the
        # three real questions.
        temp = [question] + [question_dict[key] for key in QUESTION_KEYS]
        sentence_embeddings = get_emb(temp).cpu().numpy()

        # Row 0 is the original question; one call scores it against the rest.
        temp_sim = list(cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:])[0])
        max_sim = max(temp_sim)
        if max_sim >= SIMILARITY_THRESHOLD:
            # Record the best score and the 0-based position of the best match.
            similarity_value.append(max_sim)
            question_num.append(temp_sim.index(max_sim))
        else:
            # -1 in both columns marks "no generated question was similar enough".
            similarity_value.append(-1)
            question_num.append(-1)

    data['similarity_value'] = similarity_value
    data['question_num'] = question_num

    data.to_csv(OUTPUT_DIR + file_name)
            
            

In [None]:
from bert_score import score

# Toy BERTScore run: score one candidate sentence against one reference sentence.
candidates = [
    "The man is playing guitar.",
]
references = [
    "A man plays the guitar.",
]

P, R, F1 = score(candidates, references, lang="en", verbose=True)

# Each label is padded so the printed values line up in one column.
for label, metric in (("Precision: ", P), ("Recall:    ", R), ("F1 score:  ", F1)):
    print(f"{label}{metric.item():.4f}")
