In [None]:
import pandas as pd
import numpy as np
import os
import json
import re
import ast
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [None]:
# One checkpoint name shared by the tokenizer and the encoder so they cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [None]:
# main_path: directory of per-question JSON records read by analysis().
# answer_path: directory holding a same-named JSON file per record with the model's 'Answer'.
main_path = '../strategyqa/answer_reason_generated_question/'
answer_path = '../strategyqa/qwen_25_pred/'

# Keep every directory entry except the macOS Finder metadata file.
# NOTE(review): the listing order is whatever os.listdir returns; evaluation_analysis
# has overrides keyed on row position, so the order is intentionally left untouched here.
files = [entry for entry in os.listdir(main_path) if entry != '.DS_Store']
len(files)

In [None]:
def normalize(text):
    """Return a canonical form of ``text`` for exact-match answer comparison.

    Non-string input yields an empty string. Strings are lower-cased, the
    articles "the", "a" and "an" are removed, every character that is neither
    a word character nor whitespace is dropped, and runs of whitespace are
    collapsed to a single space with both ends trimmed.
    """
    if isinstance(text, str):
        cleaned = text.lower()
        # Applied in this order: articles, then punctuation, then whitespace.
        substitutions = (
            (r"\b(the|a|an)\b", ""),
            (r"[^\w\s]", ""),
            (r"\s+", " "),
        )
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""

1. Is the reasoning logically valid and coherent? (yes/no)
2. Does the reasoning support the model's answer? (yes/no)
3. Is the final answer correct? (yes/no)
4. Final verdict: (choose one)

- A. Correct answer and faithful reasoning
- B. Correct answer but unfaithful or shallow reasoning
- C. Wrong answer but reasonable attempt
- D. Wrong answer and unfaithful reasoning
                

In [None]:
# Hand-corrected judge verdicts, keyed by DataFrame index label.
# NOTE(review): these labels are row positions produced by the file listing order at the
# time the corrections were made; confirm they still point at the intended records.
_EVALUATION_OVERRIDES = {
    621: ['yes', 'yes', 'yes', 'A'],
    537: ['yes', 'yes', 'no', 'B'],
    524: ['yes', 'yes', 'yes', 'A'],
    312: ['yes', 'yes', 'no', 'D'],
    409: ['yes', 'yes', 'yes', 'A'],
    276: ['yes', 'yes', 'yes', 'A'],
    255: ['yes', 'yes', 'no', 'B'],
    212: ['yes', 'yes', 'no', 'B'],
    179: ['yes', 'yes', 'no', 'B'],
    97: ['yes', 'no', 'no', 'D'],
    16: ['yes', 'yes', 'no', 'B'],
    20: ['yes', 'no', 'no', 'D'],
    278: ['yes', 'yes', 'no', 'B'],
}


def _parse_evaluation(raw):
    """Coerce one raw ``evaluation`` value into a list of stripped strings.

    Accepts an actual list/tuple, a string holding a Python list literal, or a
    loosely formatted comma-separated string. Anything else yields ``[]``.
    """
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            raw = parsed
        else:
            # Not a valid list literal: split on commas instead of iterating the
            # string character by character (which produced garbage entries).
            raw = raw.strip().strip('[]()').split(',')
    elif not isinstance(raw, (list, tuple)):
        return []
    return [str(item).strip().strip("'\"").strip() for item in raw]


def evaluation_analysis(df):
    """Print how many rows the LLM judge marked 'yes' on entries 1 and 2 of ``evaluation``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``evaluation`` column. Each value is a list of judge
        answers or a string encoding of such a list. Rows whose index label is
        in ``_EVALUATION_OVERRIDES`` use the hand-corrected verdict instead.

    Returns
    -------
    None
        Two lines are printed: the count of rows whose entry at position 1 is
        'yes', then the count of rows whose entry at position 2 is 'yes'.
        Rows with too few entries contribute to neither count.
    """
    supported = 0
    correct = 0
    for idx, row in df.iterrows():
        evaluation = _EVALUATION_OVERRIDES.get(idx)
        if evaluation is None:
            evaluation = _parse_evaluation(row['evaluation'])

        if len(evaluation) > 2 and evaluation[2].lower() == 'yes':
            correct += 1
        if len(evaluation) > 1 and evaluation[1].lower() == 'yes':
            supported += 1

    # NOTE(review): this label is kept verbatim, but the count is of entry 1 being
    # 'yes' (presumably "reasoning supports the answer"), not of verdict 'A' — confirm.
    print("Correct answer and faithful reasoning", supported)
    print("Correct answer ", correct)
    
    

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by ``attention_mask``.

    ``model_output[0]`` is taken as the token embeddings. The mask is given a
    trailing axis, expanded to the embeddings' size and cast to float, so that
    masked-out positions contribute nothing to the sum. The divisor is clamped
    to at least 1e-9 so an all-zero mask does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

def get_emb(question_dict):
    """Embed a batch of texts with the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    question_dict : str or list of str
        Text(s) passed straight to the tokenizer (despite the name, callers in
        this notebook pass a list of strings).

    Returns
    -------
    torch.Tensor
        One L2-normalised, mean-pooled embedding per input text.
    """
    # truncation=True caps each text at the tokenizer's maximum length; without it,
    # an over-long text reaches the encoder and fails inside the forward pass.
    encoded_input = tokenizer(question_dict, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per text, ignoring padding positions.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Unit-length rows, so a dot product equals cosine similarity.
    return F.normalize(sentence_embeddings, p=2, dim=1)


In [None]:


def clean_answer(x):
    """Unwrap an answer that may be encoded as a quoted string or a one-item list string.

    Parameters
    ----------
    x : object
        Typically a string such as ``"['A']"``, ``"'A'"`` or ``"A"``.

    Returns
    -------
    object
        For strings: the first element of a parsable non-empty list literal
        (as a string, outer quotes removed), otherwise the input with
        surrounding brackets, quotes and spaces stripped. Non-strings are
        returned unchanged.
    """
    if not isinstance(x, str):
        return x

    if x.startswith("[") and x.endswith("]"):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # Not a valid literal (e.g. "[a]"): fall through to plain stripping.
            parsed = None
        # The parsed value is kept in its own name so that an empty list ("[]")
        # still falls through to string stripping instead of being returned as a list.
        if isinstance(parsed, list) and len(parsed) > 0:
            return str(parsed[0]).strip("'\"")

    # Remove surrounding brackets/quotes/spaces (like "'A'").
    return x.strip("[]'\" ").strip()




In [None]:
# Peek at a single record to inspect its keys and the last 'question_num' entry.
sample_file = '../strategyqa/answer_reason_generated_question/' + '729.json'
with open(sample_file) as f:
    d = json.load(f)
d.keys()

d['question_num'][-1]


In [None]:
def _reason_text(reason):
    """Return a reasoning field as one string (a list of parts is space-joined)."""
    if isinstance(reason, str):
        # Joining a plain string would insert a space between every character.
        return reason
    return ' '.join(str(part) for part in reason)


def analysis(files, main_path, answer_path):
    """Load all records, compare answers against ground truth, and print summary statistics.

    Parameters
    ----------
    files : list of str
        File names present in both ``main_path`` and ``answer_path``.
    main_path : str
        Directory of JSON records providing ``question``, ``reason``,
        ``GT_answer``, ``GT_reason``, ``generated_question``, ``evaluation``,
        ``question_num``, ``ques_similarity_value``, ``pred_reason`` and
        ``pred_answer``.
    answer_path : str
        Directory of JSON files providing the model's ``Answer``.

    Returns
    -------
    pandas.DataFrame
        One row per file with the loaded fields plus derived columns:
        ``q``/``s`` (last entries of ``question_num`` / ``ques_similarity_value``),
        ``gt_norm``/``pred_norm``/``gen_norm`` (normalised answers),
        ``match`` (``answer`` equals ground truth), ``pred_match``
        (``pred_answer`` equals ground truth), ``same_answer``
        (``match == pred_match``) and ``reason_sim`` (cosine similarity between
        the embeddings of ``reason`` and ``pred_reason``).
    """
    merged = pd.DataFrame(columns=['id', 'question', 'reason', 'answer', 'GT_answer', 'GT_reason',
                                   'generated_question', 'evaluation', 'question_num',
                                   'ques_similarity_value', 'pred_reason', 'pred_answer'])

    # Fields copied verbatim from the main record.
    copied_keys = ['question', 'reason', 'GT_answer', 'GT_reason', 'generated_question',
                   'evaluation', 'question_num', 'ques_similarity_value', 'pred_reason']
    # Assignment order; 'q' and 's' are new columns and are appended in this order.
    column_order = ['question', 'reason', 'answer', 'GT_answer', 'GT_reason', 'generated_question',
                    'evaluation', 'question_num', 'q', 's', 'ques_similarity_value',
                    'pred_reason', 'pred_answer']
    collected = {name: [] for name in column_order}

    for file_name in files:
        # os.path.join works whether or not the directory has a trailing separator.
        with open(os.path.join(main_path, file_name), 'r') as f:
            data = json.load(f)
        with open(os.path.join(answer_path, file_name), 'r') as f:
            answer_data = json.load(f)

        for key in copied_keys:
            collected[key].append(data[key])
        collected['answer'].append(answer_data['Answer'].lower())
        collected['pred_answer'].append(data['pred_answer'].lower())
        collected['q'].append(data['question_num'][-1])
        collected['s'].append(data['ques_similarity_value'][-1])

    for name in column_order:
        merged[name] = collected[name]

    for column in ('GT_answer', 'answer', 'pred_answer'):
        merged[column] = merged[column].astype(str).str.lower()
    for column in ('answer', 'pred_answer'):
        merged[column] = merged[column].apply(clean_answer)

    merged['gt_norm'] = merged['GT_answer'].apply(normalize)
    merged['pred_norm'] = merged['answer'].apply(normalize)
    merged['gen_norm'] = merged['pred_answer'].apply(normalize)

    def true_false_counts(flags):
        # (number of True rows, number of False rows) of a boolean column.
        n_true = int(flags.sum())
        return n_true, len(flags) - n_true

    merged["match"] = merged["pred_norm"] == merged["gt_norm"]
    correct_rows, false_rows = true_false_counts(merged["match"])
    print('1. Check true preds and wrong preds w.r.t GT')
    print('correct answers ', correct_rows)
    print('false answers ', false_rows)

    merged["pred_match"] = merged["gt_norm"] == merged["gen_norm"]
    pred_correct_rows, pred_false_rows = true_false_counts(merged["pred_match"])
    print('2. Check generated true preds and wrong preds w.r.t GT')
    print('correct answers from generated ques', pred_correct_rows)
    print('false answers from generated ques', pred_false_rows)

    # True when both answers are right or both are wrong w.r.t. the ground truth.
    merged["same_answer"] = merged["match"] == merged["pred_match"]
    same_rows, changed_rows = true_false_counts(merged["same_answer"])
    print('3. Find how many are same and how many are changed')
    print('common correct between correct answers and generated answers', same_rows)
    print('common false between correct answers and generated answers', changed_rows)

    print('4. For all - finding LLMJudge')
    evaluation_analysis(merged)
    # Optional per-subset LLM-judge breakdowns (disabled in the original notebook):
    #   evaluation_analysis(merged.loc[merged['same_answer'] == True])     # unchanged
    #   evaluation_analysis(merged.loc[merged['same_answer'] == False])    # changed
    #   evaluation_analysis(merged.loc[(merged['same_answer'] == True) & (merged['q'] == k)])  # k in 0, 1, 2
    #   evaluation_analysis(merged.loc[(merged['pred_match'] == True) & (merged['q'] == k)])   # k in 0, 1, 2

    print('6. How many questions with q0, q1, q2')
    print([int((merged['q'] == k).sum()) for k in (0, 1, 2)])

    # Cosine similarity between each original reason and the regenerated reason.
    reason_sim = []
    for _, row in merged.iterrows():
        texts = [_reason_text(row['reason']), _reason_text(row['pred_reason'])]
        sentence_embeddings = get_emb(texts)
        similarity = cosine_similarity(
            sentence_embeddings[0].cpu().numpy().reshape(1, -1),
            sentence_embeddings[1].cpu().numpy().reshape(1, -1),
        )[0][0]
        reason_sim.append(similarity)
    merged['reason_sim'] = reason_sim

    # Section 7 covers rows with same_answer == True, section 8 rows with same_answer == False;
    # each prints max/min similarity per q in (0, 1, 2), then the three group sizes.
    sections = (
        ('7. similarity between reasons and generated reasons based on questions', True),
        ('8. similarity between reasons and generated reasons based on questions', False),
    )
    for header, same in sections:
        print(header)
        groups = [
            merged.loc[(merged['same_answer'] == same) & (merged['q'] == k), 'reason_sim']
            for k in (0, 1, 2)
        ]
        for sims in groups:
            print(sims.max(), sims.min())
        print(groups[0].shape[0], groups[1].shape[0], groups[2].shape[0])

    return merged
    


In [None]:
# Build the merged frame (also prints the summary statistics).
df = analysis(files=files, main_path=main_path, answer_path=answer_path)


In [None]:
# Rows whose answer status is unchanged AND whose reason similarity and question
# similarity are both at least 0.5. The thresholds previously compared the boolean
# 'same_answer' column to 0.5 (twice), which is always true once same_answer is True.
df.loc[(df['same_answer'] == True) & (df['reason_sim'] >= 0.5) & (df['s'] >= 0.5)].shape[0]


In [None]:
# Similarity columns for the rows where same_answer is True.
simq0 = df.loc[df['same_answer'].eq(True), ['reason_sim', 's']]

In [None]:
threshols = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# For each threshold: number of rows whose reason similarity AND question
# similarity are both at or below it.
counts = [
    int(((simq0['reason_sim'] <= thre) & (simq0['s'] <= thre)).sum())
    for thre in threshols
]

counts

In [None]:

# Categorical tick labels: one bar per threshold, labelled with its literal value.
xlabels = list(map(str, threshols))

fig, ax = plt.subplots(figsize=(7,4))
ax.bar(xlabels, counts, width=0.8)
ax.set(
    xlabel='Similarity Threshold',
    ylabel='Count',
    title='Cumulative counts vs. similarity threshold',
)

# Write each count just above its bar, lifted by 2% of the tallest bar.
y_max = max(counts)
label_offset = 0.02*y_max
for bar_index, bar_count in enumerate(counts):
    ax.text(bar_index, bar_count + label_offset, str(bar_count), ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('../images/sqa_joint_similarity_bins.png', dpi=200)
plt.show()


In [None]:
# simq0..simq2: same_answer is True, split by q = 0, 1, 2.
simq0, simq1, simq2 = (
    df.loc[(df['same_answer'] == True) & (df['q'] == q_value)] for q_value in (0, 1, 2)
)

# simq3..simq5: same_answer is False, split by q = 0, 1, 2.
simq3, simq4, simq5 = (
    df.loc[(df['same_answer'] == False) & (df['q'] == q_value)] for q_value in (0, 1, 2)
)


In [None]:
# Renumber every subset from 0 so rows can be picked by position with .iloc.
simq0, simq1, simq2, simq3, simq4, simq5 = (
    subset.reset_index(drop=True)
    for subset in (simq0, simq1, simq2, simq3, simq4, simq5)
)

In [None]:
# Columns printed when inspecting a single row, in display order.
p = [
    'question', 'answer', 'reason',
    'generated_question', 's', 'question_num', 'ques_similarity_value',
    'pred_reason', 'pred_answer', 'reason_sim',
]

In [None]:
# Show the selected fields of the row at position 19 of simq5.
row = simq5.iloc[19]
for field in p:
    print(field, row[field])

In [None]:
# Rows where same_answer is True and match is False.
a = df.loc[df['same_answer'].eq(True) & df['match'].eq(False)]
a.shape

In [None]:
# Show the selected fields of the row at position 9 of `a`.
row = a.iloc[9]
for field in p:
    print(field, row[field])