In [1]:
import json
import os

In [2]:
# Load two files:
model_outputs = json.load(open('/Users/venkatakesavvenna/Sem6/RSAI/Mid_Project/dataset/nli_output/chatgpt_checker_final.json'))
human_outputs = json.load(open('/Users/venkatakesavvenna/Sem6/RSAI/Mid_Project/dataset/human_annotations/zero_context/nq_chatgpt_answers.json'))

In [3]:
# Print the keys 
print(model_outputs[0].keys())
print(human_outputs[0].keys())

dict_keys(['id', 'response', 'triplets', 'reference', 'Y', 'ys'])
dict_keys(['id', 'response', 'claude2_response_kg'])


In [4]:
# Make an array of the form [triplet, model_output, human_output]
final_array = []

# Check the ids, that are common in both the files
for model_output in model_outputs:
    for human_output in human_outputs:
        if model_output['id'] == human_output['id']:
            final_array.append([model_output["triplets"], model_output["ys"], [entry["human_label"] for entry in human_output["claude2_response_kg"]]])
            

In [5]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Categories
categories = ['Neutral', 'Entailment', 'Contradiction']

# Initialize confusion matrix
conf_matrix = np.zeros((len(categories), len(categories)))

# Count occurrences of each combination of model and human outputs
for entry in final_array:
    model_outputs = entry[1]
    human_outputs = entry[2]

    for model_output, human_output in zip(model_outputs, human_outputs):
        # Get indices of categories
        model_index = categories.index(model_output)
        human_index = categories.index(human_output)

        # Update confusion matrix
        conf_matrix[model_index][human_index] += 1

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Calculate accuracy
accuracy = np.trace(conf_matrix) / np.sum(conf_matrix)
print(f"\nAccuracy: {accuracy}")

# Calculate precision, recall, and F1-score for each category
for i, category in enumerate(categories):
    true_positives = conf_matrix[i][i]
    false_positives = sum(conf_matrix[:, i]) - true_positives
    false_negatives = sum(conf_matrix[i, :]) - true_positives

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)

    print(f"\nCategory: {category}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1_score}")


Confusion Matrix:
[[ 45.  10.  15.]
 [ 31. 144.  11.]
 [ 13.   8.  29.]]

Accuracy: 0.7124183006535948

Category: Neutral
Precision: 0.5056179775280899
Recall: 0.6428571428571429
F1-score: 0.5660377358490566

Category: Entailment
Precision: 0.8888888888888888
Recall: 0.7741935483870968
F1-score: 0.8275862068965517

Category: Contradiction
Precision: 0.5272727272727272
Recall: 0.58
F1-score: 0.5523809523809523


In [6]:
# Analysis for Alpaca-7B model
human_outputs = json.load(open('dataset/human_annotations/zero_context/nq_llama2_70b_chat_answers.json'))
model_outputs = json.load(open('dataset/nli_output/llama-70b_checker_final.json'))


# Make an array of the form [triplet, model_output, human_output]
final_array = []

# Take the majority votes for the human outputs
for model_output in model_outputs:
    for human_output in human_outputs:
        if model_output['id'] == human_output['id']:
            model_final = [max(set(model_output["ys"]), key = model_output["ys"].count)]
            human_final = [max(set([entry["human_label"] for entry in human_output["claude2_response_kg"]]), key = [entry["human_label"] for entry in human_output["claude2_response_kg"]].count)]

            final_array.append([model_output["triplets"], model_final, human_final])

# Check if the models, human outputs are matching
matching = 0

for entry in final_array:
    if entry[1] == entry[2]:
        matching += 1

print(f"Matching: {matching} out of {len(final_array)}")

accuracy = matching * 100 / len(final_array)
print(f"Accuracy: {accuracy}%")

Matching: 74 out of 94
Accuracy: 78.72340425531915%


In [7]:
# on the human annotated data count the number of entailmet=nts, neutral and contradiction
entailments = 0
contradictions = 0
neutrals = 0

for entry in final_array:
    print(entry[2])
    if entry[2] == ["Entailment"]:
        entailments += 1
    elif entry[2] == ["Contradiction"]:
        contradictions += 1
    else:
        neutrals += 1

print(f"Entailments: {entailments*100/len(final_array)}")
print(f"Contradictions: {contradictions*100/len(final_array)}")
print(f"Neutrals: {neutrals*100/len(final_array)}")


['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Contradiction']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Contradiction']
['Contradiction']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Contradiction']
['Contradiction']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Contradiction']
['Neutral']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Neutral']
['Contradiction']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Neutral']
['Entailment']
['Neutral']
['Neutral']
['Neutral']
['Neutral'