# We remove the keys from the JSON objects, as they might impact the scores.

In [None]:
import pandas as pd

In [None]:
# !pip install openpyxl

In [None]:
# Epoch count embedded in the input (.xlsx) and output (.json) file names below.
NUM_EPOCHS = 10

In [None]:
# Load the evaluation table for this epoch count; the cells below use its
# "Prompts", "Response" and "Output" columns.
results = pd.read_excel(f"mistral_finetuned_output_{NUM_EPOCHS}epochs.xlsx")

In [None]:
# Preview the first three rows as loaded, before any cleanup.
results.head(3)

In [None]:
# List the column names available in the loaded table.
print(results.columns)

Prompts: input <br>
Response: Test cases of reference dataset <br>
Output: Test cases generated by the fine-tuned Mistral model

In [None]:
# Spot-check one reference cell (row label 10) before whitespace cleanup.
print(results.loc[10, "Response"])

## Remove \n and \t

In [None]:
def remove_whitespace(value):
    """Trim `value`, then turn every remaining newline or tab into a single space."""
    trimmed = value.strip()
    # One translation pass maps both characters to a space.
    return trimmed.translate(str.maketrans({"\n": " ", "\t": " "}))

In [None]:
# Flatten newlines and tabs in every text column so each cell becomes a single line.
# The loop variable is `column` rather than `type`, so the builtin `type` is not
# shadowed for the rest of the notebook.
for column in ["Prompts", "Response", "Output"]:
    results[column] = results[column].apply(remove_whitespace)

In [None]:
# Preview the first three rows again, now with newlines and tabs flattened.
results.head(3)

In [None]:
# (rows, columns) of the table; note the JSON-parsing loop below iterates over a fixed 200 rows.
results.shape

# Remove fixed keys

In [None]:
# Key-stripped texts, one list per column; both lists are appended to together
# so entries at the same position belong to the same row.
results_fixed = {"Response": [], "Output": []}
# Row indices where at least one of the two columns is not valid JSON.
error_indices = []

In [None]:
# Spot-check the model output for row label 10 after whitespace cleanup.
print(results.loc[10, "Output"])

In [None]:
import json

# Parse both columns of each row as JSON. Rows where both columns parse are
# appended to `results_fixed`; every other row index goes to `error_indices`.
# NOTE(review): only row labels 0..199 are processed — confirm this matches the dataset size.
for idx in range(0, 200):
    parsed = {}
    for column in ["Response", "Output"]:
        raw = results.loc[idx, column]
        try:
            parsed[column] = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (malformed JSON);
            # TypeError covers cells that are not strings.
            # Report which column and row failed, plus the offending text.
            print(column)
            print(idx)
            print(raw)

    if len(parsed) == 2:
        # Reuse the objects parsed above instead of parsing each cell a second time.
        results_fixed["Response"].append(parsed["Response"])
        results_fixed["Output"].append(parsed["Output"])
    else:
        error_indices.append(idx)

In [None]:
# Number of rows that parsed as JSON, printed for Response and then Output.
for column in ("Response", "Output"):
    print(len(results_fixed[column]))

In [None]:
# Row indices where "Response" or "Output" failed to parse as JSON.
error_indices

In [None]:
def remove_keys(d):
    """
    Drop the first-level keys of d and return its values as one string:
    the repr of each value, in dict order, separated by ", ".
    """
    return ", ".join(repr(value) for value in d.values())

In [None]:
# Replace each parsed JSON object with a flat string: the key-less values of
# every test case, with test cases separated by "; ".
# The loop variable is `column` rather than `type`, so the builtin is not shadowed.
for column in ["Response", "Output"]:
    for idx, tcs in enumerate(results_fixed[column]):
        testcases = tcs["testcases"]
        # A lone test case may be stored as a bare object rather than a list.
        if not isinstance(testcases, list):
            testcases = [testcases]
        results_fixed[column][idx] = "; ".join(remove_keys(tc) for tc in testcases)

## For the errored indices, use string replacement

In [None]:
# JSON key tokens (each including its trailing colon) that are deleted textually
# from rows that could not be parsed.
common_keys = [
    '"testcases":',
    '"name":',
    '"description":',
    '"input":',
    '"expected":',
]

In [None]:
def remove_vals(s, vals):
    """Return `s` with every occurrence of each substring in `vals` deleted.

    Substrings are removed one after another, in the order given by `vals`.
    """
    stripped = s
    for needle in vals:
        stripped = stripped.replace(needle, "")
    return stripped

In [None]:
# Rows that were not valid JSON: delete the known key tokens textually and
# append the result to each column's list.
for column in ("Response", "Output"):
    results_fixed[column].extend(
        [remove_vals(results.loc[idx, column], common_keys) for idx in error_indices]
    )

In [None]:
# Entry counts per column, printed for Response and then Output.
for column in ("Response", "Output"):
    print(len(results_fixed[column]))

In [None]:
# Spot-check the entry at position 63, reference first and model output second.
for column in ("Response", "Output"):
    print(results_fixed[column][63])

In [None]:
# Spot-check the last entry of each list, reference first and model output second.
for column in ("Response", "Output"):
    print(results_fixed[column][-1])

In [None]:
# !pip install evaluate
# !pip install absl-py
# !pip install nltk
# !pip install rouge-score
# !pip install transformers
# !pip install bert-score
# !pip install --upgrade huggingface_hub
# !pip install accelerate==0.31.0

In [None]:
# Results of each metric's `compute` call, keyed by metric name.
scores: dict = {}

In [None]:
import evaluate

rouge = evaluate.load("rouge")

# Score the key-stripped model outputs against the key-stripped references.
scores["rouge"] = rouge.compute(predictions=results_fixed["Output"], references=results_fixed["Response"])

# Single quotes inside the braces: reusing the outer double quote is a
# SyntaxError on Python versions before 3.12.
print(f"Finetuned scores: {scores['rouge']}")

In [None]:
import evaluate

bleu = evaluate.load("bleu")

# Score the key-stripped model outputs against the key-stripped references.
# NOTE(review): the key is spelled "blue" (not "bleu"); kept as-is because it
# ends up in the saved scores JSON.
scores["blue"] = bleu.compute(predictions=results_fixed["Output"], references=results_fixed["Response"])

# Single quotes inside the braces: reusing the outer double quote is a
# SyntaxError on Python versions before 3.12.
print(f"Finetuned scores: {scores['blue']}")

In [None]:
import evaluate

bert = evaluate.load("bertscore")

# Score the key-stripped model outputs against the key-stripped references.
scores["bert"] = bert.compute(predictions=results_fixed["Output"], references=results_fixed["Response"], lang="en")

# Single quotes inside the braces: reusing the outer double quote is a
# SyntaxError on Python versions before 3.12.
print(f"Finetuned scores: {scores['bert']}")

In [None]:
# Show which entries the BERTScore result contains.
scores["bert"].keys()

In [None]:
import numpy as np

# Print the mean of each BERTScore component, in the order precision, recall, F1.
for metric in ("precision", "recall", "f1"):
    print(np.mean(scores["bert"][metric]))

In [None]:
# Replace each BERTScore component with its mean so a single number per
# component is what gets saved.
scores["bert"].update(
    {metric: np.mean(scores["bert"][metric]) for metric in ("precision", "recall", "f1")}
)

In [None]:
import json

# Write all collected metric results to a JSON file named after the epoch count.
scores_path = f"mistral_scores_final_fixed_finetuned_{NUM_EPOCHS}epochs.json"
with open(scores_path, "w") as scores_file:
    json.dump(scores, scores_file)