# Evaluation

In [None]:
import json
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from sklearn import metrics
from rouge import Rouge

import matplotlib.pyplot as plt

## Summary Length

In [None]:
### Functions ####
##################

def getOriginalSummary(uri):
    """Return the ``Summary`` field of the ticket JSON file at ``uri``.

    Args:
        uri: Path of a JSON file whose top-level value is a ticket object.

    Returns:
        The value stored under ``"Summary"``, or ``""`` when that value is
        ``None`` or the key is absent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(uri, encoding="utf-8") as f:
        ticket = json.load(f)

    # Named ``summary`` (not ``sum``) so the builtin is not shadowed.
    summary = ticket.get('Summary')
    return "" if summary is None else summary

def computeROUGE(reference, predicted):
    """Average the ROUGE-1, ROUGE-2 and ROUGE-L scores over summary pairs.

    Args:
        reference: Iterable of reference summaries.
        predicted: Iterable of generated summaries, aligned with
            ``reference``. Pairing uses ``zip``, so surplus items of the
            longer input are ignored.

    Returns:
        A dict keyed by ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"``; each
        value is a dict with the mean ``"f"``, ``"p"`` and ``"r"`` score.
        All values are 0 when there are no pairs to score.
    """
    metric_names = ("rouge-1", "rouge-2", "rouge-l")
    stat_names = ("f", "p", "r")
    total_scores = {name: {stat: 0 for stat in stat_names} for name in metric_names}

    pairs = list(zip(reference, predicted))
    if not pairs:
        # Nothing to average; returning zeros avoids dividing by zero below.
        return total_scores

    rouge = Rouge()
    for old, new in pairs:
        # get_scores takes the hypothesis first and the reference second and
        # returns a one-element list for a single pair.
        score = rouge.get_scores(new, old)[0]
        for name in metric_names:
            for stat in stat_names:
                total_scores[name][stat] += score[name][stat]

    for name in metric_names:
        for stat in stat_names:
            total_scores[name][stat] /= len(pairs)

    return total_scores

def createEvaluationCSV(directory, min_length=39, max_length=70):
    """Collect all JSON result files of ``directory`` into one CSV table.

    Every ``*.json`` file in ``directory`` becomes one row. The table is
    written to ``evaluatedSummarys.csv`` inside ``directory``.

    Args:
        directory: Folder containing the result files.
        min_length: Smallest length (inclusive) of ``summary_new`` that is
            reported as in range.
        max_length: Largest length (inclusive) of ``summary_new`` that is
            reported as in range.

    Raises:
        KeyError: If a result file lacks one of the keys read below.
    """
    columns = [
        "jira", "ticketId", "evolution", "reruns", "ticket_uri", "output_uri",
        "violation_actual", "violation_predicted",
        "summary_original", "length_original",
        "summary_old", "length_old",
        "summary_new", "length_new",
        "correction_in_range",
    ]
    rows = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the CSV reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue

        outputUri = os.path.join(directory, filename)
        with open(outputUri, encoding="utf-8") as f:
            result = json.load(f)

        input_data = result["input_data"]
        output = result["output"]
        summary_org = getOriginalSummary(input_data["ticket_uri"])
        summary_new = output["summary_new"]

        # Stored as the strings "TRUE"/"FALSE", exactly as before, so the
        # CSV contents stay the same.
        in_range = "TRUE" if min_length <= len(summary_new) <= max_length else "FALSE"

        rows.append({
            'jira': input_data["jira"],
            'ticketId': input_data["id"],
            'evolution': input_data["evolution"],
            'reruns': result["reruns"],
            'ticket_uri': input_data["ticket_uri"],
            'output_uri': outputUri,
            'violation_actual': result["violation_actual"],
            'violation_predicted': output["violation_predicted"],
            'summary_original': summary_org,
            'length_original': len(summary_org),
            'summary_old': output["summary_old"],
            'length_old': len(output["summary_old"]),
            'summary_new': summary_new,
            'length_new': len(summary_new),
            'correction_in_range': in_range,
        })

    # Build the frame once instead of growing it row by row.
    evalDF = pd.DataFrame(rows, columns=columns)

    # Save data to csv
    evalDF.to_csv(os.path.join(directory, "evaluatedSummarys.csv"), index=False)


### 0-Shot

In [None]:
# Folder with the 0-shot result files. The trailing slash matters: file
# paths are built from it by plain string concatenation.
directory = "./evaluation/summary/gpt-4-0125-preview/0Shot/"

#### Create csv

In [None]:
# Aggregate every *.json result in `directory` into evaluatedSummarys.csv,
# which is written into that same directory.
createEvaluationCSV(directory)

#### Detection

In [None]:
# Load the table written by createEvaluationCSV and preview its first rows.
evalDF = pd.read_csv(directory + "evaluatedSummarys.csv")
evalDF.head(3)

In [None]:
# Detection is scored only on rows whose `reruns` value is 0.
evalDF_runs = evalDF[evalDF["reruns"] == 0]

# Take the label columns directly: this avoids copying the array once per
# row with np.append and keeps the labels in their own dtype instead of
# coercing them through an empty float array.
actual = evalDF_runs["violation_actual"].to_numpy()
predicted = evalDF_runs["violation_predicted"].to_numpy()

In [None]:
# Detection quality, treating True (a violation) as the positive class.
accuracy = metrics.accuracy_score(actual, predicted)
precision = metrics.precision_score(actual, predicted, pos_label=True)
recall = metrics.recall_score(actual, predicted, pos_label=True)
# beta=0.5 weights precision more heavily than recall.
f05 = metrics.fbeta_score(actual, predicted, beta=0.5, pos_label=True)

# labels=[True, False] puts the positive class in the first row and column.
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[True, False])
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ["Smell", "No Smell"])

cm_display.plot()
plt.show()

## Top left     = TP (violation is detected and is present)
## Bottom right = TN (violation is not detected and is not present)
## Top right    = FN (violation is not detected although one is present)
## Bottom left  = FP (violation is detected although none is present)

In [None]:
# Report the detection metrics computed in the previous cell.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F.5: {f05}")

#### Correction

In [None]:
# Count once and reuse. .get() keeps the cell from raising KeyError when
# every correction (or none of them) is inside the target length range.
in_range_counts = evalDF_runs["correction_in_range"].value_counts()
allTrue = in_range_counts.get(True, 0)
allFalse = in_range_counts.get(False, 0)

# NOTE(review): the rate covers all rows of evalDF_runs, not only tickets
# with an actual violation - confirm that this is intended.
successRate = evalDF_runs["correction_in_range"].value_counts(normalize=True).get(True, 0.0)

print("# True: " + str(allTrue))
print("# False: " + str(allFalse))
print("Success rate: " + str(successRate))

In [None]:
# Score only the rows whose actual label is True.
# NOTE(review): this selects from evalDF (all rows), not from evalDF_runs -
# confirm that rerun rows are meant to be included.
violating = evalDF[evalDF["violation_actual"].eq(True)]

# astype(str) hands ROUGE plain strings, matching what the former np.append
# loop produced. NOTE(review): a value that read_csv parsed as missing is
# therefore scored as the text "nan" - confirm or filter such rows.
reference = violating["summary_original"].astype(str).to_numpy()
new_summary = violating["summary_new"].astype(str).to_numpy()

print("Average ROUGE scores:")
total_scores = computeROUGE(reference, new_summary)
print(total_scores)

### Few-Shot

In [None]:
# Folder with the few-shot result files. The trailing slash matters: file
# paths are built from it by plain string concatenation.
directory = "./evaluation/summary/gpt-4-0125-preview/FewShot/"


#### Create csv

In [None]:
# Aggregate every *.json result in `directory` into evaluatedSummarys.csv,
# which is written into that same directory.
createEvaluationCSV(directory)

#### Detection

In [None]:
# Load the table written by createEvaluationCSV for this directory.
evalDF = pd.read_csv(directory + "evaluatedSummarys.csv")

In [None]:
# Detection is scored only on rows whose `reruns` value is at most 0.
evalDF_runs = evalDF[evalDF["reruns"] <= 0]

# Take the label columns directly: this avoids copying the array once per
# row with np.append and keeps the labels in their own dtype instead of
# coercing them through an empty float array.
actual = evalDF_runs["violation_actual"].to_numpy()
predicted = evalDF_runs["violation_predicted"].to_numpy()

In [None]:
# Detection quality, treating True (a violation) as the positive class.
accuracy = metrics.accuracy_score(actual, predicted)
precision = metrics.precision_score(actual, predicted, pos_label=True)
recall = metrics.recall_score(actual, predicted, pos_label=True)
# beta=0.5 weights precision more heavily than recall.
f05 = metrics.fbeta_score(actual, predicted, beta=0.5, pos_label=True)

# labels=[True, False] puts the positive class in the first row and column:
# top left = TP, top right = FN, bottom left = FP, bottom right = TN.
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[True, False])
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ["Smell", "No Smell"])

cm_display.plot()
plt.show()

In [None]:
# Report the detection metrics computed in the previous cell.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F.5: {f05}")

#### Correction

In [None]:
# Count once and reuse. .get() keeps the cell from raising KeyError when
# every correction (or none of them) is inside the target length range.
in_range_counts = evalDF_runs["correction_in_range"].value_counts()
allTrue = in_range_counts.get(True, 0)
allFalse = in_range_counts.get(False, 0)

# NOTE(review): the rate covers all rows of evalDF_runs, not only tickets
# with an actual violation - confirm that this is intended.
successRate = evalDF_runs["correction_in_range"].value_counts(normalize=True).get(True, 0.0)

print("# True: " + str(allTrue))
print("# False: " + str(allFalse))
print("Success rate: " + str(successRate))

In [None]:
# Score only the rows whose actual label is True.
# NOTE(review): this selects from evalDF (all rows), not from evalDF_runs -
# confirm that rerun rows are meant to be included.
violating = evalDF[evalDF["violation_actual"].eq(True)]

# astype(str) hands ROUGE plain strings, matching what the former np.append
# loop produced. NOTE(review): a value that read_csv parsed as missing is
# therefore scored as the text "nan" - confirm or filter such rows.
reference = violating["summary_original"].astype(str).to_numpy()
new_summary = violating["summary_new"].astype(str).to_numpy()

print("Average ROUGE scores (Sum):")
total_scores = computeROUGE(reference, new_summary)
print(total_scores)

### 0-Shot CoT

In [None]:
# Folder with the 0-shot chain-of-thought result files. The trailing slash
# matters: file paths are built from it by plain string concatenation.
directory = "./evaluation/summary/gpt-4-0125-preview/0ShotCoT/"

#### Create csv

In [None]:
# Aggregate every *.json result in `directory` into evaluatedSummarys.csv,
# which is written into that same directory.
createEvaluationCSV(directory)

#### Detection

In [None]:
# Load the table written by createEvaluationCSV for this directory.
evalDF = pd.read_csv(directory + "evaluatedSummarys.csv")

In [None]:
# Detection is scored only on rows whose `reruns` value is at most 0.
evalDF_runs = evalDF[evalDF["reruns"] <= 0]

# Take the label columns directly: this avoids copying the array once per
# row with np.append and keeps the labels in their own dtype instead of
# coercing them through an empty float array.
actual = evalDF_runs["violation_actual"].to_numpy()
predicted = evalDF_runs["violation_predicted"].to_numpy()

In [None]:
# Detection quality, treating True (a violation) as the positive class.
accuracy = metrics.accuracy_score(actual, predicted)
precision = metrics.precision_score(actual, predicted, pos_label=True)
recall = metrics.recall_score(actual, predicted, pos_label=True)
# beta=0.5 weights precision more heavily than recall.
f05 = metrics.fbeta_score(actual, predicted, beta=0.5, pos_label=True)

# labels=[True, False] puts the positive class in the first row and column:
# top left = TP, top right = FN, bottom left = FP, bottom right = TN.
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[True, False])
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ["Smell", "No Smell"])

cm_display.plot()
plt.show()

In [None]:
# Report the detection metrics computed in the previous cell.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F.5: {f05}")

#### Correction

In [None]:
# Count once and reuse. .get() keeps the cell from raising KeyError when
# every correction (or none of them) is inside the target length range.
in_range_counts = evalDF_runs["correction_in_range"].value_counts()
allTrue = in_range_counts.get(True, 0)
allFalse = in_range_counts.get(False, 0)

# NOTE(review): the rate covers all rows of evalDF_runs, not only tickets
# with an actual violation - confirm that this is intended.
successRate = evalDF_runs["correction_in_range"].value_counts(normalize=True).get(True, 0.0)

print("# True: " + str(allTrue))
print("# False: " + str(allFalse))
print("Success rate: " + str(successRate))

In [None]:
# Score only the rows whose actual label is True.
# NOTE(review): this selects from evalDF (all rows), not from evalDF_runs -
# confirm that rerun rows are meant to be included.
violating = evalDF[evalDF["violation_actual"].eq(True)]

# NOTE(review): the reference here is `summary_old`, whereas the 0-Shot and
# Few-Shot sections of this notebook use `summary_original` - confirm which
# column is intended before comparing scores across sections.
# astype(str) hands ROUGE plain strings, matching what the former np.append
# loop produced; a value that read_csv parsed as missing becomes "nan".
reference = violating["summary_old"].astype(str).to_numpy()
new_summary = violating["summary_new"].astype(str).to_numpy()

print("Average ROUGE scores (Sum):")
total_scores = computeROUGE(reference, new_summary)
print(total_scores)

# Update

In [None]:
# Folder with the result files of the update experiment. The trailing slash
# matters: file paths are built from it by plain string concatenation.
directory = "./evaluation/update/gpt-4-0125-preview/"

In [None]:
# Collect every *.json result of the update experiment into one table.
columns = ["jira", "ticketId", "evolution", "reruns", "ticket_uri", "output_uri", "violation_actual", "violation_predicted", "change_actual", "change_predicted", "success"]
rows = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the table and of the CSV reproducible between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".json"):
        continue

    outputUri = os.path.join(directory, filename)
    with open(outputUri, encoding="utf-8") as f:
        result = json.load(f)

    rows.append({
        'jira': result["input_data"]["jira"],
        'ticketId': result["input_data"]["id"],
        'evolution': result["input_data"]["evolution"],
        'reruns': result["reruns"],
        'ticket_uri': result["input_data"]["ticket_uri"],
        'output_uri': outputUri,
        'violation_actual': result["violation_actual"],
        'violation_predicted': result["output"]["violation_predicted"],
        'change_actual': result["reason"],
        'change_predicted': result["output"]["fields"],
        # Left empty here. NOTE(review): presumably filled in by hand before
        # the labeled CSV is read in the Correction cell - confirm.
        'success': None,
    })

# Build the frame once instead of growing it row by row.
evalDF = pd.DataFrame(rows, columns=columns)

# Save data to csv
evalDF.to_csv(os.path.join(directory, "updateSummarys.csv"), index=False)

#### Detection

In [None]:
# Detection is scored only on rows whose `reruns` value is at most 1.
evalDF_runs = evalDF[evalDF["reruns"] <= 1]

# Take the label columns directly: this avoids copying the array once per
# row with np.append inside an iterrows loop.
actual = evalDF_runs["violation_actual"].to_numpy()
predicted = evalDF_runs["violation_predicted"].to_numpy()

In [None]:
# Detection quality. In this section the labels are compared as the strings
# "TRUE"/"FALSE", with "TRUE" as the positive class.
accuracy = metrics.accuracy_score(actual, predicted)
precision = metrics.precision_score(actual, predicted, pos_label="TRUE")
recall = metrics.recall_score(actual, predicted, pos_label="TRUE")
# beta=0.5 weights precision more heavily than recall.
f05 = metrics.fbeta_score(actual, predicted, beta=0.5, pos_label="TRUE")

# labels=["TRUE", "FALSE"] puts the positive class in the first row and column.
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=["TRUE", "FALSE"])
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ["TRUE", "FALSE"])

cm_display.plot()
plt.show()

In [None]:
# Report the detection metrics computed in the previous cell.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F05: {f05}")

#### Correction

In [None]:
# Load the labeled table. NOTE(review): presumably a hand-labeled copy of
# updateSummarys.csv with the `success` column filled in - confirm.
evalDF = pd.read_csv(directory + "updateSummarys_labeled.csv")

In [None]:
# Share of rows labeled True. .get() yields 0.0 instead of raising KeyError
# when no row carries that label.
successRate = evalDF["success"].value_counts(normalize=True).get(True, 0.0)
print("Success rate: " + str(successRate))