In [1]:
from roofline_survey_utils import *

from sklearn.metrics import confusion_matrix, classification_report
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder
import re

import glob

Autogen version: 0.4.9.1


In [2]:
# let's read in the CSV file

#resultsCSV = 
#
#df = pd.read_csv(resultsCSV, quotechar='\"')
#
## let's clean the responses
#
#df['isBB'] = df['answer'].apply(lambda x: True if x == 'Bandwidth' else False)
#
#df['cleanResponse'] = df['llmResponse'].apply(cleanup_responses)
#
#df['isLLMCorrect'] = df['answer'] == df['cleanResponse']
#
## add some columns for ease-of-calculations
#df['actual'] = df['answer'].apply(lambda x: 1 if x == 'Bandwidth' else 2)
#df['predicted'] = df['cleanResponse'].apply(lambda x: 1 if x == 'Bandwidth' else 2)

In [3]:
#print(df.dtypes)

In [4]:

def cleanup_responses(x):
    #print('input:', x)
    if not (str(x) == '<NA>'):
        matches = re.finditer(r'([bB]andwidth|[cC]ompute)', x, re.MULTILINE)
        matches = [m for m in matches]
        if len(matches) > 1:
            # just take the last match
            print('\tMore than 1 match, taking last one!')
            matches = [matches[-1]]
        else:
            assert len(matches) == 1
        for match in matches:
            m = match.group()
            return m.title()

    print(f'returning NA for [{x}]')
    assert False, "this should never be reached!"
    return 'NA'

In [5]:
def calc_metrics_of_df(df):
    accuracy = accuracy_score(df['actual'], df['predicted'])
    f1 = f1_score(df['actual'], df['predicted'], average='macro')
    mcc = matthews_corrcoef(df['actual'], df['predicted'])
    return (accuracy, f1, mcc)

In [6]:
def read_results_csv(csvFile):
    useCOT = False
    hasLogProbs = False
    if 'COT-' in csvFile:
        useCOT = True
    else:
        assert 'simple-' in csvFile, "Don't recognize input CSV file"

    if '-withLogProbs-' in csvFile:
        hasLogProbs = True

    regex = r"(?<=-inference-results-).*(?=.csv)"

    matches = re.finditer(regex, csvFile, re.MULTILINE)

    matches = [match for match in matches]
    assert len(matches) == 1

    modelName =  matches[0].group()
    #modelName = csvFile[len(f"{'COT' if useCOT else 'simple'}-{'withLogProbs' if hasLogProbs else ''}-inference-results-"):-4]

    df = pd.read_csv(csvFile, quotechar='\"')
    
    # let's just drop the failed cases for now
    df = df.dropna(subset=['llmResponse'])

    # do some response cleanup for returned strings that have more than 1 token
    df['llmResponse'] = df['llmResponse'].apply(cleanup_responses)
    # check if the LLM produced the correct answer
    df['isLLMCorrect'] = df.apply(lambda x: x['answer'] == x['llmResponse'], axis=1)
    
    return (df, modelName, useCOT, hasLogProbs)

In [None]:

def calculate_metrics(csvFiles):

    dfStats = pd.DataFrame()

    # each CSV file will get it's own row in the stats table
    for csvName in csvFiles:
        print('Calculating metrics for ', csvName)
        csvDF, modelName, useCOT, hasLogProbs= read_results_csv(csvName)
        print('Read CSV complete for model:', modelName)

        summDict = {}
        summDict['Model Name'] = [modelName]
        summDict['Uses COT'] = [useCOT]
        summDict['Has Log Probs?'] = [hasLogProbs]
        summDict['Number of Samples'] = csvDF.shape[0]


        # add some columns for ease-of-calculations
        csvDF['actual'] = csvDF['answer'].apply(lambda x: 1 if x == 'Bandwidth' else 2)
        csvDF['predicted'] = csvDF['llmResponse'].apply(lambda x: 1 if x == 'Bandwidth' else 2)

        exampleCounts = list(csvDF['numExamples'].unique())

        for numExamples in exampleCounts:
            # get the samples that have this example count
            subDF = csvDF[csvDF['numExamples'] == numExamples].reset_index(drop=True)
            subMetrics = calc_metrics_of_df(subDF)
            summDict[f'{numExamples}-shot (ACC, F1, MCC)'] = f"({round(100.0*subMetrics[0],2)}, {round(100.0*subMetrics[1],2)}, {round(100.0*subMetrics[2],2)})"


        jointMetrics = calc_metrics_of_df(csvDF)

        summDict['Joint Acc'] = round(100.0*jointMetrics[0],2)
        summDict['Joint F1'] = round(100.0*jointMetrics[1],2)
        summDict['Joint MCC'] = round(100.0*jointMetrics[2],2)

        summDF = pd.DataFrame.from_dict(summDict)
        dfStats = pd.concat([dfStats, summDF], ignore_index=True)


    return dfStats



In [8]:

csvFiles = list(glob.glob('./*.csv'))

stats = calculate_metrics(csvFiles)

print(stats)

Calculating metrics for  ./simple-withLogProbs-inference-results-gpt-4o-mini.csv
Read CSV complete for model: gpt-4o-mini
Calculating metrics for  ./COT-withLogProbs-inference-results-gpt-4o-mini.csv
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, taking last one!
	More than 1 match, tak