# Figures and Tables

### Functions

In [None]:
import os
from PIL import Image, ImageDraw, ImageFont

import os
from PIL import Image, ImageDraw, ImageFont
import math

def add_subplot_tags_and_stack(directory, prefix, num_columns=1, give_list_instead=None, output_dir=None):
    """Tag PNG panels with (a), (b), ... and tile them into one composite image.

    The composite is saved as ``<prefix>__stacked_image.png`` and
    ``<prefix>__stacked_image.pdf`` inside ``output_dir`` and is then opened
    in the default image viewer.

    Args:
        directory: Folder scanned for panels when ``give_list_instead`` is
            not supplied.
        prefix: Panels are the ``.png`` files in ``directory`` whose names
            start with this prefix (used in sorted order). It also prefixes
            the output file names.
        num_columns: Number of panels per row of the grid.
        give_list_instead: Optional explicit list of image paths. When truthy
            it replaces the directory scan and its order is kept as given.
        output_dir: Folder the composite is written to. Defaults to the
            "Final Figures" folder that used to be hard-coded here.

    Raises:
        ValueError: If ``num_columns`` is smaller than 1 or no image is found.
    """
    if num_columns < 1:
        raise ValueError(f"num_columns must be >= 1, got {num_columns}")
    if output_dir is None:
        # Raw string: the previous literal mixed "\\" with bare "\D"/"\S",
        # which are invalid escape sequences. The resulting path is unchanged.
        output_dir = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Final Figures"

    if give_list_instead:
        image_paths = list(give_list_instead)
    else:
        image_paths = sorted(
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.startswith(prefix) and name.endswith('.png')
        )
    if not image_paths:
        raise ValueError(f"No images found for prefix {prefix!r} in {directory!r}")

    # Font for the subplot label; fall back to PIL's built-in font if Arial is unavailable.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except IOError:
        font = ImageFont.load_default()

    box_size = 25  # Side length of the white box drawn in the top-left corner
    tagged_images = []
    for position, path in enumerate(image_paths):
        # Copy the pixels so the file handle can be closed right away.
        with Image.open(path) as source:
            image = source.copy()
        draw = ImageDraw.Draw(image)
        draw.rectangle([(0, 0), (box_size, box_size)], outline="black", fill="white")
        # NOTE(review): the text is anchored at (25, 25), i.e. at the box's
        # lower-right corner rather than inside it. Position kept unchanged so
        # existing figures look the same -- confirm this is intended.
        draw.text((25, 25), f"({chr(ord('a') + position)})", font=font, fill="black")
        tagged_images.append(image)

    # Every grid cell has the size of the largest panel.
    rows = math.ceil(len(tagged_images) / num_columns)
    max_width = max(img.width for img in tagged_images)
    max_height = max(img.height for img in tagged_images)
    stacked_image = Image.new('RGB', (num_columns * max_width, rows * max_height), 'white')

    # Place each panel at the origin of its cell. Advancing by the panel's own
    # width/height (as before) misaligned the grid when panel sizes differed.
    for position, img in enumerate(tagged_images):
        row, column = divmod(position, num_columns)
        stacked_image.paste(img, (column * max_width, row * max_height))

    # 'bbox_inches' is a matplotlib savefig option that PIL does not use, so it is no longer passed.
    stacked_image.save(os.path.join(output_dir, f"{prefix}__stacked_image.png"))
    stacked_image.save(os.path.join(output_dir, f"{prefix}__stacked_image.pdf"))
    stacked_image.show()
    
# Example usage
#directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
#prefix = 'E1_Performance_subcategory-'
#num_columns=2
#add_subplot_tags_and_stack(directory, prefix,num_columns)


### E0

##### E0: Function

In [None]:
# Supplementary Table S4
# Result workbooks for the three ways of calling the model (open call,
# function call, LangChain), as indicated by the file names.
E0P0_prompteng_functionvsopen_list=[r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-opencall.xlsx",
                                    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-functioncall.xlsx",
                                    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-Langchain.xlsx",
                                    ]
#analyze_model_accuracy(prompteng_functioncall_raw)
# One accuracy column per workbook, merged on model name and transposed.
# NOTE: analyze_and_merge_multiple_files is defined in a later cell, so that
# cell has to be executed before this one.
analyze_and_merge_multiple_files(E0P0_prompteng_functionvsopen_list).T

In [None]:
# function
import matplotlib.pyplot as plt

# Accuracy (%) of each API-calling approach, one list per model
functions = ['OpenAI Open Call', 'OpenAI Function Call', 'Lang Chain']
gpt3_api_accuracy = [40.00, 43.33, 40.00]
gpt4_api_accuracy = [55.00, 68.33, 66.67]

bar_width = 0.4  # Thickness of each horizontal bar

fig, ax = plt.subplots(figsize=(10, 6))

# Dashed grid drawn beneath the bars (the bars use a higher zorder)
ax.grid(True, linestyle='--', zorder=0)

# Two bars per approach: GPT-3 at the base position, GPT-4 shifted by one bar width
base_positions = list(range(len(functions)))
ax.barh(base_positions, gpt3_api_accuracy,
        height=bar_width, color='blue', label='GPT3-API', zorder=2)
ax.barh([pos + bar_width for pos in base_positions], gpt4_api_accuracy,
        height=bar_width, color='red', label='GPT4-API', zorder=2)

ax.set_xlabel('Accuracy (%)', fontsize=14)
ax.set_ylabel('Function', fontsize=14)
ax.set_title('Accuracy of Different Functions', fontsize=16)
# Centre every tick label between the two bars of its group
ax.set_yticks([pos + bar_width / 2 for pos in base_positions])
ax.set_yticklabels(functions, fontsize=12)
plt.xticks(fontsize=12)
ax.legend(fontsize=12)

fig.tight_layout()
plt.show()


##### E0: Prompt Eng

In [None]:
import pandas as pd
import re
from scipy import stats
import numpy as np
from scipy.stats import wilcoxon
from sklearn.utils import resample

def bootstrap_mean_diff_ci(data, reference, num_bootstrap=10000, alpha=0.05, random_state=None):
    """Bootstrap a percentile confidence interval for mean(data) - mean(reference).

    Both samples are resampled with replacement and independently of each
    other; every resample contains ``len(data)`` values (also the one drawn
    from ``reference``, as in the original implementation).

    Args:
        data: 1-D sequence of numbers.
        reference: 1-D sequence of numbers to compare against.
        num_bootstrap: Number of bootstrap replicates.
        alpha: Significance level; the interval covers ``1 - alpha``.
        random_state: Optional seed (or NumPy ``Generator``) for reproducible
            intervals. ``None`` draws fresh entropy on every call.

    Returns:
        NumPy array ``[lower, upper]`` rounded to two decimals.

    Raises:
        ValueError: If either sample is empty.
    """
    data = np.asarray(data, dtype=float)
    reference = np.asarray(reference, dtype=float)
    if data.size == 0 or reference.size == 0:
        raise ValueError("data and reference must both contain at least one value")

    rng = np.random.default_rng(random_state)
    n = data.size
    # Draw all resampling indices at once instead of looping num_bootstrap
    # times in Python; one row per bootstrap replicate.
    data_idx = rng.integers(0, data.size, size=(num_bootstrap, n))
    reference_idx = rng.integers(0, reference.size, size=(num_bootstrap, n))
    boot_diffs = data[data_idx].mean(axis=1) - reference[reference_idx].mean(axis=1)

    lower_bound = np.percentile(boot_diffs, (alpha / 2) * 100)
    upper_bound = np.percentile(boot_diffs, (1 - alpha / 2) * 100)
    return np.round((lower_bound, upper_bound), 2)


# def average_performance_columns_with_stats(df_acc, experimentname=''):
#     df = df_acc.copy()
#     # Regular expression to extract accuracy and correct answers
#     accuracy_pattern = re.compile(r'(\d+\.?\d*)% \((\d+)-of-(\d+); Error: (\d+)\)')
    
#     # Determine performance columns by excluding 'Model Name'
#     performance_columns = [col for col in df.columns if col != 'Model Name']
    
#     # Initialize new columns for averages, standard deviations, and confidence intervals

#     raw_percents = []
#     # Iterate over the DataFrame to calculate statistics
#     for index, row in df.iterrows():
#         percents = []
#         corrects = []
#         totalanswered_woERRORs = []
#         Errors = []
        
#         # Process each performance column
#         for col in performance_columns:
#             match = accuracy_pattern.search(str(row[col]))
#             if match:
#                 percent, correct, totalanswered_woERROR, Error = match.groups()
#                 percents.append(float(percent))
#                 corrects.append(float(correct))
#                 totalanswered_woERRORs.append(float(totalanswered_woERROR))
#                 Errors.append(float(Error))
        
#         # Calculate averages, standard deviations, and confidence intervals
#         avg_percent = np.round(np.mean(percents), 2)
#         std_dev_percent = np.round(np.std(percents, ddof=1), 2)
#         n = len(percents)
#         if std_dev_percent != 0:
#             ci_percent = stats.norm.interval(0.95, loc=avg_percent, scale=std_dev_percent/np.sqrt(n))
#             lci = np.round(ci_percent[0], 2)
#             uci = np.round(ci_percent[1], 2)
#         else:
#             lci = avg_percent
#             uci = avg_percent
        
#         # Calculate 95% CI of mean difference and p-value
#         ref_percent = percents[0]  # Assuming the first entry is the reference
#         mean_diff = avg_percent - ref_percent
#         se_diff = std_dev_percent / np.sqrt(n)
#         t_stat = mean_diff / se_diff
#         p_value = 2 * (1 - stats.t.cdf(np.abs(t_stat), df=n-1))
        
#         mean_diff_ci = stats.t.interval(0.95, df=n-1, loc=mean_diff, scale=se_diff)
#         mean_diff_ci = np.round(mean_diff_ci, 2)
        
#         # average answered without ERROR and correct answers X-of-Y
#         avg_correct = np.round(np.mean(corrects), 1)
        
#         totalanswered_woERRORs = [float(x) for x in totalanswered_woERRORs]
#         avg_answerdwoerror = np.round(np.mean(totalanswered_woERRORs), 1)
#         avg_Error = np.round(np.mean(Errors), 1)
        
#         # Calculate range of percents
#         minpercent = min(percents)
#         maxpercent = max(percents)
        
#         # Update the DataFrame with the calculated statistics
#         df.at[index, f'{experimentname}_SummaryAcc'] = (
#             f"{avg_percent}±{std_dev_percent} [95CI: {lci}, {uci}] "
#             f"[Range: {minpercent}, {maxpercent}] ({avg_correct}-of-{avg_answerdwoerror}; Error: {avg_Error}) "
#             f"Mean Diff CI: {mean_diff_ci}; p-value={p_value:.3f}   {percents}" 
#         )
#     df = df.drop(columns=performance_columns)
#     return df

def average_performance_columns_with_stats(df_acc, experimentname='', reference_percents=None):
    """Collapse per-run accuracy columns into one summary column per model.

    Every column other than 'Model Name' is expected to hold strings of the
    form ``"<pct>% (<correct>-of-<answered>; Error: <errors>)"``. For each
    row the parsed values are summarised as mean, sample standard deviation,
    normal-approximation 95% CI, range, mean correct/answered/error counts,
    mean difference to ``reference_percents`` with a bootstrap CI, and a
    Wilcoxon signed-rank p-value.

    Args:
        df_acc: DataFrame with a 'Model Name' column plus one accuracy-string
            column per run. It is not modified.
        experimentname: Prefix of the generated ``<experimentname>_SummaryAcc``
            column; also used in error messages.
        reference_percents: Baseline accuracies to compare against. Defaults
            to ``[43.33, 45.0, 43.33]``, the values previously hard-coded.

    Returns:
        Copy of ``df_acc`` in which the run columns are replaced by the
        single summary column.

    Raises:
        ValueError: If a row contains no parseable accuracy string.
    """
    if reference_percents is None:
        reference_percents = [43.33, 45.0, 43.33]

    df = df_acc.copy()
    # Captures: accuracy percent, correct count, answered count, error count
    accuracy_pattern = re.compile(r'(\d+\.?\d*)% \((\d+)-of-(\d+); Error: (\d+)\)')

    # Every column except 'Model Name' is treated as one run's result
    performance_columns = [col for col in df.columns if col != 'Model Name']

    for index, row in df.iterrows():
        percents = []
        corrects = []
        totalanswered_woERRORs = []
        Errors = []

        # Cells that do not match the pattern are skipped
        for col in performance_columns:
            match = accuracy_pattern.search(str(row[col]))
            if match:
                percent, correct, totalanswered_woERROR, Error = match.groups()
                percents.append(float(percent))
                corrects.append(float(correct))
                totalanswered_woERRORs.append(float(totalanswered_woERROR))
                Errors.append(float(Error))

        if not percents:
            # Previously surfaced as "min() arg is an empty sequence".
            raise ValueError(
                f"No accuracy strings could be parsed in row {index!r} of experiment {experimentname!r}"
            )

        # Mean, sample standard deviation and normal-approximation 95% CI
        avg_percent = np.round(np.mean(percents), 2)
        std_dev_percent = np.round(np.std(percents, ddof=1), 2)
        n = len(percents)
        if std_dev_percent != 0:
            ci_percent = stats.norm.interval(0.95, loc=avg_percent, scale=std_dev_percent/np.sqrt(n))
            lci = np.round(ci_percent[0], 2)
            uci = np.round(ci_percent[1], 2)
        else:
            lci = avg_percent
            uci = avg_percent

        # The mean difference needs no test machinery, so it is always reported.
        mean_diff = np.round(np.mean(percents) - np.mean(reference_percents), 2)

        # Best effort: the bootstrap CI and the Wilcoxon test fail independently
        # of each other, and a failed statistic is shown as NaN rather than as
        # a numeric placeholder that could be mistaken for a real result.
        try:
            mean_diff_ci = bootstrap_mean_diff_ci(percents, reference_percents)
        except Exception as e:
            print(f"ERROR in {experimentname}: {e}")
            mean_diff_ci = np.nan

        try:
            # Paired test: fails e.g. when the number of runs differs from
            # the number of reference values.
            _, p_value = wilcoxon(percents, reference_percents)
        except Exception as e:
            print(f"ERROR in {experimentname}: {e}")
            p_value = np.nan

        # Average counts across runs (X-of-Y and number of errors)
        avg_correct = np.round(np.mean(corrects), 1)
        avg_answerdwoerror = np.round(np.mean(totalanswered_woERRORs), 1)
        avg_Error = np.round(np.mean(Errors), 1)

        # Range of the per-run accuracies
        minpercent = min(percents)
        maxpercent = max(percents)

        df.at[index, f'{experimentname}_SummaryAcc'] = (
            f"{avg_percent}±{std_dev_percent} [95CI: {lci}, {uci}] [Range: {minpercent}, {maxpercent}] ({avg_correct}-of-{avg_answerdwoerror}; Error: {avg_Error})  {mean_diff} [Mean Diff CI: {mean_diff_ci}] [p-value: {p_value:.4f}]"
        )
    df = df.drop(columns=performance_columns)
    return df


def analyze_model_accuracy(excel_path):
    """Compute the overall accuracy of every model recorded in a results sheet.

    A model is any column whose name ends with '_correctness'; the model name
    is the part of the column name before the first underscore. Cells equal
    to 'correct' or 'incorrect' count as answered, anything else counts as an
    error. Accuracy is ``correct / number of rows``, i.e. errors lower it.

    Args:
        excel_path: Path to the Excel workbook, or an already loaded
            DataFrame with the same layout.

    Returns:
        DataFrame with the columns 'Model Name' and 'Oall_Accu', the latter
        formatted as ``"<pct>% (<correct>-of-<answered>; Error: <errors>)"``.
        A sheet without rows yields ``nan%`` instead of raising
        ZeroDivisionError.
    """
    # Accept a ready-made DataFrame so callers need not go through a file.
    if isinstance(excel_path, pd.DataFrame):
        df = excel_path
    else:
        df = pd.read_excel(excel_path)

    correctness_columns = [col for col in df.columns if col.endswith('_correctness')]
    total = df.shape[0]

    results = {
        'Model Name': [],
        'Oall_Accu': []
    }

    for col in correctness_columns:
        # The model name is the first underscore-separated component
        model_name = col.split('_')[0]

        # Tally the column once and read both counts from the same result
        counts = df[col].value_counts()
        correct_count = counts.get('correct', 0)
        incorrect_count = counts.get('incorrect', 0)
        total_answered = correct_count + incorrect_count

        # Denominator is the number of rows, not the number of answered questions
        accuracy = (correct_count / total) * 100 if total else float('nan')

        accuracy_str = f'{accuracy:.2f}% ({correct_count}-of-{total_answered}; Error: {total-total_answered})'

        results['Model Name'].append(model_name)
        results['Oall_Accu'].append(accuracy_str)

    return pd.DataFrame(results)

import pandas as pd
import os

# Assuming the analyze_model_accuracy function is defined as given previously

def analyze_and_merge_multiple_files(file_paths):
    """Build one accuracy table per file and outer-join them on 'Model Name'.

    Each file's result columns are renamed to ``"<file stem>>>><column>"`` so
    that the origin of every column stays visible in the merged table.

    Args:
        file_paths: Iterable of workbook paths accepted by
            ``analyze_model_accuracy``.

    Returns:
        The merged DataFrame (an empty DataFrame if no path was given).
    """
    merged = pd.DataFrame()

    for path in file_paths:
        # File name without directory and extension, used as column prefix
        stem, _extension = os.path.splitext(os.path.basename(path))

        table = analyze_model_accuracy(path)
        prefixed = {
            column: f"{stem}>>>{column}"
            for column in table.columns
            if column != "Model Name"
        }
        table.rename(columns=prefixed, inplace=True)

        # The first non-empty table seeds the result; later ones are joined
        # while keeping every model name seen so far.
        if merged.empty:
            merged = table
        else:
            merged = pd.merge(merged, table, on="Model Name", how="outer")

    return merged

# Example usage:


#prompteng_functioncall_raw=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P4-wseed-temp1.8-3.xlsx"
#prompteng_opencall_raw=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P4-wseed-temp1.8-2.xlsx"
#prompteng_list=[prompteng_functioncall_raw,prompteng_opencall_raw]
#analyze_model_accuracy(prompteng_functioncall_raw)
#example = analyze_and_merge_multiple_files(prompteng_list)




In [None]:
# Display the list of per-experiment accuracy tables.
# NOTE: the list is filled by a cell further down, which has to be run first.
all_table_performances

In [None]:
# Result workbooks of every experiment, keyed by experiment name.
# The prompt-technique experiments list three runs each: MT512, MT512-2 and MT1024.
All_experiments_dic={
    'raw':[r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-functioncall.xlsx",
    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-functioncall2.xlsx",
    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\E0P0-functioncall3.xlsx",],
    
    'DirectQuestioning':[r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-DirectQuestioning.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-DirectQuestioning-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-DirectQuestioning.xlsx",],
    
    'OptionAnalysis':[r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-OptionAnalysis.xlsx",
                         r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-OptionAnalysis-2.xlsx",
                         r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-OptionAnalysis.xlsx",],
    
    'ChainofThought': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ChainofThought.xlsx",
                       r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ChainofThought-2.xlsx",
                       r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ChainofThought.xlsx",],
    
    'AnswerandJustify': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-AnswerandJustify.xlsx",
                         r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-AnswerandJustify-2.xlsx",
                         r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-AnswerandJustify.xlsx",],
    
    'EliminationMethod': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-EliminationMethod.xlsx",
                          r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-EliminationMethod-2.xlsx",
                          r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-EliminationMethod.xlsx",],
    
    'ComparativeAnalysis': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ComparativeAnalysis.xlsx",
                            r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ComparativeAnalysis-2.xlsx",
                            r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ComparativeAnalysis.xlsx",],
    
    'ContextualEmbedding': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ContextualEmbedding.xlsx",
                            r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ContextualEmbedding-2.xlsx",
                            r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ContextualEmbedding.xlsx",],
    
    # NOTE(review): the third entry used to repeat the first MT512 workbook,
    # which counted that run twice. It now follows the MT512 / MT512-2 / MT1024
    # pattern of the other techniques -- confirm the MT1024 workbook exists.
    'ConfidenceScoring': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ConfidenceScoring.xlsx",
                          r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ConfidenceScoring-2.xlsx",
                          r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ConfidenceScoring.xlsx",],
    
    'ExpertMimicry': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ExpertMimicry.xlsx",
                      r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ExpertMimicry-2.xlsx",
                      r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ExpertMimicry.xlsx",],
    
    'ConsensusTechnique': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ConsensusTechnique.xlsx",
                           r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-ConsensusTechnique-2.xlsx",
                           r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-ConsensusTechnique.xlsx",],
    
    'GiveModelTimetoThink': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-GiveModelTimetoThink.xlsx",
                             r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT512-GiveModelTimetoThink-2.xlsx",
                             r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P1-MT1024-GiveModelTimetoThink.xlsx"],
    
    'best_prompt_1024': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best1.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best2.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best3.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best4.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best5.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best6.xlsx",],

    'best_prompt_512': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best1.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best2.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best3.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best4.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best5.xlsx",
                    r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best6.xlsx",],
    
    'best_prompt1': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best1.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best1-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best1.xlsx"],
    
    'best_prompt2': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best2-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best2.xlsx",],
    
    'best_prompt3': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best3.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best3-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best3.xlsx",],
    
    'best_prompt4': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best4.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best4-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best4.xlsx",],
    
    'best_prompt5': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best5.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best5-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best5.xlsx",],
    
    'best_prompt6': [r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best6.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT512candid_best6-2.xlsx",
                     r"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\E0P2-MT1024candid_best6.xlsx",],
    }


# Flatten the per-experiment file lists into one list, keeping dictionary order
allexperiments_list = [
    workbook
    for workbooks in All_experiments_dic.values()
    for workbook in workbooks
]

# One row per workbook and one column per model: transpose the merged table,
# promote the 'Model Name' row to the header, then drop it from the body.
allexperiments = analyze_and_merge_multiple_files(allexperiments_list).T
allexperiments.columns = allexperiments.iloc[0]
allexperiments = allexperiments.drop('Model Name').reset_index()

def extract_numbers(s):
    """Return the first four stand-alone numbers of ``s`` as (avg, sd, lci, uci).

    Intended for summary strings such as
    ``"43.89±0.96 [95CI: 42.8, 44.98] ..."``. Digits that are glued to a
    word (the "95" of "95CI") or that follow a word via a hyphen (the "-3" of
    "2-of-3") are labels rather than values and are not picked up.

    Args:
        s: Text to scan.

    Returns:
        Tuple ``(avg, sd, lci, uci)`` of floats; positions for which no
        number was found are ``None``.
    """
    # Lookbehind: the number (including its sign) must not directly follow a
    # word character or a dot. Lookahead: it must not run into a word character.
    pattern = r"(?<![\w.])[-+]?[0-9]*\.?[0-9]+(?!\w)"
    values = [float(token) for token in re.findall(pattern, s)[:4]]
    values.extend([None] * (4 - len(values)))

    avg, sd, lci, uci = values
    return avg, sd, lci, uci

# Per experiment: merge its runs, keep the raw table, and collect the
# transposed summary (one row per experiment, one column per model).
all_table_performances = []
Average_Tables = []
for experiment, experimentlist in All_experiments_dic.items():
    table_performances = analyze_and_merge_multiple_files(experimentlist)
    all_table_performances.append(table_performances)
    summary = average_performance_columns_with_stats(table_performances, experimentname=experiment)
    Average_Tables.append(summary.T.drop('Model Name'))

allaveragetables = pd.concat(Average_Tables)
allaveragetables.columns = ['GPT3.5 Accuracy', 'GPT4 Accuracy']

# Add the change relative to the baseline mean (43.89) and flag rows whose
# mean accuracy exceeds 45.
for index, row in allaveragetables.iterrows():
    avg = extract_numbers(row['GPT3.5 Accuracy'])[0]
    allaveragetables.at[index, 'Change'] = float(avg - 43.89)
    allaveragetables.at[index, 'Higherthan95UCI'] = 'Yes' if avg > 45 else 'No'

# Widen the column display so the long summary strings are not truncated
pd.set_option('display.max_colwidth', 150)
allaveragetables.drop(columns=['GPT4 Accuracy'])





In [None]:
import re

def extract_values_avgrange(s):
    """Pull the average and the range bounds out of a summary string.

    The average is the first decimal number in ``s``; the range is read from
    a ``"Range: <low>, <high>"`` fragment. Only numbers written with a
    decimal point are recognised.

    Returns:
        Tuple ``(avg, range_start, range_end)``; parts that cannot be found
        are ``None``.
    """
    def _group_as_float(match, group_number):
        # A failed search yields None for every value derived from it
        return float(match.group(group_number)) if match else None

    first_decimal = re.search(r'(\d+\.\d+)', s)
    range_fragment = re.search(r'Range: (\d+\.\d+), (\d+\.\d+)', s)

    return (
        _group_as_float(first_decimal, 1),
        _group_as_float(range_fragment, 1),
        _group_as_float(range_fragment, 2),
    )

# The two pooled "best prompt" rows are not plotted
for pooled_row in ('best_prompt_1024_SummaryAcc', 'best_prompt_512_SummaryAcc'):
    allaveragetables = allaveragetables.drop(pooled_row)
allaveragetables


# Display label for every remaining summary row
index_to_label_dic = {
    'raw_SummaryAcc': 'Raw',
    'DirectQuestioning_SummaryAcc': 'Direct Questioning',
    'OptionAnalysis_SummaryAcc': 'Option Analysis',
    'ChainofThought_SummaryAcc': 'Chain of Thought',
    'AnswerandJustify_SummaryAcc': 'Answer and Justify',
    'EliminationMethod_SummaryAcc': 'Elimination Method',
    'ComparativeAnalysis_SummaryAcc': 'Comparative Analysis',
    'ContextualEmbedding_SummaryAcc': 'Contextual Embedding',
    'ConfidenceScoring_SummaryAcc': 'Confidence Scoring',
    'ExpertMimicry_SummaryAcc': 'Expert Mimicry',
    'ConsensusTechnique_SummaryAcc': 'Consensus Technique',
    'GiveModelTimetoThink_SummaryAcc': 'Give Model Time to Think',
    'best_prompt1_SummaryAcc': 'Candidate 1',
    'best_prompt2_SummaryAcc': 'Candidate 2',
    'best_prompt3_SummaryAcc': 'Candidate 3',
    'best_prompt4_SummaryAcc': 'Candidate 4',
    'best_prompt5_SummaryAcc': 'Candidate 5',
    'best_prompt6_SummaryAcc': 'Candidate 6',
}


# Mean, lower/upper range bound and label of every row, in table order
accuracy_means = []
Lranges = []
Uranges = []
Labels = []

for index, row in allaveragetables.iterrows():
    parsed_mean, parsed_low, parsed_high = extract_values_avgrange(row['GPT3.5 Accuracy'])
    accuracy_means += [parsed_mean]
    Lranges += [parsed_low]
    Uranges += [parsed_high]
    Labels += [index_to_label_dic[str(index)]]

# Sanity check before plotting: the four lists must line up
if len({len(accuracy_means), len(Lranges), len(Uranges), len(Labels)}) == 1:
    print(f'Means: {accuracy_means}')
    print(f'Lower ranges: {Lranges}')
    print(f'Upper ranges: {Uranges}')
    print(f'Labels: {Labels}')
    print ('   ✓ ---> All lists have the same length.')

In [None]:
# Display the upper range bounds collected in the previous cell
Uranges

In [None]:
# prompt engineering
import matplotlib.pyplot as plt
import numpy as np




# One horizontal range line with a dot at the mean per technique
plt.figure(figsize=(10, 8))

row_positions = range(len(Labels))
for row_position in row_positions:
    span = [Lranges[row_position], Uranges[row_position]]
    plt.plot(span, [row_position] * 2, color='blue')  # min-to-max range
    plt.plot([accuracy_means[row_position]], [row_position],
             marker='o', markersize=8, color='blue')  # mean marker

plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Function', fontsize=14)
plt.title('Accuracy of Different Prompt Engineering Techniques', fontsize=16)
plt.yticks(row_positions, Labels, fontsize=12)
plt.xticks(fontsize=12)
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# max token
import matplotlib.pyplot as plt
import numpy as np

# Experiment settings shown on the y axis
experiment_settings = [
    'Raw', 'Direct Questioning', 'Option Analysis', 'Chain of Thought',
    'Answer and Justify', 'Elimination Method', 'Comparative Analysis',
    'Contextual Embedding', 'Confidence Scoring', 'Expert Mimicry',
    'Consensus Technique', 'Give Model Time to Think', 'Best Prompt (candidate 4)'
]

max_token_settings = ['Max_token = 512', 'Max_token = 1024', 'Max_token = 512plus']

accuracy_raw = [43.33, 50.00, 45.00, 52.54, 48.33, 41.67, 41.67, 48.33, 46.67, 45.00, 38.33, 41.67, 46.67]
accuracy_512 = [41.67, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]
accuracy_1024 = [43.33, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]

x = np.arange(len(experiment_settings))
bar_width = 0.25

plt.figure(figsize=(12, 8))

# Legend labels and data series are paired by position.
# NOTE(review): this pairs 'Max_token = 512' with accuracy_raw,
# 'Max_token = 1024' with accuracy_512 and 'Max_token = 512plus' with
# accuracy_1024 -- confirm that this matches the intended legend.
accuracy_series = (accuracy_raw, accuracy_512, accuracy_1024)
for i, (setting, accuracies) in enumerate(zip(max_token_settings, accuracy_series)):
    # Each further series is shifted down by one bar width
    plt.barh(x - (bar_width * i), accuracies, height=bar_width, label=setting)

plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Experiment Setting', fontsize=14)
plt.title('Accuracy of Different Experiment Settings with Varying Max_token Lengths', fontsize=16)
plt.yticks(x, experiment_settings, rotation=0, fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


### E0: all in one

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Palette shared by the figures of this cell.
gpt3_color, gpt4_color = '#1f77b4', '#d62728'   # blue, red
light_blue, dark_blue = '#add8e6', '#00008b'
teal, arctic = '#008080', '#a3e4d7'

# Figure 1: grouped horizontal bars, one pair (GPT3-API / GPT4-API) per function.
plt.figure(figsize=(12, 5))
plt.grid(True, linestyle='--', linewidth=0.5)

functions = ['OpenAI Open Call', 'OpenAI Function Call', 'Lang Chain']
gpt3_api_accuracy = [40.00, 43.33, 40.00]
gpt4_api_accuracy = [55.00, 68.33, 66.67]

bar_width = 0.4  # Height of each horizontal bar

_function_rows = np.arange(len(functions))

# zorder=2 is passed so the bars are drawn with a higher z-order than the default.
plt.barh(_function_rows, gpt3_api_accuracy,
         height=bar_width, color=gpt3_color, label='GPT3-API', zorder=2)
plt.barh(_function_rows + bar_width, gpt4_api_accuracy,
         height=bar_width, color=gpt4_color, label='GPT4-API', zorder=2)

plt.title('Accuracy of Different Functions', fontsize=16)
plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Function', fontsize=14)
# Tick halfway between the two bars of a pair.
plt.yticks(_function_rows + bar_width / 2, functions, fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=12)
plt.tight_layout()

# Figure 2: one point per prompt-engineering technique with a horizontal error bar.
plt.figure(figsize=(12, 13.5))

# Data (means and error-bar half-widths share the order of `functions`)
functions = [
    'Raw', 'Direct Questioning', 'Option Analysis', 'Chain of Thought', 'Answer and Justify',
    'Elimination Method', 'Comparative Analysis', 'Contextual Embedding', 'Confidence Scoring',
    'Expert Mimicry', 'Consensus Technique', 'Give Model Time to Think',
    'Candidate 1', 'Candidate 2', 'Candidate 3', 'Candidate 4', 'Candidate 5', 'Candidate 6',
]
accuracy_means = [43.89, 48.33, 43.89, 47.5, 48.33, 44.17, 42.22, 46.66, 45.56,
                  45.56, 40.0, 43.89, 38.33, 48.33, 43.89, 48.89, 47.78, 48.89]
accuracy_errors = [0.96, 1.67, 0.96, 3.54, 1.67, 3.54, 0.96, 2.89, 1.93,
                   0.96, 2.89, 2.55, 3.34, 0, 0.96, 3.47, 1.92, 0]

_technique_rows = np.arange(len(functions))
_point_style = dict(fmt='o', markersize=8, capsize=5, capthick=2, color=gpt3_color)
plt.errorbar(accuracy_means, _technique_rows, xerr=accuracy_errors, **_point_style)

plt.title('Accuracy of Different Prompt Engineering Techniques', fontsize=16)
plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Function', fontsize=14)
plt.yticks(_technique_rows, functions, fontsize=12)
plt.xticks(fontsize=12)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()

# Figure 3: Accuracy of Different Experiment Settings with Varying Max_token Lengths
plt.figure(figsize=(12, 13.5))

experiment_settings = [
    'Raw', 'Direct Questioning', 'Option Analysis', 'Chain of Thought',
    'Answer and Justify', 'Elimination Method', 'Comparative Analysis',
    'Contextual Embedding', 'Confidence Scoring', 'Expert Mimicry',
    'Consensus Technique', 'Give Model Time to Think', 'Best Prompt (candidate 4)'
]

max_token_settings = ['Max_token = 512', 'Max_token = 1024', 'Max_token = 512plus']

accuracy_raw = [43.33, 50.00, 45.00, 52.54, 48.33, 41.67, 41.67, 48.33, 46.67, 45.00, 38.33, 41.67, 46.67]
accuracy_512 = [41.67, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]
accuracy_1024 = [43.33, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]

# Series are paired with the legend labels above by position.
# NOTE(review): this pairs accuracy_raw with 'Max_token = 512', accuracy_512 with
# 'Max_token = 1024' and accuracy_1024 with 'Max_token = 512plus'. The variable names
# hint at a different mapping -- confirm which one is intended.
max_token_series = [accuracy_raw, accuracy_512, accuracy_1024]

x = np.arange(len(experiment_settings))
bar_width = 0.25

# One colour per series, in the same order as max_token_settings.
colors_third_figure = [gpt3_color, light_blue, dark_blue]

# Series i is shifted down by i bar heights so the three bars of a group are adjacent.
for i, setting in enumerate(max_token_settings):
    plt.barh(x - (bar_width * i), max_token_series[i],
             height=bar_width, label=setting, color=colors_third_figure[i])

plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Experiment Setting', fontsize=14)
plt.title('Accuracy of Different Experiment Settings with Varying Max_token Lengths', fontsize=16)

# The bars of one group are centred at x, x - bar_width, x - 2 * bar_width, so the
# group's midpoint is x - bar_width * (n_series - 1) / 2. Putting the tick there keeps
# each label next to the middle of its group instead of next to the first bar only.
group_centers = x - bar_width * (len(max_token_settings) - 1) / 2
plt.yticks(group_centers, experiment_settings, rotation=0, fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=12)
plt.grid(axis='x', linestyle='--', linewidth=0.5)
plt.tight_layout()

# Figure 4: accuracy per temperature for the best and the raw prompt, drawn as points
# with asymmetric horizontal error bars taken from the (low, high) interval tuples.
plt.figure(figsize=(12, 5))

temperatures = ['Temperature = 0', 'Temperature = 0.1', 'Temperature = 0.4',
                'Temperature = 0.6', 'Temperature = 0.9', 'Temperature = 1']
experiment_settings = ['GPT3.5-API with best prompt', 'GPT3.5-API with raw prompt']

accuracy_best_prompt = [45.56, 46.11, 45.55, 42.78, 45.0, 47.78]
accuracy_raw_prompt = [43.89, 45.56, 45.0, 42.78, 45.0, 48.33]

ci_best_prompt = [(44.47, 46.65), (43.93, 48.29), (42.67, 48.43), (38.85, 46.71), (38.46, 51.53), (44.9, 50.66)]
ci_raw_prompt = [(42.8, 44.98), (44.47, 46.65), (43.11, 46.89), (41.69, 43.86), (41.74, 48.27), (45.07, 51.6)]

x = np.arange(len(temperatures))

# One colour per entry of experiment_settings
colors_fourth_figure = [gpt3_color, arctic]

# (means, intervals) in the same order as experiment_settings: best prompt first.
_series_by_setting = (
    (accuracy_best_prompt, ci_best_prompt),
    (accuracy_raw_prompt, ci_raw_prompt),
)

for i, setting in enumerate(experiment_settings):
    _means, _intervals = _series_by_setting[i]
    # Transposing the N x 2 interval array gives rows (lows, highs); subtracting the
    # means and taking the absolute value yields the 2 x N distances passed as xerr.
    _distances = np.abs(np.array(_intervals).T - np.array(_means))
    plt.errorbar(_means, x, xerr=_distances, fmt='o', label=setting,
                 capsize=5, capthick=2, color=colors_fourth_figure[i], alpha=0.85)

plt.title('Accuracy of Different Prompt Settings at Various Temperatures', fontsize=16)
plt.xlabel('Accuracy (%)', fontsize=14)
plt.ylabel('Temperature', fontsize=14)
plt.yticks(x, temperatures, fontsize=12)
plt.xticks(fontsize=12)
plt.legend(fontsize=12)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()

# Show all figures
plt.show()


In [None]:
from altair import Padding
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Rectangle

# Palette used by the four panels of the composite figure.
gpt3_color, gpt4_color = '#1f77b4', '#ffa500'   # blue, orange
light_blue, dark_blue = '#add8e6', '#00008b'
teal, arctic = '#008080', '#a3e4d7'

# 2 x 2 grid of axes: equal column widths, bottom row taller than the top row (5 : 13.5).
_grid_layout = dict(width_ratios=[1, 1], height_ratios=[5, 13.5])
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 15.5), gridspec_kw=_grid_layout)

# Vertical / horizontal spacing between the panels
fig.subplots_adjust(hspace=0.3, wspace=0.52)


# Function to add a boxed panel label (e.g. 'a', 'b') next to an axes
def add_rectangle(ax, label, x=-0.475, y=1, fontsize=30):
    """Write ``label`` in bold inside a white, black-edged square box on ``ax``.

    Despite the name, no rectangle patch is created: the box is the ``bbox``
    of the text drawn with ``ax.text``.

    Parameters
    ----------
    ax : object
        Target axes; only ``ax.text`` and ``ax.transAxes`` are used.
    label : str
        Text of the label.
    x, y : float, optional
        Position of the text in the coordinates of ``ax.transAxes``. The
        defaults reproduce the original hard-coded placement.
    fontsize : int or float, optional
        Font size of the label (default 30, the original hard-coded value).

    Returns
    -------
    None
    """
    ax.text(x, y, label, transform=ax.transAxes, fontsize=fontsize, fontweight='bold',
            bbox=dict(facecolor='white', edgecolor='black', boxstyle='square,pad=0.15'))

#----------------------------------------------------
# Panel 'a' (axs[0, 0]): grouped horizontal bars, GPT3-API vs GPT4-API per function.
_panel_a = axs[0, 0]
_panel_a.set_facecolor('lightgrey')
_panel_a.grid(True, linestyle='--', linewidth=1, color='white')
for _side in ('top', 'right'):
    _panel_a.spines[_side].set_visible(False)
add_rectangle(_panel_a, 'a')

functions = ['OpenAI Open Call', 'OpenAI Function Call', 'Lang Chain']
gpt3_api_accuracy = [40.00, 43.33, 40.00]
gpt4_api_accuracy = [55.00, 68.33, 66.67]

bar_width = 0.3  # Height of each horizontal bar

_function_rows = np.arange(len(functions))
_panel_a.barh(_function_rows, gpt3_api_accuracy,
              height=bar_width, color=gpt3_color, label='GPT3-API', zorder=2)
_panel_a.barh(_function_rows + bar_width, gpt4_api_accuracy,
              height=bar_width, color=gpt4_color, label='GPT4-API', zorder=2)

_panel_a.set_title('Accuracy of Different Functions', fontsize=16)
_panel_a.set_xlabel('Accuracy (%)', fontsize=14)
_panel_a.set_ylabel('Function', fontsize=14)
# Tick halfway between the two bars of a pair.
_panel_a.set_yticks(_function_rows + bar_width / 2)
_panel_a.set_yticklabels(functions, fontsize=12)
_panel_a.tick_params(axis='x', labelsize=12)
_panel_a.legend(fontsize=10)

#-----------------------------------------------------------------------------------
# Panel 'd' (axs[1, 1]): one point per prompt-engineering technique with a horizontal error bar.
_panel_d = axs[1, 1]
_panel_d.set_facecolor('lightgrey')
_panel_d.grid(True, linestyle='--', linewidth=1, color='white')
for _side in ('top', 'right'):
    _panel_d.spines[_side].set_visible(False)

add_rectangle(_panel_d, 'd')

# Means and error-bar half-widths share the order of `functions`.
functions = [
    'Raw', 'Direct Questioning', 'Option Analysis', 'Chain of Thought', 'Answer and Justify',
    'Elimination Method', 'Comparative Analysis', 'Contextual Embedding', 'Confidence Scoring',
    'Expert Mimicry', 'Consensus Technique', 'Give Model Time to Think',
    'Candidate 1', 'Candidate 2', 'Candidate 3', 'Candidate 4', 'Candidate 5', 'Candidate 6',
]
accuracy_means = [43.89, 48.33, 43.89, 47.5, 48.33, 44.17, 42.22, 46.66, 45.56,
                  45.56, 40.0, 43.89, 38.33, 48.33, 43.89, 48.89, 47.78, 48.89]
accuracy_errors = [0.96, 1.67, 0.96, 3.54, 1.67, 3.54, 0.96, 2.89, 1.93,
                   0.96, 2.89, 2.55, 3.34, 0, 0.96, 3.47, 1.92, 0]

_technique_rows = np.arange(len(functions))
_point_style = dict(fmt='o', markersize=8, capsize=5, capthick=2, color=gpt3_color)
_panel_d.errorbar(accuracy_means, _technique_rows, xerr=accuracy_errors, **_point_style)

_panel_d.set_title('Accuracy of Different Prompt Engineering Techniques', fontsize=16)
_panel_d.set_xlabel('Accuracy (%)', fontsize=14)
_panel_d.set_ylabel('Function', fontsize=14)
_panel_d.set_yticks(_technique_rows)
_panel_d.set_yticklabels(functions, fontsize=12)
_panel_d.tick_params(axis='x', labelsize=12)
# Second grid call: re-applies the dashed style with a thinner line width.
_panel_d.grid(True, linestyle='--', linewidth=0.5)

#-------------------------------------------------------------------------------------------------
# Panel 'c' (axs[1, 0]): accuracy per experiment setting, three bars per setting.
axs[1, 0].set_facecolor('lightgrey')
axs[1, 0].grid(True, linestyle='--', linewidth=1, color='white')
axs[1, 0].spines['top'].set_visible(False)
axs[1, 0].spines['right'].set_visible(False)

add_rectangle(axs[1, 0], 'c')

experiment_settings = [
    'Raw', 'Direct Questioning', 'Option Analysis', 'Chain of Thought',
    'Answer and Justify', 'Elimination Method', 'Comparative Analysis',
    'Contextual Embedding', 'Confidence Scoring', 'Expert Mimicry',
    'Consensus Technique', 'Give Model Time to Think', 'Best Prompt (candidate 4)'
]

max_token_settings = ['MT: 512', 'MT: 1024', 'MT: 512plus']

accuracy_raw = [43.33, 50.00, 45.00, 52.54, 48.33, 41.67, 41.67, 48.33, 46.67, 45.00, 38.33, 41.67, 46.67]
accuracy_512 = [41.67, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]
accuracy_1024 = [43.33, 48.33, 43.33, 45.00, 50.00, 46.67, 41.67, 43.33, 53.33, 45.00, 43.33, 46.67, 45.00]

# Series are paired with the legend labels above by position.
# NOTE(review): this pairs accuracy_raw with 'MT: 512', accuracy_512 with 'MT: 1024'
# and accuracy_1024 with 'MT: 512plus'. The variable names hint at a different
# mapping -- confirm which one is intended before publishing the figure.
max_token_series = [accuracy_raw, accuracy_512, accuracy_1024]

x = np.arange(len(experiment_settings))
bar_width = 0.25

# One colour per series, in the same order as max_token_settings.
colors_third_figure = [gpt3_color, light_blue, dark_blue]

# Series i is shifted down by i bar heights so the three bars of a group are adjacent.
for i, setting in enumerate(max_token_settings):
    axs[1, 0].barh(x - (bar_width * i), max_token_series[i],
                   height=bar_width, label=setting, color=colors_third_figure[i])

axs[1, 0].set_xlabel('Accuracy (%)', fontsize=14)
axs[1, 0].set_ylabel('Experiment Setting', fontsize=14)
axs[1, 0].set_title('Accuracy of Various Max_token Lengths', fontsize=16)

# The bars of one group are centred at x, x - bar_width, x - 2 * bar_width, so the
# group's midpoint is x - bar_width * (n_series - 1) / 2. Putting the tick there keeps
# each label next to the middle of its group instead of next to the first bar only.
group_centers = x - bar_width * (len(max_token_settings) - 1) / 2
axs[1, 0].set_yticks(group_centers)
axs[1, 0].set_yticklabels(experiment_settings, fontsize=12)
axs[1, 0].tick_params(axis='x', labelsize=12)
axs[1, 0].legend(fontsize=10)
# Second grid call: re-applies the dashed style with a thinner line width.
axs[1, 0].grid(True, linestyle='--', linewidth=0.5)

#----------------------------------------------------
# Panel 'b' (axs[0, 1]): accuracy per temperature for the best and the raw prompt,
# drawn as points with asymmetric horizontal error bars.
_panel_b = axs[0, 1]

# background and grid
_panel_b.set_facecolor('lightgrey')
_panel_b.grid(True, linestyle='--', linewidth=1, color='white')
for _side in ('top', 'right'):
    _panel_b.spines[_side].set_visible(False)

add_rectangle(_panel_b, 'b')

temperatures = ['Temperature = 0', 'Temperature = 0.1', 'Temperature = 0.4',
                'Temperature = 0.6', 'Temperature = 0.9', 'Temperature = 1']
experiment_settings = ['best prompt', 'raw prompt']

accuracy_best_prompt = [45.56, 46.11, 45.55, 42.78, 45.0, 47.78]
accuracy_raw_prompt = [43.89, 45.56, 45.0, 42.78, 45.0, 48.33]

ci_best_prompt = [(44.47, 46.65), (43.93, 48.29), (42.67, 48.43), (38.85, 46.71), (38.46, 51.53), (44.9, 50.66)]
ci_raw_prompt = [(42.8, 44.98), (44.47, 46.65), (43.11, 46.89), (41.69, 43.86), (41.74, 48.27), (45.07, 51.6)]

x = np.arange(len(temperatures))

# One colour and one marker per entry of experiment_settings
colors_fourth_figure = [gpt3_color, arctic]
marker_styles = ['o', '*']  # 'o' for circle, '*' for star

# (means, intervals) in the same order as experiment_settings: best prompt first.
_series_by_setting = (
    (accuracy_best_prompt, ci_best_prompt),
    (accuracy_raw_prompt, ci_raw_prompt),
)

for i, setting in enumerate(experiment_settings):
    _means, _intervals = _series_by_setting[i]
    # Transposing the N x 2 interval array gives rows (lows, highs); subtracting the
    # means and taking the absolute value yields the 2 x N distances passed as xerr.
    _distances = np.abs(np.array(_intervals).T - np.array(_means))
    _panel_b.errorbar(_means, x, xerr=_distances, fmt=marker_styles[i], label=setting,
                      capsize=5, capthick=2, color=colors_fourth_figure[i], alpha=0.85)

_panel_b.set_title('Accuracy of Various Temperatures', fontsize=16)
_panel_b.set_xlabel('Accuracy (%)', fontsize=14)
_panel_b.set_ylabel('Temperature', fontsize=14)
_panel_b.set_yticks(x)
_panel_b.set_yticklabels(temperatures, fontsize=12)
_panel_b.tick_params(axis='x', labelsize=12)
_panel_b.legend(fontsize=10)
# Second grid call: re-applies the dashed style with a thinner line width.
_panel_b.grid(True, linestyle='--', linewidth=0.5)


# Save the composite figure as PNG, JPG and PDF, then display it.
import os

# Build the paths with os.path.join instead of hard-coded backslashes: on Windows the
# result is the same 'Submit\Figures\E0.<ext>', while on other systems a backslash is
# an ordinary file-name character and the original literal would not point into the
# Submit/Figures directory.
_figure_dir = os.path.join('Submit', 'Figures')
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(_figure_dir, exist_ok=True)

for _extension in ('png', 'jpg', 'pdf'):
    plt.savefig(os.path.join(_figure_dir, f'E0.{_extension}'), dpi=400, bbox_inches='tight')


# Show all subplots
plt.show()


### E0: Temperature and temp+seed

In [None]:
# Temperature experiments: collect the result-file paths of every run.
# For each temperature there are four experiment groups (raw / best prompt, each
# without and with "wseed"), and each group has three files: the first without a
# suffix, then "-2" and "-3".
tempreture_list = [0, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9, 1, 1.2, 1.4, 1.6, 1.8, 2]

tempreture_raw_dic = {}
tempreture_best_dic = {}

tempreture_raw_wseed_dic = {}
tempreture_best_wseed_dic = {}

# Directory that holds every workbook (trailing separator included).
_results_dir = "C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\DO_NOT_PUBLISH\\ACG self asses\\"

# (target dictionary, key prefix, file-name stem), in the order they are processed.
_experiment_groups = (
    (tempreture_raw_dic, "rawtemp_", "E0P4-raw-temp"),
    (tempreture_best_dic, "besttemp_", "E0P4-temp"),
    (tempreture_raw_wseed_dic, "rawtemp_wseed_", "E0P4-raw-wseed-temp"),
    (tempreture_best_wseed_dic, "besttemp_wseed_", "E0P4-wseed-temp"),
)
_run_suffixes = ("", "-2", "-3")

for i in tempreture_list:
    for _target, _key_prefix, _stem in _experiment_groups:
        _run_files = [f"{_results_dir}{_stem}{i}{_suffix}.xlsx" for _suffix in _run_suffixes]
        _target[f"{_key_prefix}{i}"] = _run_files

        # Report (but do not fail on) files that are missing on disk.
        for path in _run_files:
            if not os.path.exists(path):
                print(f"This file doesn't exist: {path}")
            
    

def _average_tables(experiments):
    """Return one summary table per experiment, in the iteration order of ``experiments``.

    Parameters
    ----------
    experiments : dict
        Maps an experiment name to the list of result-file paths of its runs.

    Returns
    -------
    list
        For every experiment, the result of ``average_performance_columns_with_stats``
        transposed and with its 'Model Name' row dropped.
    """
    tables = []
    for experiment, experimentlist in experiments.items():
        table_performances = analyze_and_merge_multiple_files(experimentlist)
        table_average = average_performance_columns_with_stats(table_performances, experimentname=experiment)
        tables.append(table_average.T.drop('Model Name'))
    return tables


# The four experiment groups go through exactly the same pipeline; only the source
# dictionary and the name of the resulting (single) column differ.

# calculate average performances using best prompt
best_Average_Tables = _average_tables(tempreture_best_dic)
allaveragetables_temp_best = pd.concat(best_Average_Tables)
allaveragetables_temp_best.columns = ['GPT3 Performance (best)']

# calculate average performances using raw prompt
raw_Average_Tables = _average_tables(tempreture_raw_dic)
allaveragetables_temp_raw = pd.concat(raw_Average_Tables)
allaveragetables_temp_raw.columns = ['GPT3 Performance (raw)']

# calculate average performances using best prompt (wseed)
# NOTE: the column name is deliberately the same as for the run without seed;
# the cells that parse this table look the column up under this name.
best_wseed_Average_Tables = _average_tables(tempreture_best_wseed_dic)
allaveragetables_temp_best_wseed = pd.concat(best_wseed_Average_Tables)
allaveragetables_temp_best_wseed.columns = ['GPT3 Performance (best)']

# calculate average performances using raw prompt (wseed)
raw_wseed_Average_Tables = _average_tables(tempreture_raw_wseed_dic)
allaveragetables_temp_raw_wseed = pd.concat(raw_wseed_Average_Tables)
allaveragetables_temp_raw_wseed.columns = ['GPT3 Performance (raw)']

# Last expression of the cell: the raw-prompt (wseed) table.
allaveragetables_temp_raw_wseed



In [None]:
# Set pandas' 'display.max_colwidth' option to None, then reference the four summary tables
# (presumably so the long summary strings are shown in full -- confirm).
# NOTE(review): if this runs as a single notebook cell, only the value of the last
# expression (allaveragetables_temp_raw_wseed) is echoed; the three bare names above it
# have no visible effect -- confirm whether each table was meant to be displayed.
pd.set_option('display.max_colwidth', None)
allaveragetables_temp_best_wseed
allaveragetables_temp_best
allaveragetables_temp_raw
allaveragetables_temp_raw_wseed

In [None]:
# temprature: best-woseed whisker for best
import re
import numpy as np

# Regex patterns that pull the numbers out of one summary string.
# Every number must contain a decimal point (\d+\.\d+); a value written as "45" will not match.
# NOTE(review): "upper" is the first number after "Range:" and "lower" the one before "]".
# If the summary string is written as "[Range: min, max]" the two names are swapped --
# confirm against average_performance_columns_with_stats.
pattern_average = r"(\d+\.\d+)\s*±"
pattern_std_dev = r"±(\d+\.\d+)\s*\["
pattern_upper_range = r"Range:\s*(\d+\.\d+)\s*,"
pattern_lower_range = r",\s*(\d+\.\d+)\]"

# Initialize lists to store extracted values (one entry per table row)
best_averages = []
best_std_devs = []
best_upper_ranges = []
best_lower_ranges = []
best_row_index = []
best_labels = []


def _first_float(pattern, text, row_name):
    """Return group 1 of the first match of ``pattern`` in ``text`` as a float.

    Raises ValueError naming ``row_name`` when the pattern does not match, instead of
    the opaque AttributeError produced by calling ``.group`` on ``None``.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"Row {row_name!r}: pattern {pattern!r} not found in {text!r}")
    return float(match.group(1))


# Loop through the DataFrame and extract performance data
for idx, row in allaveragetables_temp_best.iterrows():
    performance_str = row['GPT3 Performance (best)']

    # Extract the performance values as floats
    average = _first_float(pattern_average, performance_str, idx)
    std_dev = _first_float(pattern_std_dev, performance_str, idx)
    upper_range = _first_float(pattern_upper_range, performance_str, idx)
    lower_range = _first_float(pattern_lower_range, performance_str, idx)

    # Create the label from the row index, e.g. 'besttemp_0.1' -> 'best - temp 0.1'
    label_match = re.match(r'(best|raw)temp_([\d.]+)', idx)
    if label_match is None:
        raise ValueError(f"Unexpected row name {idx!r}: expected '<best|raw>temp_<temperature>'")
    prompt_type, temp = label_match.groups()
    label = f"{prompt_type} - temp {temp}"

    # Append to lists
    best_averages.append(average)
    best_std_devs.append(std_dev)
    best_upper_ranges.append(upper_range)
    best_lower_ranges.append(lower_range)

    # Append row index name to the list
    best_row_index.append(idx)
    best_labels.append(label)

# Output the statistics
print("Averages:", best_averages)
print("Standard Deviations:", best_std_devs)
print("Upper Ranges:", best_upper_ranges)
print("Lower Ranges:", best_lower_ranges)
print("Row Index Names:", best_row_index)
print("Labels:", best_labels)

# Sanity check on the collected lists
if len(best_averages) == len(best_std_devs) == len(best_upper_ranges) == len(best_lower_ranges) == len(best_row_index) == len(best_labels):
    print(" ✓ ---> All lists have the same length.")
else:
    print(" ⚠ ---> Lists have different lengths.")
    
    
#---------------------------------------
import matplotlib.pyplot as plt

# Plotting: average line with a shaded band between the two extracted range values.
plt.figure(figsize=(10, 6))
_ax = plt.gca()

# Average line first, then the band (same drawing order as before).
_ax.plot(best_labels, best_averages, marker='o', label='Average')
_ax.fill_between(best_labels, best_lower_ranges, best_upper_ranges,
                 alpha=0.3, label='Range (Min to Max)')

# Titles, axis labels and legend
_ax.set_title('Average-Min-Max Performance across Temperatures (BEST)')
_ax.set_xlabel('')
_ax.set_ylabel('Performance %')
_ax.legend()

# Look: white grid on a light-gray background, no top/right spines
_ax.grid(True, color='white', linestyle='-', linewidth=0.5)
_ax.set_facecolor('lightgray')
for _side in ('top', 'right'):
    _ax.spines[_side].set_visible(False)

plt.xticks(rotation=20)
plt.tight_layout()

# Show plot
plt.show()

In [None]:
# Bare reference to the best-prompt (wseed) summary table; presumably echoed as the cell output.
allaveragetables_temp_best_wseed

In [None]:
# temprature: best-wseed whisker for best
import re
import numpy as np

# Regex patterns that pull the numbers out of one summary string.
# Every number must contain a decimal point (\d+\.\d+); a value written as "45" will not match.
# NOTE(review): "upper" is the first number after "Range:" and "lower" the one before "]".
# If the summary string is written as "[Range: min, max]" the two names are swapped --
# confirm against average_performance_columns_with_stats.
pattern_average = r"(\d+\.\d+)\s*±"
pattern_std_dev = r"±(\d+\.\d+)\s*\["
pattern_upper_range = r"Range:\s*(\d+\.\d+)\s*,"
pattern_lower_range = r",\s*(\d+\.\d+)\]"

# Initialize lists to store extracted values (one entry per table row)
best_wseed_averages = []
best_wseed_std_devs = []
best_wseed_upper_ranges = []
best_wseed_lower_ranges = []
best_wseed_row_index = []
best_wseed_labels = []


def _first_float(pattern, text, row_name):
    """Return group 1 of the first match of ``pattern`` in ``text`` as a float.

    Raises ValueError naming ``row_name`` when the pattern does not match, instead of
    the opaque AttributeError produced by calling ``.group`` on ``None``.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"Row {row_name!r}: pattern {pattern!r} not found in {text!r}")
    return float(match.group(1))


# Loop through the DataFrame and extract performance data
for idx, row in allaveragetables_temp_best_wseed.iterrows():
    performance_str = row['GPT3 Performance (best)']

    # Extract the performance values as floats
    average = _first_float(pattern_average, performance_str, idx)
    std_dev = _first_float(pattern_std_dev, performance_str, idx)
    upper_range = _first_float(pattern_upper_range, performance_str, idx)
    lower_range = _first_float(pattern_lower_range, performance_str, idx)

    # Create the label from the row index, e.g. 'besttemp_wseed_0.1' -> 'best - temp 0.1'
    label_match = re.match(r'(best|raw)temp_wseed_([\d.]+)', idx)
    if label_match is None:
        raise ValueError(f"Unexpected row name {idx!r}: expected '<best|raw>temp_wseed_<temperature>'")
    prompt_type, temp = label_match.groups()
    label = f"{prompt_type} - temp {temp}"

    # Append to lists
    best_wseed_averages.append(average)
    best_wseed_std_devs.append(std_dev)
    best_wseed_upper_ranges.append(upper_range)
    best_wseed_lower_ranges.append(lower_range)

    # Append row index name to the list
    best_wseed_row_index.append(idx)
    best_wseed_labels.append(label)

# Output the statistics
print("Averages:", best_wseed_averages)
print("Standard Deviations:", best_wseed_std_devs)
print("Upper Ranges:", best_wseed_upper_ranges)
print("Lower Ranges:", best_wseed_lower_ranges)
print("Row Index Names:", best_wseed_row_index)
print("Labels:", best_wseed_labels)

# Sanity check on the collected lists
if len(best_wseed_averages) == len(best_wseed_std_devs) == len(best_wseed_upper_ranges) == len(best_wseed_lower_ranges) == len(best_wseed_row_index) == len(best_wseed_labels):
    print(" ✓ ---> All lists have the same length.")
else:
    print(" ⚠ ---> Lists have different lengths.")
    
    
#---------------------------------------
import matplotlib.pyplot as plt

# Plotting: average line with a shaded band between the two extracted range values.
plt.figure(figsize=(10, 6))
_ax = plt.gca()

# Average line first, then the band (same drawing order as before).
_ax.plot(best_wseed_labels, best_wseed_averages, marker='o', label='Average')
_ax.fill_between(best_wseed_labels, best_wseed_lower_ranges, best_wseed_upper_ranges,
                 alpha=0.3, label='Range (Min to Max)')

# Titles, axis labels and legend
_ax.set_title('Average-Min-Max Performance across Temperatures (BEST-wseed)')
_ax.set_xlabel('')
_ax.set_ylabel('Performance %')
_ax.legend()

# Look: white grid on a light-gray background, no top/right spines
_ax.grid(True, color='white', linestyle='-', linewidth=0.5)
_ax.set_facecolor('lightgray')
for _side in ('top', 'right'):
    _ax.spines[_side].set_visible(False)

plt.xticks(rotation=20)
plt.tight_layout()

# Show plot
plt.show()

In [None]:
# temprature: raw-woseed average-min-max plot
import re
import numpy as np

# Regex patterns that pull the numbers out of one summary string.
# Every number must contain a decimal point (\d+\.\d+); a value written as "45" will not match.
# NOTE(review): "upper" is the first number after "Range:" and "lower" the one before "]".
# If the summary string is written as "[Range: min, max]" the two names are swapped --
# confirm against average_performance_columns_with_stats.
pattern_average = r"(\d+\.\d+)\s*±"
pattern_std_dev = r"±(\d+\.\d+)\s*\["
pattern_upper_range = r"Range:\s*(\d+\.\d+)\s*,"
pattern_lower_range = r",\s*(\d+\.\d+)\]"

# Initialize lists to store extracted values (one entry per table row)
raw_averages = []
raw_std_devs = []
raw_upper_ranges = []
raw_lower_ranges = []
raw_row_index = []
raw_labels = []


def _first_float(pattern, text, row_name):
    """Return group 1 of the first match of ``pattern`` in ``text`` as a float.

    Raises ValueError naming ``row_name`` when the pattern does not match, instead of
    the opaque AttributeError produced by calling ``.group`` on ``None``.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"Row {row_name!r}: pattern {pattern!r} not found in {text!r}")
    return float(match.group(1))


# Loop through the DataFrame and extract performance data
for idx, row in allaveragetables_temp_raw.iterrows():
    performance_str = row['GPT3 Performance (raw)']

    # Extract the performance values as floats
    average = _first_float(pattern_average, performance_str, idx)
    std_dev = _first_float(pattern_std_dev, performance_str, idx)
    upper_range = _first_float(pattern_upper_range, performance_str, idx)
    lower_range = _first_float(pattern_lower_range, performance_str, idx)

    # Create the label from the row index, e.g. 'rawtemp_0.1' -> 'raw - temp 0.1'
    label_match = re.match(r'(best|raw)temp_([\d.]+)', idx)
    if label_match is None:
        raise ValueError(f"Unexpected row name {idx!r}: expected '<best|raw>temp_<temperature>'")
    prompt_type, temp = label_match.groups()
    label = f"{prompt_type} - temp {temp}"

    # Append to lists
    raw_averages.append(average)
    raw_std_devs.append(std_dev)
    raw_upper_ranges.append(upper_range)
    raw_lower_ranges.append(lower_range)

    # Append row index name to the list
    raw_row_index.append(idx)
    raw_labels.append(label)

# Output the statistics
print("Averages:", raw_averages)
print("Standard Deviations:", raw_std_devs)
print("Upper Ranges:", raw_upper_ranges)
print("Lower Ranges:", raw_lower_ranges)
print("Row Index Names:", raw_row_index)
print("Labels: ", raw_labels)

# Sanity check on the collected lists
if len(raw_averages) == len(raw_std_devs) == len(raw_upper_ranges) == len(raw_lower_ranges) == len(raw_row_index) == len(raw_labels):
    print(" ✓ ---> All lists have the same length.")
else:
    print(" ⚠ ---> Lists have different lengths.")



#---------------------------------------
import matplotlib.pyplot as plt

# Plotting: average line with a shaded band between the two extracted range values (red).
plt.figure(figsize=(10, 6))
_ax = plt.gca()

# Average line first, then the band (same drawing order as before).
_ax.plot(raw_labels, raw_averages, marker='o', label='Average', color='red')
_ax.fill_between(raw_labels, raw_lower_ranges, raw_upper_ranges,
                 alpha=0.3, label='Range (Min to Max)', color='red')

# Titles, axis labels and legend
_ax.set_title('Average, Min, and Max Performance across Temperatures (RAW-woseed)')
_ax.set_xlabel('')
_ax.set_ylabel('Performance %')
_ax.legend()

# Look: white grid on a light-gray background, no top/right spines
_ax.grid(True, color='white', linestyle='-', linewidth=0.5)
_ax.set_facecolor('lightgray')
for _side in ('top', 'right'):
    _ax.spines[_side].set_visible(False)

plt.xticks(rotation=20)
plt.tight_layout()

# Show plot
plt.show()


In [None]:
# temprature: raw-wseed average-min-max plot
import re
import numpy as np

# Regex patterns that pull the numbers out of one summary string.
# Every number must contain a decimal point (\d+\.\d+); a value written as "45" will not match.
# NOTE(review): "upper" is the first number after "Range:" and "lower" the one before "]".
# If the summary string is written as "[Range: min, max]" the two names are swapped --
# confirm against average_performance_columns_with_stats.
pattern_average = r"(\d+\.\d+)\s*±"
pattern_std_dev = r"±(\d+\.\d+)\s*\["
pattern_upper_range = r"Range:\s*(\d+\.\d+)\s*,"
pattern_lower_range = r",\s*(\d+\.\d+)\]"

# Initialize lists to store extracted values (one entry per table row)
raw_wseed_averages = []
raw_wseed_std_devs = []
raw_wseed_upper_ranges = []
raw_wseed_lower_ranges = []
raw_wseed_row_index = []
raw_wseed_labels = []


def _first_float(pattern, text, row_name):
    """Return group 1 of the first match of ``pattern`` in ``text`` as a float.

    Raises ValueError naming ``row_name`` when the pattern does not match, instead of
    the opaque AttributeError produced by calling ``.group`` on ``None``.
    """
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"Row {row_name!r}: pattern {pattern!r} not found in {text!r}")
    return float(match.group(1))


# Loop through the DataFrame and extract performance data
for idx, row in allaveragetables_temp_raw_wseed.iterrows():
    performance_str = row['GPT3 Performance (raw)']

    # Extract the performance values as floats
    average = _first_float(pattern_average, performance_str, idx)
    std_dev = _first_float(pattern_std_dev, performance_str, idx)
    upper_range = _first_float(pattern_upper_range, performance_str, idx)
    lower_range = _first_float(pattern_lower_range, performance_str, idx)

    # Create the label from the row index, e.g. 'rawtemp_wseed_0.1' -> 'raw - temp 0.1'
    label_match = re.match(r'(best|raw)temp_wseed_([\d.]+)', idx)
    if label_match is None:
        raise ValueError(f"Unexpected row name {idx!r}: expected '<best|raw>temp_wseed_<temperature>'")
    prompt_type, temp = label_match.groups()
    label = f"{prompt_type} - temp {temp}"

    # Append to lists
    raw_wseed_averages.append(average)
    raw_wseed_std_devs.append(std_dev)
    raw_wseed_upper_ranges.append(upper_range)
    raw_wseed_lower_ranges.append(lower_range)

    # Append row index name to the list
    raw_wseed_row_index.append(idx)
    raw_wseed_labels.append(label)

# Output the statistics
print("Averages:", raw_wseed_averages)
print("Standard Deviations:", raw_wseed_std_devs)
print("Upper Ranges:", raw_wseed_upper_ranges)
print("Lower Ranges:", raw_wseed_lower_ranges)
print("Row Index Names:", raw_wseed_row_index)
print("Labels: ", raw_wseed_labels)

# Sanity check on the collected lists
if len(raw_wseed_averages) == len(raw_wseed_std_devs) == len(raw_wseed_upper_ranges) == len(raw_wseed_lower_ranges) == len(raw_wseed_row_index) == len(raw_wseed_labels):
    print(" ✓ ---> All lists have the same length.")
else:
    print(" ⚠ ---> Lists have different lengths.")



#---------------------------------------
import matplotlib.pyplot as plt

# Plotting: average line with a shaded band between the two extracted range values (red).
plt.figure(figsize=(10, 6))
_ax = plt.gca()

# Average line first, then the band (same drawing order as before).
_ax.plot(raw_wseed_labels, raw_wseed_averages, marker='o', label='Average', color='red')
_ax.fill_between(raw_wseed_labels, raw_wseed_lower_ranges, raw_wseed_upper_ranges,
                 alpha=0.3, label='Range (Min to Max)', color='red')

# Titles, axis labels and legend
_ax.set_title('Average, Min, and Max Performance across Temperatures (RAW wseed)')
_ax.set_xlabel('')
_ax.set_ylabel('Performance %')
_ax.legend()

# Look: white grid on a light-gray background, no top/right spines
_ax.grid(True, color='white', linestyle='-', linewidth=0.5)
_ax.set_facecolor('lightgray')
for _side in ('top', 'right'):
    _ax.spines[_side].set_visible(False)

plt.xticks(rotation=20)
plt.tight_layout()

# Show plot
plt.show()


In [None]:
#mixing plots
import matplotlib.pyplot as plt

# 2 x 2 grid of average/min/max panels sharing one y-axis:
#   top row    -> runs without a fixed seed (raw prompt, best prompt)
#   bottom row -> runs with a fixed seed    (raw prompt, best prompt)
fig, axs = plt.subplots(2, 2, figsize=(14, 12), sharey=True)

# One colour per experiment family.
raw_woseed_color = '#8ab2c0'
best_woseed_color = '#00ace6'
raw_wseed_color = '#6BA17F'
best_wseed_color = '#20CE60'

# (axes, x labels, averages, lower bound, upper bound, colour, band alpha, title, draw y-label?)
temperature_panels = [
    (axs[0, 0], raw_labels, raw_averages, raw_lower_ranges, raw_upper_ranges,
     raw_woseed_color, 0.4,
     'Average-Min-Max Plot using Raw Prompt without Determining Seed', True),
    (axs[0, 1], best_labels, best_averages, best_lower_ranges, best_upper_ranges,
     best_woseed_color, 0.3,
     'Average-Min-Max Plot using Best Prompt without Determining Seed', False),
    (axs[1, 0], raw_wseed_labels, raw_wseed_averages, raw_wseed_lower_ranges, raw_wseed_upper_ranges,
     raw_wseed_color, 0.4,
     'Average-Min-Max Plot using Raw Prompt with Determined Seed', True),
    (axs[1, 1], best_wseed_labels, best_wseed_averages, best_wseed_lower_ranges, best_wseed_upper_ranges,
     best_wseed_color, 0.3,
     'Average-Min-Max Plot using Best Prompt with Determined Seed', False),
]

for (panel_ax, panel_labels, panel_averages, panel_lower, panel_upper,
     panel_color, band_alpha, panel_title, with_ylabel) in temperature_panels:
    # Mean line and min-max band.
    panel_ax.plot(panel_labels, panel_averages, marker='o', label='Average', color=panel_color)
    panel_ax.fill_between(panel_labels, panel_lower, panel_upper, alpha=band_alpha,
                          label='Range (Min to Max)', color=panel_color)

    panel_ax.set_xlabel('')
    if with_ylabel:
        # Only the left-hand column carries the y-axis label.
        panel_ax.set_ylabel('Accuracy %')
    panel_ax.set_title(panel_title)
    panel_ax.legend()

    # White grid on a light-gray background, no top/right frame lines.
    panel_ax.grid(True, color='white', linestyle='-', linewidth=0.5)
    panel_ax.set_facecolor('lightgray')
    panel_ax.spines['top'].set_visible(False)
    panel_ax.spines['right'].set_visible(False)
    panel_ax.tick_params(axis='x', rotation=90)

    # Re-draw the best average with a red marker edge so it stands out.
    max_index = panel_averages.index(max(panel_averages))
    panel_ax.plot(panel_labels[max_index], panel_averages[max_index], marker='o', markersize=8,
                  markeredgecolor='red', markeredgewidth=1, color=panel_color)

plt.tight_layout()  # Adjust layout to prevent overlap

# Save figure (PDF only; add other suffixes here if raster copies are needed)
fig_path=r'C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures\E0_Temp'
plt.savefig(f'{fig_path}.pdf', dpi=500, bbox_inches='tight')

# Show plot
plt.show()


### E1 raw

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt

def interactive_count_values(df, columns, replacement_dictionary=None):
    """Normalise the labels found in ``columns`` and return their percentage breakdown.

    For every column the distinct values are looked up in the replacement
    dictionary; values that are not known yet are asked for interactively via
    ``input`` and remembered, so each value is only asked once. Non-empty
    replacements are written back into ``df`` (the frame is modified in place)
    before the final percentages are computed.

    Args:
        df: pandas DataFrame holding the columns to normalise.
        columns: iterable of column names (strings) present in ``df``.
        replacement_dictionary: optional ``{original: replacement}`` mapping.
            When given, it is extended in place with every interactively
            entered replacement. An empty-string replacement means
            "leave this value unchanged".

    Returns:
        ``(counts_dict, replacement_dict)`` where ``counts_dict`` maps each
        column name to a Series of percentages (0-100) per normalised value
        and ``replacement_dict`` is the mapping that was used.

    Raises:
        ValueError: if ``df`` is not a DataFrame, a column name is invalid, or
            ``replacement_dictionary`` is neither ``None`` nor a dict.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The input df must be a pandas DataFrame.")
    if not all(isinstance(col, str) and col in df.columns for col in columns):
        raise ValueError("All elements of columns must be valid column names in the DataFrame.")
    if replacement_dictionary is not None and not isinstance(replacement_dictionary, dict):
        raise ValueError("replacement_dictionary must be a dictionary.")

    counts_dict = {}
    replacement_dict = replacement_dictionary if replacement_dictionary is not None else {}

    for col in columns:
        counts = df[col].value_counts(normalize=True) * 100
        print(f"Counts for column '{col}':")
        print(counts)

        replacements = {}
        for value in counts.index:
            if value not in replacement_dict:
                replacement_dict[value] = input(f"Enter replacement for '{value}': ")
            replacement = replacement_dict[value]
            # An empty answer means "keep as is"; this also holds for empty
            # answers remembered from an earlier column.
            if replacement != "":
                replacements[value] = replacement

        if replacements:
            # Series.replace returns a new Series, so the result has to be
            # stored back for the replacements to take effect.
            df[col] = df[col].replace(replacements)

        counts_dict[col] = df[col].value_counts(normalize=True) * 100

    return counts_dict, replacement_dict

# Load data from Excel file (replace 'excel_file_path' with your file path)
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"
df = pd.read_excel(excel_file_path)
# Strip the "Resolved: " prefix from every string cell. DataFrame.apply hands
# the lambda whole columns (never a str), so the previous per-cell isinstance
# check never matched; a regex replace works cell by cell on string values.
df = df.replace(r'^Resolved: ', '', regex=True)

# Identify the specific columns that we're interested in
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]


# Canonical spelling for every label expected in the correctness columns;
# anything not listed here is asked for interactively.
replacement_dictionary={'Incorrect': 'Incorrect', 'incorrect': 'Incorrect', 'Correct': 'Correct', 'correct': 'Correct', 'EOP': 'EOP', '2OP': '2OP', 'NOP': 'NOP', 'No answer': 'NoA', 'NoA': 'NoA'}


# Call the function to interactively count values and handle replacements
model_correctness_counts, replacements = interactive_count_values(df, correctness_columns, replacement_dictionary=replacement_dictionary)

# Print the final counted data for each column
print("\nFinal counted data for each column:")
for col, counts in model_correctness_counts.items():
    print(f"Counts for column '{col}':")
    print(counts)

# Print the replacement dictionary
print("\nReplacement dictionary:")
print(replacements)


# One row per model column, one column per label; labels a model never produced become 0
df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

# Plotting
ax = df_plot.plot(kind='bar', stacked=True, figsize=(14, 6))

ax.set_ylabel('Percent')
ax.set_xlabel('Models')
ax.set_title('Correctness Percentage Among Models')
plt.xticks(rotation=45)

plt.show()
    

In [None]:
# Maps each "<model>_correctness" column of the spreadsheet to the label shown
# in the figures. Insertion order is the plotting order. The whitespace-only
# keys are deliberate spacers: they match no column, so reindexing on their
# (distinct) values leaves an empty slot between model families.
label_dic={
    # GPT family
    'GPT3.5Web_correctness':"GPT-3.5 Web",
    'gpt-3.5-turbo-0125_correctness': "GPT-3.5 API",
    'GPT4Web_correctness':"GPT-4 Web",
    'gpt-4-0613_correctness':"GPT-4 API",
    'gpt-4o-2024-05-13_correctness': "GPT-4o API",
    'gpt-4o-mini-2024-07-18_correctness':"GPT-4omini API",
    r" ": r" ",

    # Claude family
    # The API columns are keyed by the model id, so the haiku id must carry the
    # Haiku label and the sonnet id the Sonnet label (they were swapped before).
    'ClaudeHiakuWeb-raw_correctness':"Claude3-Haiku Web",
    'claude-3-haiku-20240307_correctness':"Claude3-Haiku API",
    'ClaudeSonnetWeb-raw_correctness': "Claude3-Sonnet Web",
    'claude-3-sonnet-20240229_correctness':"Claude3-Sonnet API",
    'ClaudeOpuWeb-raw_correctness':"Claude3-Opus Web",
    'claude-3-opus-20240229_correctness':"Claude3-Opus API",
    'claude-3-5-sonnet-20240620_correctness': "Claude3.5-Sonnet API",
    r"  ": r"  ",

    # Mistral family
    'Mistral-7B-T_correctness':"Mistralv2-7b Poe",
    'mistral-instruct-v2-Q8_correctness':"Mistralv2-7b-Q8 Local",
    'MistralLarge-Poe_correctness':"Mistral-Large Poe",
    'Mixtral-8x7B-Poe_correctness':"Mixtral-8x7b Poe",
    r"     ": r"     ",

    # Llama family (including Llama-based fine-tunes)
    'Llama27B-Poe_correctness': "Llama2-7b Poe",
    'llama2-7B-Q8_correctness':"Llama2-7b-Q8 Local",
    'medicine-chat-Q8_correctness':"medicineLLM-7b-Q8 Local",
    'Llama-2-13b_correctness': 'Llama2-13b Poe',
    'llama2-13B-Q5KM_correctness':"Llama2-13b-Q5 Local",
    'Llama270B-Poe_correctness':"Llama2-70b Poe",
    'Llama3-8b-Poe_correctness':'Llama3-8b Poe',
    "llama3-8b-Q8_correctness": "Llama3-8b-Q8 Local",
    'openbioLLM-7B-Q8_correctness': "OpenBioLLM-8b-Q8 Local",
    'Llama3-70b-Poe_correctness':'Llama3-70b Poe',
    "Llama3.1-8B_correctness": "Llama3.1-8b Poe",
    "Llama3.1-70B_correctness": "Llama3.1-70b Poe",
    "Llama3.1-405B_correctness": "Llama3.1-405b Poe",
    r"       ": r"       ",

    # Gemini / Gemma family
    'GeminiWeb-raw_correctness':"Gemini Web",
    'GeminiAdvancedWeb-raw_correctness':"GeminiAdvanced Web",
    'gemma2-9b-it_correctness': "Gemma2-9b Poe",
    'Gemma2-9b-Q8_correctness': "Gemma2-9b-Q8 Local",
    "gemma2-27b-it_correctness": "Gemma2-27b Poe",
    r"    ": r"    ",

    # Phi family
    'phi3-3b-Q16_correctness':'Phi3-3b-Q16 Local',
    'Phi3-medium14b-Q6_correctness':'Phi3-14b-Q6 Local',
    }


# Canonical spelling for every raw label in the correctness columns; both
# "no answer" spellings are reported as "Error" in the figures.
replacement_dictionary = {
    'Incorrect': 'Incorrect',
    'incorrect': 'Incorrect',
    'Correct': 'Correct',
    'correct': 'Correct',
    'EOP': 'EOP',
    '2OP': '2OP',
    'NOP': 'NOP',
    'No answer': 'Error',
    'NoA': 'Error',
}

# Bar colour per outcome; insertion order is the bottom-to-top stacking order.
color_map = dict([
    ("Correct", "#008000"),    # green
    ("2OP", "#90EE90"),        # light green
    ("EOP", "#D3D3D3"),        # light grey
    ("NOP", "#A9A9A9"),        # grey
    ("Error", "#ffa500"),
    ("Incorrect", "#FF0000"),  # red
])

# Spreadsheet with the evaluated answers used by the E1 figures.
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def interactive_count_values(df, columns,replacement_dictionary=None):
    """Normalise the labels in ``columns`` of ``df`` and return percentage counts.

    Each distinct value of a column is looked up in the replacement mapping;
    unknown values are asked for with ``input`` and remembered. A non-empty
    replacement is applied to the column in place, one value at a time, in the
    order the values appear in the column's value counts.

    Returns ``(percentages, mapping)``: the per-column percentage Series
    computed after the replacements, and the mapping that was used (the
    caller's dictionary itself when one was passed).
    """
    known = {} if replacement_dictionary is None else replacement_dictionary
    percentages = {}

    for column in columns:
        # Distinct values of this column, most frequent first.
        observed = df[column].value_counts(normalize=True) * 100

        for raw_value in observed.index:
            try:
                target = known[raw_value]
            except KeyError:
                # First time this value is seen: ask once and remember the answer.
                target = input(f"Enter replacement for '{raw_value}': ")
                known[raw_value] = target

            # An empty replacement means the value is left untouched.
            if target == "":
                continue
            df[column] = df[column].replace({raw_value: target})

        # Percentages after normalisation.
        percentages[column] = df[column].value_counts(normalize=True) * 100

    return percentages, known

def draw_line(ax, pos, length, text, values=None):
    """Draw a captioned horizontal line slightly above the tallest bar.

    Args:
        ax: object providing ``annotate`` and ``plot`` (a matplotlib Axes).
        pos: x coordinate where the line starts and where the caption is anchored.
        length: horizontal extent of the line in data units.
        text: caption written above the line.
        values: object with a ``max()`` method giving the tallest bar height.
            When omitted, the module-level ``data`` is used, which is what
            this function always relied on.
    """
    source = data if values is None else values
    line_y = source.max() + 10  # slightly above the highest bar
    ax.annotate(
        text,
        xy=(pos, line_y + 5),
        xytext=(0, 0), textcoords="offset points",
        ha='center', va='bottom'
    )
    ax.plot([pos, pos + length], [line_y, line_y], color='black')

# Load the evaluated answers (path is defined in the configuration cell above)
df = pd.read_excel(excel_file_path)
# Strip the "Resolved: " prefix from every string cell. DataFrame.apply hands
# the lambda whole columns (never a str), so the previous per-cell isinstance
# check never matched; a regex replace works cell by cell on string values.
df = df.replace(r'^Resolved: ', '', regex=True)

# Identify the specific columns that we're interested in
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Normalise the labels (asking interactively for unknown ones) and get the
# percentage of each label per model column
model_correctness_counts, replacements = interactive_count_values(df, correctness_columns, replacement_dictionary=replacement_dictionary)

# Print the replacement dictionary
print("\nReplacement dictionary:")
print(replacements)


# One row per model column, one column per label; labels a model never produced become 0
df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

# Reordering columns to match color map (labels that never occurred get an all-zero column)
desired_columns = list(color_map)
valid_columns = [col for col in desired_columns if col in df_plot.columns]
missing_columns = [col for col in desired_columns if col not in df_plot.columns]
for col in missing_columns:
    df_plot[col] = 0
df_plot = df_plot[desired_columns]


# labeling and sorting indexes (models) -------------------------------
# Report mismatches between label_dic and the spreadsheet columns. The
# whitespace-only spacer keys of label_dic are always listed in the first warning.
invalid_indexes_in_desireddic = [key for key, value in label_dic.items() if key not in df_plot.index]
print(f"WARNING ! These labels you have in your label_dic are not valid in the df.index Edit the label or remove it if you added this by mistake\n {invalid_indexes_in_desireddic}")
invalid_indexes_in_originaldata = [index for index in list(df_plot.index) if index not in list(label_dic.keys())]
print(f"WARNING ! These indexes in the dataframe where missed from your label_dic. Add a corresponding label to label_dic \n {invalid_indexes_in_originaldata}")

# Rename models to their display labels and put them in label_dic order;
# labels without data (the spacers) become empty rows, i.e. gaps in the chart
df_plot.rename(index=label_dic, inplace=True)
new_order = list(label_dic.values())
df_plot = df_plot.reindex(new_order)


# Plotting the stacked bar chart
plt.figure(figsize=(12, 6))

bottom = None
for key in color_map:
    values = df_plot[key]
    plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
    if key == 'Correct':
        # Write the percentage of correct answers inside each bar
        for i, value in enumerate(values):
            if value>0:
                plt.text(i, value -12, f"{value:.1f}%", ha='center', va='bottom',
                        fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                        bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3',alpha=0.5))
    # Running total on which the next segment is stacked
    if bottom is None:
        bottom = values
    else:
        bottom = bottom + values

# Customizing the plot
plt.ylabel('Percentage')
plt.title(f'All 2022 ACG questions (N = 300)', fontdict={'fontname': 'Arial', 'fontsize': 15}, )

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right', size=10)
# Legend in the upper-right corner of the axes
plt.legend(loc='upper right',)

# Color
plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)  # Change grid color to white
plt.gca().set_facecolor('lightgray')  # Set background color to gray
# Remove top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save and show. The figure is written twice on purpose: once under its own
# name and once as the "total" panel that the stratified image/text stack reads.
plt.tight_layout()
for figure_stem in (r"Submit\Figures\E1_Performance", r"Submit\Figures\E1_Performance_imagevstext-total"):
    for extension in ("jpg", "png", "pdf"):
        plt.savefig(f"{figure_stem}.{extension}", dpi=400)
plt.show()


## E1 - horizontal

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

def interactive_count_values(df, columns, replacement_dictionary=None):
    """Normalise the labels in ``columns`` of ``df`` and return percentage counts.

    Unknown values are asked for with ``input`` and remembered; non-empty
    replacements are applied to ``df`` in place, one value at a time. A missing
    or empty ``replacement_dictionary`` starts from a fresh mapping.

    Returns ``(shares_by_column, mapping)``: the per-column percentage Series
    computed after the replacements, and the mapping that was used.
    """
    mapping = replacement_dictionary if replacement_dictionary else {}
    shares_by_column = {}

    for column in columns:
        shares = df[column].value_counts(normalize=True) * 100
        for label in shares.index:
            if label not in mapping:
                # Ask once per unseen label and remember the answer.
                mapping[label] = input(f"Enter replacement for '{label}': ")

            target = mapping[label]
            if target == "":
                # Empty replacement: keep the label as it is.
                continue
            df[column] = df[column].replace({label: target})

        shares_by_column[column] = df[column].value_counts(normalize=True) * 100

    return shares_by_column, mapping

def draw_line(ax, pos, length, text, values=None):
    """Draw a captioned horizontal line just beyond the longest horizontal bar.

    Args:
        ax: object providing ``annotate`` and ``plot`` (a matplotlib Axes).
        pos: y coordinate of the line and of the caption.
        length: horizontal extent of the line in data units.
        text: caption written next to the start of the line.
        values: object with a ``max()`` method giving the longest bar length.
            When omitted, the module-level ``data`` is used, which is what
            this function always relied on.
    """
    source = data if values is None else values
    line_x = source.max() + 10  # slightly beyond the longest bar
    ax.annotate(
        text,
        xy=(line_x + 5, pos),
        xytext=(0, 0), textcoords="offset points",
        ha='left', va='center'
    )
    ax.plot([line_x, line_x + length], [pos, pos], color='black')

# Load the evaluated answers (path is defined in the configuration cell above)
df = pd.read_excel(excel_file_path)
# Strip the "Resolved: " prefix from every string cell. DataFrame.apply hands
# the lambda whole columns (never a str), so the previous per-cell isinstance
# check never matched; a regex replace works cell by cell on string values.
df = df.replace(r'^Resolved: ', '', regex=True)

correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Normalise the labels and get the percentage of each label per model column
model_correctness_counts, replacements = interactive_count_values(df, correctness_columns, replacement_dictionary=replacement_dictionary)

print("\nReplacement dictionary:")
print(replacements)

# One row per model column, one column per label; labels a model never produced become 0
df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

# Put the label columns in color_map order (labels that never occurred get an all-zero column)
desired_columns = list(color_map)
valid_columns = [col for col in desired_columns if col in df_plot.columns]
missing_columns = [col for col in desired_columns if col not in df_plot.columns]
for col in missing_columns:
    df_plot[col] = 0
df_plot = df_plot[desired_columns]

# Report mismatches between label_dic and the spreadsheet columns. The
# whitespace-only spacer keys of label_dic are always listed in the first warning.
invalid_indexes_in_desireddic = [key for key, value in label_dic.items() if key not in df_plot.index]
print(f"WARNING ! These labels you have in your label_dic are not valid in the df.index Edit the label or remove it if you added this by mistake\n {invalid_indexes_in_desireddic}")
invalid_indexes_in_originaldata = [index for index in list(df_plot.index) if index not in list(label_dic.keys())]
print(f"WARNING ! These indexes in the dataframe where missed from your label_dic. Add a corresponding label to label_dic \n {invalid_indexes_in_originaldata}")

# Rename models to their display labels. barh draws the first row at the
# bottom, so the order is reversed to read top-to-bottom in label_dic order.
df_plot.rename(index=label_dic, inplace=True)
new_order = list(label_dic.values())
new_order_reversed = new_order[::-1]  # Reverse the order
df_plot = df_plot.reindex(new_order_reversed)

# A4 portrait (inches)
plt.figure(figsize=(8.27, 11.69))

left = None
for key in color_map:
    values = df_plot[key]
    plt.barh(df_plot.index, values, left=left, color=color_map[key], label=key)
    if key == 'Correct':
        # Write the percentage of correct answers inside each bar
        for i, value in enumerate(values):
            if value > 0:
                plt.text(value -10, i, f"{value:.1f}%", ha='left', va='center',
                        fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                        bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5))
    # Running total from which the next segment starts
    if left is None:
        left = values
    else:
        left = left + values

plt.xlabel('Percentage')
plt.title(f'All 2022 ACG questions (N = 300)', fontdict={'fontname': 'Arial', 'fontsize': 15})
plt.legend(loc='lower left')

plt.yticks(size=10)
plt.grid(True, axis='x', color='white', linestyle='-', linewidth=0.5)
plt.gca().set_facecolor('lightgray')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.tight_layout()
for extension in ("jpg", "png", "pdf"):
    plt.savefig(f"C:\\Users\\LEGION\\Documents\\GIT\\LLM_answer_GIBoard\\Submit\\Final Figures\\E1_Performance_horizontal.{extension}", dpi=400)
plt.show()

### STRATIFIED

#### E1-stratified-ImageText

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the evaluated answers (path is defined in the configuration cell above)
df = pd.read_excel(excel_file_path)
# Strip the "Resolved: " prefix from every string cell. DataFrame.apply hands
# the lambda whole columns (never a str), so the previous per-cell isinstance
# check never matched; a regex replace works cell by cell on string values.
df = df.replace(r'^Resolved: ', '', regex=True)

# Unique question types; rows without a type cannot be matched by "==" and
# would only yield an empty figure, so they are left out
question_types = df['Question Type'].dropna().unique()

# The model columns are the same for every question type
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# One stacked bar chart per question type
for question_type in question_types:
    # Independent copy of the rows of this question type:
    # interactive_count_values rewrites the columns of the frame it is given
    df_filtered = df[df['Question Type'] == question_type].copy()

    # Normalise the labels and get the percentage of each label per model column
    model_correctness_counts, replacements = interactive_count_values(df_filtered, correctness_columns, replacement_dictionary=replacement_dictionary)

    # One row per model column, one column per label; labels a model never produced become 0
    df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

    # Reordering columns to match color map (labels that never occurred get an all-zero column)
    desired_columns = list(color_map)
    valid_columns = [col for col in desired_columns if col in df_plot.columns]
    missing_columns = [col for col in desired_columns if col not in df_plot.columns]
    for col in missing_columns:
        df_plot[col] = 0
    df_plot = df_plot[desired_columns]

    # Rename models to their display labels and put them in label_dic order;
    # labels without data (the spacers) become empty rows, i.e. gaps in the chart
    df_plot.rename(index=label_dic, inplace=True)
    new_order = list(label_dic.values())
    df_plot = df_plot.reindex(new_order)

    # Plotting the stacked bar chart
    plt.figure(figsize=(12, 6))

    bottom = None
    for key in color_map:
        values = df_plot[key]
        plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
        if key == 'Correct':
            # Write the percentage of correct answers inside each bar
            for i, value in enumerate(values):
                if value>0:
                    plt.text(i, value -12, f"{value:.1f}%", ha='center', va='bottom',
                            fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                            bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3',alpha=0.5))

        # Running total on which the next segment is stacked
        if bottom is None:
            bottom = values
        else:
            bottom = bottom + values

    # Customizing the plot
    plt.ylabel('Percentage')
    plt.title(f'Question type: {question_type} (N = {df_filtered.shape[0]})',  fontdict={'fontname': 'Arial', 'fontsize': 15}, )

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=55, ha='right')
    # Legend in the upper-right corner of the axes
    plt.legend(loc='upper right')

    # Color
    plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)  # Change grid color to white
    plt.gca().set_facecolor('lightgray')  # Set background color to gray
    # Remove top and right spines
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Save (one file per format, named after the question type) and show
    plt.tight_layout()
    for extension in ("jpg", "png", "pdf"):
        plt.savefig(f"Submit\\Figures\\E1_Performance_imagevstext-{question_type}.{extension}", dpi=400,bbox_inches='tight')
    plt.show()
    

# Stack the overall, text-based and image-inclusive panels into one tagged image.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = r'E1_Performance_imagevstext-'
num_columns=1

# Explicit file list so the panels are tagged in this order rather than in
# the alphabetical order a directory scan would produce.
give_list_instead=[
    f"{directory}\\{prefix}{panel_name}.png"
    for panel_name in ("total", "text-based", "image-inclusive")
]
add_subplot_tags_and_stack(directory, prefix, num_columns, give_list_instead=give_list_instead)

#import shutil
#Final_save_dic={
#    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Final Figures\E1_Performance_imagevstext-__stacked_image.png":
#        r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figure1.png",
#    r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Final Figures\E1_Performance_imagevstext-__stacked_image.pdf":
#        r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figure1.pdf",
#}
#for key, value in Final_save_dic.items():
#    shutil.copy(src=key,dst=value)



#### E1-stratified-subcategory

In [None]:
startified_colomn_name='Qtype Category'
import pandas as pd
import matplotlib.pyplot as plt

# Load the evaluated answers (path is defined in the configuration cell above)
df = pd.read_excel(excel_file_path)
# Strip the "Resolved: " prefix from every string cell. DataFrame.apply hands
# the lambda whole columns (never a str), so the previous per-cell isinstance
# check never matched; a regex replace works cell by cell on string values.
df = df.replace(r'^Resolved: ', '', regex=True)
# Trim surrounding whitespace so categories that differ only by stray spaces are merged
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Unique categories; rows without a category cannot be matched by "==" and
# would only yield an empty figure, so they are left out
question_types = df[startified_colomn_name].dropna().unique()

# The model columns are the same for every category
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# One stacked bar chart per category
for question_type in question_types:
    # Independent copy of the rows of this category:
    # interactive_count_values rewrites the columns of the frame it is given
    df_filtered = df[df[startified_colomn_name] == question_type].copy()

    # Normalise the labels and get the percentage of each label per model column
    model_correctness_counts, replacements = interactive_count_values(df_filtered, correctness_columns, replacement_dictionary=replacement_dictionary)

    # One row per model column, one column per label; labels a model never produced become 0
    df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

    # Reordering columns to match color map (labels that never occurred get an all-zero column)
    desired_columns = list(color_map)
    valid_columns = [col for col in desired_columns if col in df_plot.columns]
    missing_columns = [col for col in desired_columns if col not in df_plot.columns]
    for col in missing_columns:
        df_plot[col] = 0
    df_plot = df_plot[desired_columns]

    # Report mismatches between label_dic and the spreadsheet columns. The
    # whitespace-only spacer keys of label_dic are always listed in the first warning.
    invalid_indexes_in_desireddic = [key for key, value in label_dic.items() if key not in df_plot.index]
    print(f"WARNING ! These labels you have in your label_dic are not valid in the df.index Edit the label or remove it if you added this by mistake\n {invalid_indexes_in_desireddic}")
    invalid_indexes_in_originaldata = [index for index in list(df_plot.index) if index not in list(label_dic.keys())]
    print(f"WARNING ! These indexes in the dataframe where missed from your label_dic. Add a corresponding label to label_dic \n {invalid_indexes_in_originaldata}")

    # Rename models to their display labels and put them in label_dic order;
    # labels without data (the spacers) become empty rows, i.e. gaps in the chart
    df_plot.rename(index=label_dic, inplace=True)
    new_order = list(label_dic.values())
    df_plot = df_plot.reindex(new_order)

    # Plotting the stacked bar chart
    plt.figure(figsize=(12, 6))

    bottom = None
    for key in color_map:
        values = df_plot[key]
        plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
        if key == 'Correct':
            # Write the percentage of correct answers inside each bar
            for i, value in enumerate(values):
                if value>0:
                    plt.text(i, value -12, f"{value:.1f}%", ha='center', va='bottom',
                            fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                            bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3',alpha=0.5))

        # Running total on which the next segment is stacked
        if bottom is None:
            bottom = values
        else:
            bottom = bottom + values

    # Customizing the plot
    plt.ylabel('Percentage')
    plt.title(f'Question type: {question_type} (N = {df_filtered.shape[0]})', fontdict={'fontname': 'Arial', 'fontsize': 15}, )

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=55, ha='right')
    # Legend in the upper-right corner of the axes
    plt.legend(loc='upper right')

    # Color
    plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)  # Change grid color to white
    plt.gca().set_facecolor('lightgray')  # Set background color to gray
    # Remove top and right spines
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Save (one file per format, named after the category) and show
    plt.tight_layout()
    for extension in ("jpg", "png", "pdf"):
        plt.savefig(f"Submit\\Figures\\E1_Performance_subcategory-{question_type}.{extension}", dpi=400, bbox_inches='tight')
    plt.show()

# Tag every saved subcategory PNG and stack them into a two-column image.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = 'E1_Performance_subcategory-'
num_columns=2
add_subplot_tags_and_stack(directory, prefix, num_columns=num_columns)




In [None]:
# Re-run the stacking after removing the extra pancreato-biliary figure from the
# directory, so that only the remaining subcategory PNGs are picked up.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = 'E1_Performance_subcategory-'
num_columns=2
add_subplot_tags_and_stack(directory, prefix, num_columns=num_columns)

#### E1-stratified-Length

In [None]:

# Stacked-bar performance figures stratified by question length: one figure per
# distinct value of the 'Qtype Length' column, each saved as JPG, PNG and PDF.
# Uses names defined in other cells: interactive_count_values,
# replacement_dictionary, color_map and label_dic.
import re

import pandas as pd
import matplotlib.pyplot as plt

startified_colomn_name = 'Qtype Length'

# Load data from Excel file (replace 'excel_file_path' with your file path)
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"
df = pd.read_excel(excel_file_path)


def _strip_resolved_prefix(cell):
    """Drop a leading 'Resolved: ' and surrounding whitespace from string cells."""
    if isinstance(cell, str):
        return re.sub('^Resolved: ', '', cell).strip()
    return cell


# DataFrame.apply hands whole columns (Series) to its callable, so an
# isinstance(x, str) test at that level never matches; the cleaning has to be
# mapped over the individual cells of every column.
df = df.apply(lambda column: column.map(_strip_resolved_prefix))

# Distinct strata. NaN is dropped because `== NaN` never matches any row and
# would only yield an empty "N = 0" figure that later gets stacked with the rest.
question_types = df[startified_colomn_name].dropna().unique()

# Graded answers of each model live in the columns ending with '_correctness'.
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Loop through each question type
for question_type in question_types:
    # Explicit copy: interactive_count_values rewrites the columns it is given.
    df_filtered = df[df[startified_colomn_name] == question_type].copy()

    # Percentage of each (normalised) grade per model column.
    model_correctness_counts, replacements = interactive_count_values(
        df_filtered, correctness_columns, replacement_dictionary=replacement_dictionary
    )

    # Rows = model columns, columns = grades.
    df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

    # Column order follows color_map; grades that never occurred become 0.
    desired_columns = list(color_map)
    missing_columns = [col for col in desired_columns if col not in df_plot.columns]
    df_plot = df_plot.reindex(columns=desired_columns, fill_value=0)

    # Rename models to their display labels and order them as in label_dic.
    # (The label/index sanity warnings are intentionally not printed in this cell.)
    df_plot.rename(index=label_dic, inplace=True)
    new_order = list(label_dic.values())
    df_plot = df_plot.reindex(new_order)

    # Plotting the stacked bar chart
    plt.figure(figsize=(12, 6))

    bottom = None
    for key in color_map:
        values = df_plot[key]
        plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
        if key == 'Correct':
            # Print the percentage of correct answers inside the top of the bar.
            for i, value in enumerate(values):
                if value > 0:
                    plt.text(i, value - 12, f"{value:.1f}%", ha='center', va='bottom',
                             fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                             bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5))

        # Build a new Series instead of adding in place so df_plot stays untouched.
        bottom = values if bottom is None else bottom + values

    # Customizing the plot
    plt.ylabel('Percentage')
    plt.title(f'Question type: {question_type} (N = {df_filtered.shape[0]})', fontdict={'fontname': 'Arial', 'fontsize': 15})

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=55, ha='right')
    plt.legend(loc='upper right')

    # White horizontal grid lines on a gray background, without top/right spines
    plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)
    plt.gca().set_facecolor('lightgray')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Save in every submission format, then display
    for extension in ('jpg', 'png', 'pdf'):
        plt.savefig(f"Submit\\Figures\\E1_Performance_Qlength-{question_type}.{extension}", dpi=400, bbox_inches='tight')
    plt.show()



In [None]:
# Stack the question-length PNGs into a single tagged figure, one panel per row.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = 'E1_Performance_Qlength-'
num_columns = 1
add_subplot_tags_and_stack(directory, prefix, num_columns=num_columns)

#### E1-stratified-Difficulty

In [None]:

# Stacked-bar performance figures stratified by question difficulty: one figure
# for each of the strata "Q1".."Q4" of the 'Qtype Difficulty' column, each saved as
# JPG, PNG and PDF.
# Uses names defined in other cells: excel_file_path, interactive_count_values,
# replacement_dictionary, color_map and label_dic.
import re

import pandas as pd
import matplotlib.pyplot as plt

startified_colomn_name = 'Qtype Difficulty'

# Load data from the workbook selected by the existing excel_file_path variable
df = pd.read_excel(excel_file_path)


def _strip_resolved_prefix(cell):
    """Drop a leading 'Resolved: ' and surrounding whitespace from string cells."""
    if isinstance(cell, str):
        return re.sub('^Resolved: ', '', cell).strip()
    return cell


# DataFrame.apply hands whole columns (Series) to its callable, so an
# isinstance(x, str) test at that level never matches; the cleaning has to be
# mapped over the individual cells of every column.
df = df.apply(lambda column: column.map(_strip_resolved_prefix))

# Strata present in the data (kept for inspection; the loop below uses a fixed order)
question_types = df[startified_colomn_name].unique()

# Graded answers of each model live in the columns ending with '_correctness'.
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Loop through the strata in a fixed Q1 -> Q4 order
for question_type in ["Q1", "Q2", "Q3", "Q4"]:
    # Explicit copy: interactive_count_values rewrites the columns it is given.
    df_filtered = df[df[startified_colomn_name] == question_type].copy()

    # Percentage of each (normalised) grade per model column.
    model_correctness_counts, replacements = interactive_count_values(
        df_filtered, correctness_columns, replacement_dictionary=replacement_dictionary
    )

    # Rows = model columns, columns = grades.
    df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

    # Column order follows color_map; grades that never occurred become 0.
    desired_columns = list(color_map)
    missing_columns = [col for col in desired_columns if col not in df_plot.columns]
    df_plot = df_plot.reindex(columns=desired_columns, fill_value=0)

    # Report mismatches between label_dic and the model columns found in the data.
    invalid_indexes_in_desireddic = [key for key in label_dic if key not in df_plot.index]
    print(f"WARNING ! These labels you have in your label_dic are not valid in the df.index Edit the label or remove it if you added this by mistake\n {invalid_indexes_in_desireddic}")
    invalid_indexes_in_originaldata = [index for index in df_plot.index if index not in label_dic]
    print(f"WARNING ! These indexes in the dataframe where missed from your label_dic. Add a corresponding label to label_dic \n {invalid_indexes_in_originaldata}")

    # Rename models to their display labels and order them as in label_dic.
    df_plot.rename(index=label_dic, inplace=True)
    new_order = list(label_dic.values())
    df_plot = df_plot.reindex(new_order)

    # Plotting the stacked bar chart
    plt.figure(figsize=(12, 6))

    bottom = None
    for key in color_map:
        values = df_plot[key]
        plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
        if key == 'Correct':
            # Print the percentage of correct answers inside the top of the bar.
            for i, value in enumerate(values):
                if value > 0:
                    plt.text(i, value - 12, f"{value:.1f}%", ha='center', va='bottom',
                             fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                             bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5))

        # Build a new Series instead of adding in place so df_plot stays untouched.
        bottom = values if bottom is None else bottom + values

    # Customizing the plot
    plt.ylabel('Percentage')
    plt.title(f'Question Difficulty (Average Human Score): {question_type} (N = {df_filtered.shape[0]})', fontdict={'fontname': 'Arial', 'fontsize': 15})

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=55, ha='right')
    plt.legend(loc='upper right')

    # White horizontal grid lines on a gray background, without top/right spines
    plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)
    plt.gca().set_facecolor('lightgray')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Save in every submission format, then display
    for extension in ('jpg', 'png', 'pdf'):
        plt.savefig(f"Submit\\Figures\\E1_Performance_Qdifficulty-{question_type}.{extension}", dpi=400, bbox_inches='tight')
    plt.show()



In [None]:
# Stack the question-difficulty PNGs into a single tagged figure, one panel per row.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = 'E1_Performance_Qdifficulty-'
num_columns = 1
add_subplot_tags_and_stack(directory, prefix, num_columns=num_columns)

#### E1-stratified-Patient care phase

In [None]:

# Stacked-bar performance figures stratified by patient-care phase: one figure per
# care-phase flag column (rows where the flag equals "Yes"), each saved as JPG, PNG
# and PDF, followed by stacking the PNGs into one figure.
# Uses names defined in other cells: interactive_count_values,
# replacement_dictionary, color_map, label_dic and add_subplot_tags_and_stack.
import re

import pandas as pd
import matplotlib.pyplot as plt

# Load data from Excel file (replace 'excel_file_path' with your file path)
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"
df = pd.read_excel(excel_file_path)


def _strip_resolved_prefix(cell):
    """Drop a leading 'Resolved: ' and surrounding whitespace from string cells."""
    if isinstance(cell, str):
        return re.sub('^Resolved: ', '', cell).strip()
    return cell


# DataFrame.apply hands whole columns (Series) to its callable, so an
# isinstance(x, str) test at that level never matches; the cleaning has to be
# mapped over the individual cells of every column.
df = df.apply(lambda column: column.map(_strip_resolved_prefix))

# Figure name -> spreadsheet flag column. The subsets are built only AFTER the
# workbook has been loaded and cleaned, so this cell no longer depends on whatever
# `df` an earlier cell left behind.
# NOTE(review): the key "Pathophisiology" ends up in the figure title and file name;
# the spelling is kept so existing output files keep their names.
care_phase_columns = {
    "Diagnosis": "Qtype Care Phase - Diagsis",
    "Treatment": "Qtype Care Phase - Treatment",
    "Complication": "Qtype Care Phase - Complications",
    "Investigation": "Qtype Care Phase - Medical Investigation",
    "Pathophisiology": "Qtype Care Phase - Pathophyisiology",
}
# Explicit copies: interactive_count_values rewrites the columns it is given.
patient_care_dict = {
    phase: df[df[column] == "Yes"].copy()
    for phase, column in care_phase_columns.items()
}

# Graded answers of each model live in the columns ending with '_correctness'.
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Loop through each care phase
for question_type, df_filtered in patient_care_dict.items():
    # Percentage of each (normalised) grade per model column.
    model_correctness_counts, replacements = interactive_count_values(
        df_filtered, correctness_columns, replacement_dictionary=replacement_dictionary
    )

    # Rows = model columns, columns = grades.
    df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

    # Column order follows color_map; grades that never occurred become 0.
    desired_columns = list(color_map)
    missing_columns = [col for col in desired_columns if col not in df_plot.columns]
    df_plot = df_plot.reindex(columns=desired_columns, fill_value=0)

    # Report mismatches between label_dic and the model columns found in the data.
    invalid_indexes_in_desireddic = [key for key in label_dic if key not in df_plot.index]
    print(f"WARNING ! These labels you have in your label_dic are not valid in the df.index Edit the label or remove it if you added this by mistake\n {invalid_indexes_in_desireddic}")
    invalid_indexes_in_originaldata = [index for index in df_plot.index if index not in label_dic]
    print(f"WARNING ! These indexes in the dataframe where missed from your label_dic. Add a corresponding label to label_dic \n {invalid_indexes_in_originaldata}")

    # Rename models to their display labels and order them as in label_dic.
    df_plot.rename(index=label_dic, inplace=True)
    new_order = list(label_dic.values())
    df_plot = df_plot.reindex(new_order)

    # Plotting the stacked bar chart
    plt.figure(figsize=(12, 6))

    bottom = None
    for key in color_map:
        values = df_plot[key]
        plt.bar(df_plot.index, values, bottom=bottom, color=color_map[key], label=key)
        if key == 'Correct':
            # Print the percentage of correct answers inside the top of the bar.
            for i, value in enumerate(values):
                if value > 0:
                    plt.text(i, value - 12, f"{value:.1f}%", ha='center', va='bottom',
                             fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                             bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5))

        # Build a new Series instead of adding in place so df_plot stays untouched.
        bottom = values if bottom is None else bottom + values

    # Customizing the plot
    plt.ylabel('Percentage')
    plt.title(f'Question type: {question_type} (N = {df_filtered.shape[0]})', fontdict={'fontname': 'Arial', 'fontsize': 15})

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=55, ha='right')
    plt.legend(loc='upper right')

    # White horizontal grid lines on a gray background, without top/right spines
    plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)
    plt.gca().set_facecolor('lightgray')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    # Save in every submission format, then display
    for extension in ('jpg', 'png', 'pdf'):
        plt.savefig(f"Submit\\Figures\\E1_Performance_Ptphase-{question_type}.{extension}", dpi=400, bbox_inches='tight')
    plt.show()

# Stack the care-phase PNGs into a single tagged figure, one panel per row.
directory = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures"
prefix = 'E1_Performance_Ptphase-'

num_columns = 1
add_subplot_tags_and_stack(directory, prefix, num_columns)

## E1 heatmap

In [None]:
# Configuration for the E1 heatmap.
#
# label_dic maps spreadsheet column names to the display labels used in the figure;
# its insertion order is the plotting order. Keys made only of whitespace are spacer
# entries: rename_and_reorder_columns treats any key whose .strip() is empty as a
# request for an empty separator column.
# NOTE(review): several spacer keys map to a value of a different length (e.g. the
# entries before and after 'Qtype Difficulty'); confirm whether that is intended.
label_dic={
    # Web
        #GPTs
    'GPT3.5Web_correctness':"GPT-3.5 Web",
    'gpt-3.5-turbo-0125_correctness': "GPT-3.5 API",
    'GPT4Web_correctness':"GPT-4 Web", 
    'gpt-4-0613_correctness':"GPT-4 API",
    'gpt-4o-2024-05-13_correctness': "GPT-4o API",
    'gpt-4o-mini-2024-07-18_correctness':"GPT-4omini API",           
    r" ": r" ",        
            
            
    #Claude
    # NOTE(review): the 'claude-3-sonnet-20240229' column is labelled "Haiku API" and
    # the 'claude-3-haiku-20240307' column is labelled "Sonnet API" — confirm that
    # this swap is deliberate. The display label "Claude3-Hiaku Web" is also spelled
    # "Hiaku"; verify before publication.
    'ClaudeHiakuWeb-raw_correctness':"Claude3-Hiaku Web",
    'claude-3-sonnet-20240229_correctness':"Claude3-Haiku API",  
    'ClaudeSonnetWeb-raw_correctness': "Claude3-Sonnet Web",
    'claude-3-haiku-20240307_correctness':"Claude3-Sonnet API", 
    'ClaudeOpuWeb-raw_correctness':"Claude3-Opus Web", 
    'claude-3-opus-20240229_correctness':"Claude3-Opus API",
    'claude-3-5-sonnet-20240620_correctness': "Claude3.5-Sonnet API",
    r"  ": r"  ",    
        
    
    # Mistral family (labels carry the access route: Poe / Local)
    'Mistral-7B-T_correctness':"Mistralv2-7b Poe",
    'mistral-instruct-v2-Q8_correctness':"Mistralv2-7b-Q8 Local",
    'MistralLarge-Poe_correctness':"Mistral-Large Poe",
    'Mixtral-8x7B-Poe_correctness':"Mixtral-8x7b Poe",
    
    r"     ": r"     ",
    # Llama family and Llama-based medical fine-tunes
    # NOTE(review): 'openbioLLM-7B-Q8' is labelled "OpenBioLLM-8b" — confirm the size.
    'Llama27B-Poe_correctness': "Llama2-7b Poe", 
    'llama2-7B-Q8_correctness':"Llama2-7b-Q8 Local",
    'medicine-chat-Q8_correctness':"medicineLLM-7b-Q8 Local",
    'Llama-2-13b_correctness': 'Llama2-13b Poe',
    'llama2-13B-Q5KM_correctness':"Llama2-13b-Q5 Local", 
    'Llama270B-Poe_correctness':"Llama2-70b Poe",
    'Llama3-8b-Poe_correctness':'Llama3-8b Poe',
    "llama3-8b-Q8_correctness": "Llama3-8b-Q8 Local", 
    'openbioLLM-7B-Q8_correctness': "OpenBioLLM-8b-Q8 Local", 
    'Llama3-70b-Poe_correctness':'Llama3-70b Poe',
    "Llama3.1-8B_correctness": "Llama3.1-8b Poe",
    "Llama3.1-70B_correctness": "Llama3.1-70b Poe",
    "Llama3.1-405B_correctness": "Llama3.1-405b Poe",
    
    r"       ": r"       ",
    
        # Gemini / Gemma
    'GeminiWeb-raw_correctness':"Gemini Web",
    'GeminiAdvancedWeb-raw_correctness':"GeminiAdvanced Web", 
    'gemma2-9b-it_correctness': "Gemma2-9b Poe",
    'Gemma2-9b-Q8_correctness': "Gemma2-9b-Q8 Local",
    "gemma2-27b-it_correctness": "Gemma2-27b Poe",
    r"    ": r"    ",

    # Phi3
    'phi3-3b-Q16_correctness':'Phi3-3b-Q16 Local',
    'Phi3-medium14b-Q6_correctness':'Phi3-14b-Q6 Local',

    # Question-level attribute columns shown alongside the model columns
    r"                       ": r"                   ",     
    'Qtype Difficulty':'Difficulty Quartiles',
    r"            ": r"           ",
    'Qtype Length': 'Length Tertiles',
    r"      ": r"      ",
    'Qtype Taxonomy - Integrated': 'Taxonomy: Integrated',
    r"         ": r"          ",
    
    'Qtype Care Phase - Diagsis': 'Diagnosis',
    'Qtype Care Phase - Treatment': 'Treatment',
    'Qtype Care Phase - Medical Investigation': 'Investigation',
    'Qtype Care Phase - Complications': 'Complications',
    r"          ": r"          ",
    
    'Category - Liver': 'Liver',
    'Category - COLON': 'Colon',
    'Category - Esophagus': 'Esophagus',
    'Category - IBD': 'IBD',
    'Category - Pancreatico-biliary': 'Pancreatico-biliary',
    
    }


# Normalises grading values: unifies the capitalisation of Correct/Incorrect, keeps
# EOP/2OP/NOP unchanged and maps 'No answer' / 'NoA' to 'Error'.
replacement_dictionary = {'Incorrect': 'Incorrect', 'incorrect': 'Incorrect', 'Correct': 'Correct', 'correct': 'Correct', 'EOP': 'EOP', '2OP': '2OP', 'NOP': 'NOP', 'No answer': 'Error', 'NoA': 'Error'}
# Cell value -> fill colour for the heatmap. Besides the grading values it covers
# the length ('Short'/'Medium'/'Long'), difficulty ('Q1'..'Q4') and 'Yes'/'No'
# values of the attribute columns listed at the end of label_dic.
color_map= {
        "Correct": "#008000",  # green
        "2OP": "#90EE90",  # light green
        "EOP": "#D3D3D3",  # light grey
        "NOP": "#A9A9A9",  # grey
        "Error": "#ffa500",  # orange
        "Incorrect": "#FF0000",  # red
        
        "Short": "#DDFFF9",
        "Medium": "#9DE5FF" ,
        "Long": "#647AFF",
            
        'Q1': "#FF7AF4",
        'Q2': "#F58AFF",
        'Q3': "#D5BDFF",
        'Q4': "#E2E3FF",
        
        'Yes': "#6c757d",
        'No': "#ffffff",
        
    }

# Workbook read by the heatmap cells below (absolute path on the author's machine).
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"

In [None]:
import pandas as pd
import re

def rename_and_reorder_columns(excel_file, label_dic):
    """Load a results table, relabel its columns and order them as in ``label_dic``.

    Args:
        excel_file: Anything ``pd.read_excel`` accepts (typically a path), or an
            already-loaded ``DataFrame`` (it is copied, never modified).
        label_dic: Mapping ``original column name -> display label``. Its order
            defines the output column order. A key consisting only of whitespace
            does not refer to a real column; it requests an empty spacer column.

    Returns:
        A DataFrame holding only the labelled columns, in ``label_dic`` order, with
        an all-NaN spacer column wherever a whitespace key occurs. Spacer columns
        are named with 1, 2, 3, ... spaces so that every one of them is unique.

    Raises:
        KeyError: If a non-spacer key of ``label_dic`` is not a column of the table.
    """
    if isinstance(excel_file, pd.DataFrame):
        df = excel_file.copy()
    else:
        df = pd.read_excel(excel_file)

    # Rename the columns based on the dictionary
    df = df.rename(columns=label_dic)

    modified_columns = []
    # Spacers are counted explicitly: deriving the name from
    # modified_columns.count(' ') stops growing after the first spacer, which made
    # the third and every later spacer reuse the two-space name.
    spacer_count = 0
    for key, label in label_dic.items():
        if key.strip():  # a real column
            modified_columns.append(label)
        else:
            spacer_count += 1
            dummy_name = ' ' * spacer_count
            df[dummy_name] = float('nan')  # NaN column used purely for spacing
            modified_columns.append(dummy_name)

    # Reorder the columns including dummy columns
    return df[modified_columns]

def clean_data(df):
    """Return a copy of ``df`` with every string cell cleaned.

    For each string cell a leading ``'Resolved: '`` is removed and surrounding
    whitespace is stripped; non-string cells (numbers, NaN, ...) are returned
    unchanged.

    The cleaning is mapped over the cells of every column. ``DataFrame.apply``
    alone passes whole columns (Series) to its callable, so an
    ``isinstance(x, str)`` check at that level never matches and the prefix would
    silently stay in place.
    """
    def _clean_cell(value):
        if isinstance(value, str):
            return re.sub('^Resolved: ', '', value).strip()
        return value

    return df.apply(lambda column: column.map(_clean_cell))

def replace_values(df, replacement_dictionary):
    """Apply ``replacement_dictionary`` to ``df`` in place and return the same object.

    The mapping is handed to ``DataFrame.replace`` (old value -> new value); the
    frame passed in is modified, no copy is made.
    """
    df.replace(to_replace=replacement_dictionary, inplace=True)
    return df

# Heatmap input pipeline: load and relabel the columns, clean the cells, then
# harmonise the grading values.
new_df = rename_and_reorder_columns(excel_file_path, label_dic)
cleaned_df = clean_data(new_df)
final_df = replace_values(cleaned_df, replacement_dictionary)

# Helper column: number of cells in each row that equal 'Correct'.
final_df['CorrectCount'] = (final_df == 'Correct').sum(axis=1)

# Rows with the most 'Correct' cells first; the helper column is dropped from the
# sorted result (it stays on final_df itself).
final_df_sorted = (
    final_df
    .sort_values(by='CorrectCount', ascending=False)
    .drop(columns=['CorrectCount'])
)
final_df_sorted

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


def draw_custom_heatmap(df, color_map,
                        output_dir=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Final Figures"):
    """Draw ``df`` as a grid of coloured cells and save it as JPG, PNG and PDF.

    Every row of ``df`` becomes one row of the grid (labelled with the index, top
    row first) and every column one column of cells. A cell is filled with
    ``color_map[value]``; values missing from ``color_map`` (including NaN) are
    drawn white.

    Args:
        df: Table of categorical cell values.
        color_map: Mapping ``cell value -> matplotlib colour``.
        output_dir: Folder that receives ``heatmap.jpg``, ``heatmap.png`` and
            ``heatmap.pdf``. Defaults to the folder that used to be hard-coded.
            The default is a raw string: the previous literals contained ``\\G``
            and ``\\S``, which are invalid escape sequences in a normal string.
    """
    import os

    # Create a figure and a subplot
    fig, ax = plt.subplots(figsize=(11, 15))  # Adjust size to fit the number of columns or use dynamic sizing

    # Number of grid rows and grid columns
    num_models = len(df)
    num_metrics = len(df.columns)

    # Ticks sit on the integer cell boundaries, so the grid lines drawn below
    # separate neighbouring cells.
    ax.set_xticks(range(num_metrics))
    ax.set_yticks(range(num_models))
    ax.set_xticklabels(df.columns, rotation=90)
    ax.set_yticklabels(df.index)

    # Draw each cell as a unit square whose corner is at (column, row)
    for i, (_, row) in enumerate(df.iterrows()):
        for j, val in enumerate(row):
            color = color_map.get(val, 'white')  # Default color is white if not specified in color_map
            ax.add_patch(plt.Rectangle((j, i), 1, 1, fill=True, color=color))

    # Adjust axis limits to make sure all rows and columns are included
    ax.set_xlim(0, num_metrics)
    ax.set_ylim(0, num_models)

    # White grid lines; hide the frame except for the left spine, and hide the x axis
    ax.grid(color='white')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.xaxis.set_visible(False)

    # This is to ensure the rows are displayed in the order as in the dataframe
    ax.invert_yaxis()

    for extension in ('jpg', 'png', 'pdf'):
        plt.savefig(os.path.join(output_dir, f"heatmap.{extension}"), dpi=400, bbox_inches='tight')
    # Show the plot
    plt.show()


# Draw the heatmap: the table is transposed, so each labelled column of
# final_df_sorted becomes one row of the grid.
draw_custom_heatmap(final_df_sorted.T, color_map)

## E2

In [None]:
# Configuration for the E2 cells.
# NOTE: this cell rebinds replacement_dictionary, color_map and excel_file_path,
# replacing the values assigned by the E1 heatmap configuration cell.

# Normalises grading values: unifies the capitalisation of Correct/Incorrect, keeps
# EOP/2OP/NOP unchanged and maps 'No answer' / 'NoA' to 'Error'.
replacement_dictionary = {'Incorrect': 'Incorrect', 'incorrect': 'Incorrect', 'Correct': 'Correct', 'correct': 'Correct', 'EOP': 'EOP', '2OP': '2OP', 'NOP': 'NOP', 'No answer': 'Error', 'NoA': 'Error'}
# Grading value -> bar colour; the key order is the stacking order of the bars.
color_map= {
        "Correct": "#008000",  # green
        "2OP": "#90EE90",  # light green
        "EOP": "#D3D3D3",  # light grey
        "NOP": "#A9A9A9",  # grey
        "Error": "#ffa500",  # orange
        "Incorrect": "#FF0000",  # red       
    }

# labeling and sorting indexes (models) -------------------------------
# One dict per model: spreadsheet column -> display label. In every dict the FIRST
# entry is the "no image" condition; the statistics cells use the first key of each
# dict as the reference that the remaining keys are compared against.
label_dic_GPT4Web={
    'GPT4Web_correctness':"GPT4 Web - no image", 
    'GPT4Web_GPT4VWebCapQ_correctness':'GPT4 Web - LLM caption',
    'GPT4VWeb_Direct_correctness':'GPT4 Web - direct image',
    'GPT4Web_HumanCapQ_correctness':'GPT4 Web - human hint',}

label_dic_ClaudeOpusWeb={
    'ClaudeOpuWeb-raw_correctness':"Claude3Opus Web - no image",
    'Claude3OpusWeb_Claude3OpusWebCapQ_correctness': "Claude3Opus Web - LLM caption",
    'Claude3OpusVWeb_Direct_correctness':  "Claude3Opus Web - direct image",
    'Claude3OpusWeb_HumanCapQ_correctness': "Claude3Opus Web - human hint",
    }

# NOTE(review): the label "GeminiAdvanced Web- direct image" lacks the space before
# the hyphen that all other labels have — confirm whether that is intended.
label_dic_GeminiAdvWeb={
    'GeminiAdvancedWeb-raw_correctness':"GeminiAdvanced Web - no image",
    'GeminiAdvanced_GeminiAdvnacedVWebQ_correctness': "GeminiAdvanced Web - LLM caption",
    'GeminiAdvnacedVWeb_Direct_correctness':  "GeminiAdvanced Web- direct image",
    'GeminiAdvancedWeb_HumanCapQ_correctness':"GeminiAdvanced Web - human hint",}


label_dic_GPT4API={
    'gpt-4-0613_correctness':"GPT4 API - no image", 
    'gpt-4-0613-APIwLLMca_correctness': 'GPT4 API - LLM caption',
    'gpt-4-vision-preview-APIDirect_correctness':'GPT4 API - direct image',
    'gpt-4-0613-APIwHumanHint_correctness':'GPT4 API - human hint',}

label_dic_ClaudeOpusAPI={
    'claude-3-opus-20240229_correctness':"Claude3Opus API - no image", 
    'claude-3-opus-20240229-APIwLLMca_correctness':'Claude3Opus API - LLM caption',
    'claude-3-opus-20240229-APIwDirect_correctness':'Claude3Opus API - direct image',
    'claude-3-opus-20240229-APIwHumanHint_correctness':'Claude3Opus API - human hint'
    }

label_dic_ClaudeSonnetAPI={
    'claude-3-sonnet-20240229_correctness':"Claude3Sonnet API - no image", 
    'claude-3-sonnet-20240229-APIwLLMca_correctness':'Claude3Sonnet API - LLM caption',
    'claude-3-sonnet-20240229-APIwDirect_correctness':'Claude3Sonnet API - direct image',
    'claude-3-sonnet-20240229-APIwHumanHint_correctness':'Claude3Sonnet API - human hint'
    }

# Order of the model clusters in the E2 figure and in the statistics table.
list_of_label_dics = [label_dic_GPT4Web, label_dic_GPT4API, label_dic_ClaudeOpusWeb, label_dic_ClaudeOpusAPI, label_dic_ClaudeSonnetAPI, label_dic_GeminiAdvWeb ]

# Disabled label sets, kept for reference; they are not part of list_of_label_dics.
# label_dic_GPT35_nodirect={
#     'GPT3.5Web_correctness':"GPT3.5 - (Web) no image", 
#     'gpt-3.5-turbo-0125_correctness': "GPT3.5 - (API) no image", 
#     'GPT3.5Web_GPT4VWebCapQ_correctness': 'GPT3.5 - LLM caption',
#     'GPT3.5VWeb_Direct_correctness':'GPT3.5 - direct image',
#     'GPT3.5Web_HumanCapQ_correctness': 'GPT3.5 - human hint',}
    
# label_dic_remainingcol={
#     #Claude
#         'ClaudeSonnetWeb-raw_correctness': "Claude3-Sonnet-Web", 
#     # Gemini
#            'GeminiWeb-raw_correctness':"Gemini-Web", 
#     #claudeAPI
#     'ClaudeHiakuWeb-raw_correctness':"Claude3Hiaku - (Web) no image", 
#     'claude-3-sonnet-20240229_correctness':"Claude3Hiaku - (API) no image",
#     'Claude3HaikuWeb_Claude3OpusWebCapQ_correctness':"Claude3Hiaku - LLM caption",
#     'claude-3-haiku-20240307_correctness':"Claude3-Sonnet-API",}


# Workbook read by the E2 cells below (absolute path on the author's machine).
excel_file_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E2-2022-imageQ-CompleteCaptions_GenerationComplete_prepared_evaluated_Final_wAPIs.xlsx"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def interactive_count_values(df, columns, replacement_dictionary=None):
    """Normalise the values of ``columns`` and return their percentage breakdown.

    For every column the distinct values are visited from most to least frequent.
    A value already present in the replacement mapping is substituted by its
    mapped value; for an unknown value the user is asked (via ``input``) what to
    substitute, and the answer is remembered so it is asked only once. An empty
    replacement string leaves the value as it is.

    Side effects: the columns of ``df`` are rewritten with the substituted values,
    and a ``replacement_dictionary`` passed in is extended in place.

    Args:
        df: DataFrame containing ``columns``.
        columns: Column names to process.
        replacement_dictionary: Optional ``value -> replacement`` mapping; a new
            empty mapping is used when omitted.

    Returns:
        ``(percentages, replacements)``: ``percentages`` maps each column name to
        its ``value_counts(normalize=True) * 100`` Series computed after the
        substitutions, and ``replacements`` is the (possibly extended) mapping.
    """
    known_replacements = {} if replacement_dictionary is None else replacement_dictionary
    percentages_by_column = {}

    for column in columns:
        # value_counts orders values by descending frequency
        observed = df[column].value_counts(normalize=True) * 100

        for raw_value in observed.index:
            if raw_value not in known_replacements:
                # Ask once and remember the answer for the remaining columns
                known_replacements[raw_value] = input(f"Enter replacement for '{raw_value}': ")
            substitute = known_replacements[raw_value]

            if substitute != "":
                df[column] = df[column].replace({raw_value: substitute})

        # Percentages are recomputed on the substituted column
        percentages_by_column[column] = df[column].value_counts(normalize=True) * 100

    return percentages_by_column, known_replacements

def draw_line(ax, pos, length, text):
    """Draw a horizontal black line on ``ax`` with ``text`` annotated just above it.

    The line runs from ``pos`` to ``pos + length`` at a height of
    ``data.max() + 10``; the text is anchored at ``x = pos``, 5 units higher.

    NOTE: ``data`` is not a parameter. It is looked up in the enclosing (module)
    scope when the function is called and must exist by then.
    """
    baseline = data.max() + 10  # slightly above the highest value in `data`
    ax.annotate(
        text,
        xy=(pos, baseline + 5),
        xytext=(0, 0),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )
    x_span = [pos, pos + length]
    ax.plot(x_span, [baseline, baseline], color='black')

# Load the workbook selected by excel_file_path and compute, for every
# '*_correctness' column, the percentage of each normalised grading value.
# Uses excel_file_path, replacement_dictionary and color_map from the
# configuration cell.
import re

df = pd.read_excel(excel_file_path)
# Remove a leading 'Resolved: ' from every string cell. The substitution is mapped
# over the cells of each column: DataFrame.apply alone passes whole columns
# (Series), so an isinstance(x, str) check at that level never matches.
df = df.apply(
    lambda column: column.map(
        lambda cell: re.sub('^Resolved: ', '', cell) if isinstance(cell, str) else cell
    )
)

# Identify the specific columns that we're interested in
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Count the values, asking interactively for a replacement of any value that is
# not yet covered by replacement_dictionary
model_correctness_counts, replacements = interactive_count_values(df, correctness_columns, replacement_dictionary=replacement_dictionary)

# Print the replacement dictionary
print("\nReplacement dictionary:")
print(replacements)

# Transform the dictionary into a DataFrame: rows = model columns, columns = grades
df_plot = pd.DataFrame(model_correctness_counts).T.fillna(0)

# Column order follows color_map; grades that never occurred become 0.
desired_columns = list(color_map)
valid_columns = [col for col in desired_columns if col in df_plot.columns]
missing_columns = [col for col in desired_columns if col not in df_plot.columns]
df_plot = df_plot.reindex(columns=desired_columns, fill_value=0)

# Nothing is plotted in this cell (the former tight_layout()/show() calls only
# produced an empty figure); the table is displayed and reused by later cells.
df_plot


In [None]:
# For every label dict build {first key: [remaining keys]}: the first column of a
# cluster is the reference the other columns are compared against.
# NOTE: the loop variable is named `label_dic`, so after this cell the module-level
# name `label_dic` refers to the last dict of list_of_label_dics.
list_of_stat_labels = []
for label_dic in list_of_label_dics:
    column_names = list(label_dic)
    stat_dic = {column_names[0]: column_names[1:]} if column_names else {}
    list_of_stat_labels.append(stat_dic)

list_of_stat_labels

In [None]:
# Compute the statistics first, then plot; df_counts is the table to reuse later.
# For every label dict build {first key: [remaining keys]}: the first column of a
# cluster is the reference the other columns are compared against.
# NOTE: the loop variable is named `label_dic`, so after this cell the module-level
# name `label_dic` refers to the last dict of list_of_label_dics.
list_of_stat_labels = []
for label_dic in list_of_label_dics:
    column_names = list(label_dic)
    stat_dic = {column_names[0]: column_names[1:]} if column_names else {}
    list_of_stat_labels.append(stat_dic)

# Convert the 'Correct' and 'Incorrect' percentages into integer counts.
# NOTE(review): multiplying a percentage by 3 gives a count only if every column
# holds 300 graded answers — confirm the sample size.
df_counts = df_plot.copy()
for grade in ('Correct', 'Incorrect'):
    df_counts[grade] = (df_counts[grade] * 3).round().astype(int)

# Function to perform chi-square test
def chi_square_test(df_counts, reference, comparison):
    """Return the chi-square p-value comparing two rows of ``df_counts``.

    A 2x2 contingency table is built from the ``'Correct'`` and ``'Incorrect'``
    cells of the ``reference`` and ``comparison`` rows and passed to
    ``scipy.stats.chi2_contingency`` with its default settings.

    Args:
        df_counts: DataFrame with ``'Correct'`` and ``'Incorrect'`` count columns.
        reference: Index label of the reference row.
        comparison: Index label of the row compared against the reference.

    Returns:
        The p-value of the test.
    """
    # Imported here because the notebook only imports chi2_contingency in a later
    # cell; a top-to-bottom run would otherwise fail with NameError.
    from scipy.stats import chi2_contingency

    observed = [
        [df_counts.at[reference, 'Correct'], df_counts.at[reference, 'Incorrect']],
        [df_counts.at[comparison, 'Correct'], df_counts.at[comparison, 'Incorrect']]
    ]
    chi2, p, dof, ex = chi2_contingency(observed)

    return p

def add_stat_annotation(p_val):
    """Translate a p-value into its significance marker.

    ``'****'`` below 0.0001, ``'***'`` below 0.001, ``'**'`` below 0.01, ``'*'``
    below 0.05 and ``'ns'`` from 0.05 upwards. A value for which none of these
    comparisons holds (NaN) yields a single space.
    """
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for upper_bound, stars in star_levels:
        if p_val < upper_bound:
            return stars
    # NaN compares False to everything and therefore falls through to ' '
    return 'ns' if p_val >= 0.05 else ' '
        
        
# Attach a p-value and its significance marker to every comparison row; each
# reference row that has at least one comparison is marked '(ref)'.
df_counts['p_value'] = None
df_counts['p_value_annotation'] = None
reference_pairs = (
    (reference, comparison)
    for comparison_dict in list_of_stat_labels
    for reference, comparisons in comparison_dict.items()
    for comparison in comparisons
)
for reference, comparison in reference_pairs:
    p_value = chi_square_test(df_counts, reference, comparison)
    df_counts.at[comparison, 'p_value'] = p_value
    df_counts.at[comparison, 'p_value_annotation'] = add_stat_annotation(p_value)
    df_counts.at[reference, 'p_value_annotation'] = r'(ref)'

# Switch the index from spreadsheet column names to display labels.
# NOTE: the loop variable is named `label_dic`, so after this cell the module-level
# name `label_dic` refers to the last dict of list_of_label_dics.
for label_dic in list_of_label_dics:
    df_counts.rename(index=label_dic, inplace=True)
df_counts


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
# Work on a copy so the renaming below does not touch df_plot.
df_plot2 = df_plot.copy()

# Rename the rows to their display labels, recording the labels in order.
ordered_indices = []
for label_dic in list_of_label_dics:
    for old_name, new_name in label_dic.items():
        df_plot2.rename(index={old_name: new_name}, inplace=True)
        ordered_indices.append(new_name)

# Build the plotting order: the display labels of each cluster, with one
# placeholder row ('empty_<i>') between consecutive clusters. After reindexing,
# the placeholder rows are filled with 0 so they render as zero-width bars
# that visually separate the clusters.
extended_indices = []
for i, dic in enumerate(list_of_label_dics):
    extended_indices.extend(dic.values())
    if i < len(list_of_label_dics) - 1:  # no separator after the last cluster
        extended_indices.append(f'empty_{i}')

df_extended = df_plot2.reindex(extended_indices).fillna(0)

# barh draws the first row at the bottom, so reverse to get top-to-bottom order.
df_extended = df_extended.iloc[::-1]

# Plotting the stacked bar chart
plt.figure(figsize=(8, 12))

bottom = None  # running left edge of the next stacked segment
for key in color_map:
    values = df_extended[key]
    plt.barh(df_extended.index, values, left=bottom, color=color_map[key], label=key)
    if key == 'Correct':
        # Write the percentage just inside the right end of each 'Correct' bar.
        for i, value in enumerate(values):
            if value > 0:
                # Positional lookup: `bottom` is labelled by model name, not by position.
                bar_end = value if bottom is None else bottom.iloc[i] + value
                plt.text(bar_end - 12, i,
                         f"{value:.1f}%", va='center', ha='left',
                         fontdict={'fontname': 'Arial', 'fontsize': 8}, color='black',
                         bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5))

    # Accumulate into a separate Series. An in-place `+=` on the column taken
    # from df_extended could modify df_extended itself.
    if bottom is None:
        bottom = values.copy()
    else:
        bottom = bottom + values

# Write each row's significance label to the right of the bars (x = 107).
for model in df_counts.index.to_list():
    if model in df_extended.index:
        plt.text(107, model, df_counts.at[model, 'p_value_annotation'])

plt.grid(True, axis='x', color='white', linestyle='-', linewidth=0.5)
plt.gca().set_facecolor('lightgray')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.legend(loc='lower right',)

# Axis labelling
plt.xlabel('Percentage')
plt.yticks(rotation=0)

# Blank out the tick labels of the placeholder separator rows.
ax = plt.gca()
y_labels = [item.get_text() for item in ax.get_yticklabels()]
y_labels = [label if 'empty' not in label else '' for label in y_labels]
ax.set_yticklabels(y_labels)

plt.tight_layout()
plt.savefig(r"Submit\Figures\E2_performance_horizental.jpg", dpi=400)
plt.savefig(r"Submit\Figures\E2_performance_horizental.png", dpi=400)
plt.savefig(r"Submit\Figures\E2_performance_horizental.pdf", dpi=400)

plt.show()

## E3

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Configure the font size only after pyplot has been imported in this cell.
plt.rcParams.update({'font.size': 15})

# Load the E3 results workbook.
E3_excel_file_path=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E3b_GPT.xlsx"
df = pd.read_excel(E3_excel_file_path)


# Find columns ending with '_correctness'
correctness_columns = [col for col in df.columns if col.endswith('_correctness')]

# Accuracy per column = share of rows whose value is exactly 'correct'.
accuracies = {}
for col in correctness_columns:
    accuracy = (df[col] == 'correct').sum() / len(df)
    accuracies[col] = accuracy

# Display label, training-data cutoff and marker colour for each known column.
labels_years = { 
                'babbage-002_correctness': ('GPT3: babbage-002', '2021-09','#AF8260'), 
                'davinci-002_correctness': ('GPT3: davinci-002', '2021-09', '#AF8260'), 
                'gpt-3.5-turbo-instruct_correctness': ('GPT3.5-Instruct', '2021-09', 'orange'), 
                'gpt-3.5-turbo-0125_correctness': ('GPT3.5-0125', '2023-12','orange'), 
                'gpt-4-0613_correctness': ('GPT4-0613', '2021-09', 'blue'), 
                'gpt-3.5-turbo-0613_correctness': ('GPT3.5-0613', '2021-09','orange'), 
                'gpt-3.5-turbo-1106_correctness': ('GPT3.5-1106', '2023-04','orange'), 
                'gpt-4-1106-preview_correctness': ('GPT4-1106', '2023-04','blue'), 
                'gpt-4-0125-preview_correctness': ('GPT4-0125', '2023-12','blue'), 
                'gpt-4o-2024-05-13_correctness':('GPT4o-20240513', '2023-09','#640D6B'),
                'gpt-4o-mini-2024-07-18_correctness':('GPT4omini-20240718', '2023-09','#640D6B')}

# Ask interactively for the metadata of any column missing from the table above.
for col in accuracies:

    if col not in labels_years:
        label = input(f"Enter a label for {col}: ")
        year = input(f"Enter the year for {col} (extracted from column name): ")
        # Keep the colour separate from the year; a chained assignment here
        # would replace the year that was just entered with the colour text.
        color = input(f"Enter the color for {col}: ")
        labels_years[col] = (label, year,color)
    print(f"Column: {col}, Accuracy: {accuracies[col]*100:.2f}% -> {labels_years[col]} ")

print(labels_years)

# Plotting the data
plt.figure(figsize=(8, 6))
# Color
plt.grid(True, axis='y', color='white', linestyle='-', linewidth=0.5)  # Change grid color to white
plt.gca().set_facecolor('lightgray')  # Set background color to gray
# Remove top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)



# One point per model.
# NOTE(review): the x values are strings, so matplotlib treats them as
# categories; presumably they are placed in order of first appearance rather
# than chronologically -- confirm the axis order is the intended one.
for col, accuracy in accuracies.items():
    label, year, color = labels_years[col]
    plt.scatter(year, accuracy * 100, label=f"{label} ({year})", s=100, color= color)  # Size of point set to 100 for visibility



plt.xlabel('Model Training Data Cutoff Date')
plt.ylabel('Percentage of Correct Answers')
#plt.title('Accuracy of GPT Models Over Years')
plt.legend(title="Model (Training Data Date)")

save_path=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\Submit\Figures\newE3b"
plt.savefig(f'{save_path}.pdf', dpi=500, bbox_inches='tight')  # Save as PDF with higher resolution
plt.savefig(f'{save_path}.png', dpi=500, bbox_inches='tight')  # Save as PNG with higher resolution
plt.savefig(f'{save_path}.jpg', dpi=500, bbox_inches='tight')  # Save as JPG with higher resolution


plt.show()

## Extra: Validation of Semi-Automated Extraction

### GPT3.5 Evaluation Sample

In [None]:
# Name of the column used as the extracted-answer axis of the confusion matrix.
Extraction_column='Extracted Answer Clean for confusion matrix'
# Name of the column used as the human-validation axis.
# NOTE(review): 'Huamn' looks like a typo, but presumably it matches the
# spreadsheet header -- confirm before changing it.
Human_Validation_column='Huamn Extracted Answer'

# Workbook read by the validation cells that follow.
validation_excel_path=r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\GPT3.5Extractor Validation.xlsx"

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the validation workbook and keep only the rows where both the human
# answer and the extracted answer are present.
df = pd.read_excel(validation_excel_path).dropna(
    subset=[Human_Validation_column, Extraction_column]
)

# Count every (human answer, extracted answer) pair, arrange rows and columns
# in the requested order, and turn combinations that never occur into 0.
confusion_data = (
    pd.crosstab(df[Human_Validation_column], df[Extraction_column])
    .reindex(index=true_labels_sort_order, columns=extracted_labels_sort_order)
    .fillna(0)
    .astype(int)
)

# Draw the counts as an annotated heatmap.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(confusion_data, annot=True, fmt='d', cmap='Blues')
plt.xlabel('GPT Extracted Option from LLM Textual Response')
plt.ylabel('Blinded Human Validation')

# Put the x ticks and the x-axis label above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

# Tilt the tick labels on both axes.
plt.xticks(rotation=45, ha='left')
plt.yticks(rotation=45)
plt.tight_layout()

# Export the figure in the three formats used for submission.
figure_stem = r"Submit\Figures\GPT_Extract_Validation"
for extension in ("jpg", "png", "pdf"):
    plt.savefig(f"{figure_stem}.{extension}", dpi=400)

plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Read the validation workbook and keep only the rows where both the human
# answer and the extracted answer are present.
df = pd.read_excel(validation_excel_path).dropna(
    subset=[Human_Validation_column, Extraction_column]
)

# Count every (human answer, extracted answer) pair, arrange rows and columns
# in the requested order, and turn combinations that never occur into 0.
confusion_data = (
    pd.crosstab(df[Human_Validation_column], df[Extraction_column])
    .reindex(index=true_labels_sort_order, columns=extracted_labels_sort_order)
    .fillna(0)
    .astype(int)
)

# Draw the counts as an annotated heatmap with bold axis titles.
axis_title_font = {'fontsize': 14, 'fontweight': 'bold'}
plt.figure(figsize=(10, 8))
ax = sns.heatmap(confusion_data, annot=True, fmt='d', cmap='Blues')
plt.xlabel('GPT-3.5 Extracted Option from LLM Textual Response',  fontdict=dict(axis_title_font))
plt.ylabel('Blinded Human Validation',  fontdict=dict(axis_title_font))

# Put the x ticks and the x-axis label above the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

# Tilt the tick labels on both axes.
plt.xticks(rotation=20, ha='left')
plt.yticks(rotation=20)

# Tick labels whose column / row gets a faint outline.
highlight_labels = ['A', 'B', 'C', 'D', 'E']
outline_style = dict(fill=False, edgecolor='#ff0080', lw=1, alpha=0.3)

# Outline each highlighted column: one cell wide, five cells tall from the top.
for tick in ax.get_xticklabels():
    tick_text = tick.get_text()
    if tick_text not in highlight_labels:
        continue
    column_position = extracted_labels_sort_order.index(tick_text)
    ax.add_patch(Rectangle((column_position, 0), 1, 5, **outline_style))

# Outline each highlighted row: five cells wide from the left, one cell tall.
for tick in ax.get_yticklabels():
    tick_text = tick.get_text()
    if tick_text not in highlight_labels:
        continue
    row_position = true_labels_sort_order.index(tick_text)
    ax.add_patch(Rectangle((0, row_position), 5, 1, **outline_style))

plt.show()


### Count Manually Evaluated answers

In [None]:
# Workbook whose '*_answer' columns are analysed by
# count_and_percent_non_standard_answers in this section.
E1_excel_file_path= r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\__Milestone Datasets\E1_Bestprompt-2022_all_Final_E2_manualevaluatedcompleted_Final_E2_Final.xlsx"


import pandas as pd

import pandas as pd

def count_and_percent_non_standard_answers(excel_file_path, specific_columns=None, answers_per_column=300):
    """Count answers that are not one of A-E in every '*_answer' column.

    Parameters
    ----------
    excel_file_path : str, path-like or pandas.DataFrame
        Excel workbook to read. An already-loaded DataFrame is also accepted
        and used as-is, which avoids re-reading the same file.
    specific_columns : list of str, optional
        Restrict the analysis to these columns. Names that are missing from
        the data or do not end with '_answer' are ignored. When omitted,
        every column ending with '_answer' is analysed.
    answers_per_column : int or None, default 300
        Denominator used for each column's percentage. The default keeps the
        fixed value of 300 per column; pass ``None`` to use the number of
        non-null answers actually present in each column instead.

    Returns
    -------
    dict
        One entry per analysed column plus an 'Overall' entry, each holding
        'Total Count', 'Non-Standard Count' and 'Percentage'. Missing (null)
        cells are never counted as non-standard.
    """
    if isinstance(excel_file_path, pd.DataFrame):
        df = excel_file_path
    else:
        df = pd.read_excel(excel_file_path)

    # Determine which columns to analyze, preserving the caller's order.
    if specific_columns is not None:
        candidate_columns = [col for col in specific_columns if col in df.columns]
    else:
        candidate_columns = list(df.columns)
    # Non-string column labels cannot end with '_answer'; skip them safely.
    answer_columns = [
        col for col in candidate_columns
        if isinstance(col, str) and col.endswith('_answer')
    ]

    valid_answers = {'A', 'B', 'C', 'D', 'E'}

    results = {}
    overall_total_count = 0
    overall_non_standard_count = 0

    for col in answer_columns:
        answers = df[col]
        if answers_per_column is None:
            total_count = int(answers.count())  # non-null answers only
        else:
            total_count = answers_per_column
        # .count() ignores nulls, so only real, non-A-E answers are counted.
        non_standard_count = int(answers[~answers.isin(valid_answers)].count())
        percentage = (non_standard_count / total_count) * 100 if total_count > 0 else 0

        overall_total_count += total_count
        overall_non_standard_count += non_standard_count

        results[col] = {'Total Count': total_count, 'Non-Standard Count': non_standard_count, 'Percentage': percentage}

    overall_percentage = (overall_non_standard_count / overall_total_count) * 100 if overall_total_count > 0 else 0

    results['Overall'] = {'Total Count': overall_total_count, 'Non-Standard Count': overall_non_standard_count, 'Percentage': overall_percentage}

    return results

# Example usage with an explicit column selection:
# excel_file_path = 'path_to_your_excel_file.xlsx'
# specific_columns = ['math_answer', 'science_answer']  # Optional: specify columns
# print(count_and_percent_non_standard_answers(excel_file_path, specific_columns))


# Analyse every '*_answer' column of the E1 workbook.
results = count_and_percent_non_standard_answers(E1_excel_file_path)
print(results)

# Example usage without a column selection:
# excel_file_path = 'path_to_your_excel_file.xlsx'
# print(count_and_percent_non_standard_answers(excel_file_path))

In [None]:
# To inspect every available answer column instead of a hand-picked subset:
# df = pd.read_excel(E1_excel_file_path)
# answer_columns = [col for col in df.columns if col.endswith('_answer')]

# Answer columns to analyse, grouped by the prefix of the column name.
gpt_answer_columns = [
    'gpt-3.5-turbo-0125_answer',
    'gpt-4-0613_answer',
    'gpt-4o-2024-05-13_answer',
    'gpt-4o-mini-2024-07-18_answer',
]
claude_answer_columns = [
    'claude-3-5-sonnet-20240620_answer',
    'claude-3-opus-20240229_answer',
    'claude-3-haiku-20240307_answer',
    'claude-3-sonnet-20240229_answer',
]
desired_columns_list = gpt_answer_columns + claude_answer_columns

# Restrict the non-standard-answer statistics to the selected columns.
results = count_and_percent_non_standard_answers(
    E1_excel_file_path,
    desired_columns_list,
)
print(results)

## Extra: Image Resolution

In [None]:
import os
from PIL import Image
import numpy as np

def _mean_and_std_of_pairs(pairs):
    """Return ((mean_x, mean_y), (std_x, std_y)) for a list of 2-tuples, zeros if empty."""
    if not pairs:
        return (0, 0), (0, 0)
    first, second = zip(*pairs)
    return (np.mean(first), np.mean(second)), (np.std(first), np.std(second))


def get_image_stats(directory):
    """Summarise pixel dimensions and DPI of the image files in ``directory``.

    Only files directly inside ``directory`` whose name ends with one of the
    recognised image extensions are opened; the extension is matched
    case-insensitively so that names like 'SCAN.PNG' are included.

    Returns a dict with four (x, y) tuples: 'average_dimensions',
    'std_dimensions', 'average_resolution' and 'std_resolution'. All values
    are 0 when no image file is found.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    dimensions = []
    resolutions = []

    for filename in os.listdir(directory):
        if not filename.lower().endswith(image_extensions):
            continue
        filepath = os.path.join(directory, filename)
        with Image.open(filepath) as img:
            dimensions.append(img.size)  # (width, height)
            # Images without a 'dpi' entry are recorded as (0, 0), so they
            # lower the reported average resolution.
            resolutions.append(img.info.get('dpi', (0, 0)))

    average_dimensions, std_dimensions = _mean_and_std_of_pairs(dimensions)
    average_resolution, std_resolution = _mean_and_std_of_pairs(resolutions)

    return {
        'average_dimensions': average_dimensions,
        'std_dimensions': std_dimensions,
        'average_resolution': average_resolution,
        'std_resolution': std_resolution
    }

# Example usage: print the dimension and DPI summary for one image folder.
directory_path = r"C:\Users\LEGION\Documents\GIT\LLM_answer_GIBoard\DO_NOT_PUBLISH\ACG self asses\2022"
stats = get_image_stats(directory_path)
print(stats)