In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-model score files produced upstream. Each file is parsed as
# JSON; the plotting code later iterates the result with ``.items()``, so a
# top-level JSON object is expected.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's locale default.
with open("log_prob_score_qwen.json", "r", encoding="utf-8") as f:
    qwen = json.load(f)

with open("log_prob_score_codellama.json", "r", encoding="utf-8") as f:
    codellama = json.load(f)

In [None]:
# Maps the variant keys looked up in every score record to the category label
# shown on the x-axis. Insertion order doubles as the plotting order.
_VARIANT_LABELS = {'least': 'Best', 'more': 'Medium', 'most': 'Worst'}
_FRAME_COLUMNS = ['Score', 'Category', 'Dataset']


def _scores_to_frame(dataset, dataset_name):
    """Flatten one ``{key: {variant: score}}`` mapping into a long-format frame."""
    records = [
        {'Score': values[variant], 'Category': label, 'Dataset': dataset_name}
        for values in dataset.values()
        for variant, label in _VARIANT_LABELS.items()
        if variant in values
    ]
    # Passing ``columns`` keeps the schema intact even when no record matched.
    return pd.DataFrame(records, columns=_FRAME_COLUMNS)


def _remove_iqr_outliers(df):
    """Drop rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` per (Dataset, Category)."""
    kept = []
    for dataset in df['Dataset'].unique():
        for category in df['Category'].unique():
            subset = df[(df['Dataset'] == dataset) & (df['Category'] == category)]
            if subset.empty:
                # Nothing to filter; also avoids concatenating empty frames.
                continue

            q1 = subset['Score'].quantile(0.25)
            q3 = subset['Score'].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            filtered = subset[(subset['Score'] >= lower_bound) &
                              (subset['Score'] <= upper_bound)]
            kept.append(filtered)

            outliers_removed = len(subset) - len(filtered)
            if outliers_removed > 0:
                print(f"Removed {outliers_removed} outliers from {dataset} - {category}")

    if not kept:
        # Preserve the column schema instead of returning a column-less frame.
        return df.iloc[0:0]
    # Concatenate once (not inside the loop) and hand back a clean 0..n-1 index.
    return pd.concat(kept, ignore_index=True)


def create_boxplot_no_outliers(dataset1, dataset2, dataset1_name='Dataset 1', dataset2_name='Dataset 2',
                               output_path='log_probability_boxplot_no_outliers.png'):
    """Plot IQR-filtered score distributions of two datasets side by side.

    Each dataset is a mapping whose values are mappings that may contain the
    keys ``'least'``, ``'more'`` and ``'most'``; these are plotted as the
    categories ``'Best'``, ``'Medium'`` and ``'Worst'`` respectively. Outliers
    are removed independently for every (dataset, category) pair using the
    1.5 * IQR rule, and the number of removed points is printed.

    The figure (box plot with an overlaid strip plot) is saved, shown, and
    per-category summary statistics are printed for each dataset.

    Args:
        dataset1: First ``{key: {variant: score}}`` mapping.
        dataset2: Second ``{key: {variant: score}}`` mapping.
        dataset1_name: Legend label / hue value for ``dataset1``.
        dataset2_name: Legend label / hue value for ``dataset2``.
        output_path: Where to save the figure (300 dpi). Pass ``None`` to
            skip saving.

    Returns:
        None.

    Raises:
        ValueError: If neither dataset contains any recognised score.
    """
    frames = [
        frame
        for frame in (_scores_to_frame(dataset1, dataset1_name),
                      _scores_to_frame(dataset2, dataset2_name))
        if not frame.empty
    ]
    if not frames:
        raise ValueError(
            "No scores found: expected records with 'least', 'more' or 'most' keys."
        )
    # ignore_index gives every row a unique label across both datasets.
    combined_df = pd.concat(frames, ignore_index=True)

    filtered_df = _remove_iqr_outliers(combined_df)

    # Fix the axis/hue order explicitly so it does not depend on which variant
    # or dataset happens to appear first in the data; only levels that are
    # actually present are listed, so no empty slots are drawn.
    present_categories = set(filtered_df['Category'])
    present_datasets = set(filtered_df['Dataset'])
    category_order = [label for label in _VARIANT_LABELS.values() if label in present_categories]
    hue_order = [name for name in dict.fromkeys((dataset1_name, dataset2_name))
                 if name in present_datasets]

    palette = {dataset1_name: '#6495ED', dataset2_name: '#FF8C00'}  # Cornflower Blue and Dark Orange

    fig, ax = plt.subplots(figsize=(14, 12))

    sns.boxplot(x='Category', y='Score', hue='Dataset', data=filtered_df,
                order=category_order, hue_order=hue_order, palette=palette, ax=ax)

    # Overlay the individual points with matching colors.
    sns.stripplot(x='Category', y='Score', hue='Dataset', data=filtered_df,
                  order=category_order, hue_order=hue_order,
                  dodge=True, alpha=0.5, size=4, palette=palette, ax=ax)

    # Both plots register legend entries; keep the first handle per label so
    # each dataset appears exactly once regardless of how many are present.
    handles, labels = ax.get_legend_handles_labels()
    unique_entries = {}
    for handle, label in zip(handles, labels):
        unique_entries.setdefault(label, handle)
    ax.legend(list(unique_entries.values()), list(unique_entries.keys()),
              fontsize=18, loc='upper right')

    ax.set_ylabel('IndirectScore', fontsize=26, labelpad=25)
    ax.set_xlabel('Level of Directness of Tutor\'s Last Question', fontsize=26, labelpad=25)
    ax.set_title('Distribution of IndirectScore Across Variants (Outliers Removed)', fontsize=28, pad=25)
    ax.tick_params(axis='both', labelsize=24)
    ax.grid(True, linestyle='--', alpha=0.7)

    fig.tight_layout()
    if output_path is not None:
        fig.savefig(output_path, dpi=300)
    plt.show()

    # Per-category summary statistics (categories as columns) for each dataset.
    for name in (dataset1_name, dataset2_name):
        print(f"\nSummary Statistics for {name} (Outliers Removed):")
        dataset_rows = filtered_df[filtered_df['Dataset'] == name]
        if dataset_rows.empty:
            print("(no scores)")
        else:
            print(dataset_rows.groupby('Category')['Score'].describe().T)

In [None]:
# Compare the two loaded score sets, labelling them by model name in the plot.
create_boxplot_no_outliers(
    dataset1=qwen,
    dataset2=codellama,
    dataset1_name='Qwen',
    dataset2_name='CodeLlama',
)