In [None]:
import pandas as pd
import os
# Load the per-sample evaluation scores. The frame must contain a 'Model'
# column, which is used as the grouping key below.
file_path = 'combined_data.csv'
data = pd.read_csv(file_path)

# Average every numeric column per model. numeric_only=True is passed
# explicitly: since pandas 2.0 the default no longer silently drops
# non-numeric columns, so a CSV with any text column would raise TypeError.
mean_scores = data.groupby('Model').mean(numeric_only=True).reset_index()

# Persist the per-model means (one row per model) next to the notebook.
mean_scores.to_csv('mean_scores_model_wise.csv', index=False)

print(mean_scores)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-model mean scores, hard-coded here (per the original note these values
# were taken from the mean_scores DataFrame). Each list is ordered like 'Model'.
data = {
    'Model': ['GPT-4 only (no RAG)', 'MedCPT w GPT-4', 'OpenAI based RAG'],
    'faithfulness': [0.025, 0.800, 0.625],
    'answer_correctness': [0.247741, 0.393940, 0.335267],
    'context_recall': [0.00, 0.60, 0.55],
    'context_precision': [0.00, 0.75, 0.80],
    'answer_relevancy': [0.804695, 0.778907, 0.703376],
    'ROUGE-1': [0.160930, 0.371680, 0.270215],
    'ROUGE-2': [0.049090, 0.226840, 0.200945],
    'ROUGE-L': [0.160930, 0.365430, 0.270215],
    'BLEU': [0.009360, 0.015430, 0.037955]
}

# Build the frame and index it by model name in one chained step
mean_scores = pd.DataFrame(data).set_index('Model')

# Bar colours come from matplotlib's 'Accent' colormap
colors = plt.cm.Accent.colors

# Transpose so the metrics sit on the x-axis and each model gets its own bar
ax = mean_scores.T.plot(kind='bar', figsize=(12, 6), width=0.8, color=colors)
fig = ax.get_figure()

# Title and axis labels
ax.set_title('Mean Scores by Metric and Model', fontsize=14)
ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Mean Scores', fontsize=12)

# Slanted, right-aligned tick labels so the metric names stay readable
plt.xticks(rotation=45, ha='right', fontsize=10)
ax.legend(title='Model', fontsize=10)
fig.tight_layout()

# Write the figure to disk.
# NOTE(review): the file name still says 'tab10' although the Accent palette
# is used; the path is kept unchanged so existing references keep working.
plot_path = 'mean_scores_by_metric_and_model_with_tab10_palette.png'
fig.savefig(plot_path, dpi=300)

# Render the figure in the notebook
plt.show()


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the data
file_path = 'combined_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Keep only the two models being compared
filtered_data = data[data['Model'].isin(['OpenAI based RAG', 'MedCPT w GPT-4'])]

# Split the rows per model once, instead of re-filtering inside every loop
# iteration (the mask does not depend on the metric).
openai_rag_rows = filtered_data[filtered_data['Model'] == 'OpenAI based RAG']
medcpt_rows = filtered_data[filtered_data['Model'] == 'MedCPT w GPT-4']

# Identify metric columns: everything except 'Model', restricted to
# numeric/bool dtypes so that a stray text or ID column in the CSV cannot
# break (or silently distort) the K-S test and the ECDF plots below.
metric_columns = (
    filtered_data.drop(columns='Model')
    .select_dtypes(include=['number', 'bool'])
    .columns.tolist()
)

# Step 2: Initialize results storage (one dict per metric)
ks_test_results = []

# Step 3: Perform the two-sample K-S test for each metric
for metric in metric_columns:
    # Scores of each model for this metric, with missing values removed
    group1 = openai_rag_rows[metric].dropna()
    group2 = medcpt_rows[metric].dropna()

    # Only run the test when both groups have more than one observation;
    # otherwise record NaN so the metric still appears in the results table.
    if len(group1) > 1 and len(group2) > 1:
        ks_stat, ks_p = ks_2samp(group1, group2)
    else:
        ks_stat, ks_p = (np.nan, np.nan)

    # Store results
    ks_test_results.append({
        'Metric': metric,
        'K-S Statistic': ks_stat,
        'P-Value': ks_p
    })

# Step 4: Save results to a DataFrame and CSV
ks_results_df = pd.DataFrame(ks_test_results)
ks_results_df.to_csv('ks_test_results.csv', index=False)
print(ks_results_df)

# Step 5: Visualization (ECDF for Distribution Comparison), one figure per metric
for metric in metric_columns:
    plt.figure(figsize=(10, 6))
    sns.ecdfplot(openai_rag_rows[metric].dropna(),
                 color='blue', label='OpenAI based RAG')
    sns.ecdfplot(medcpt_rows[metric].dropna(),
                 color='orange', label='MedCPT w GPT-4')
    plt.title(f'ECDF of {metric}')
    plt.xlabel(metric)
    plt.ylabel('ECDF')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the data
file_path = 'combined_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Keep only the two models being compared
filtered_data = data[data['Model'].isin(['OpenAI based RAG', 'MedCPT w GPT-4'])]

# Split the rows per model once, instead of re-filtering inside every loop
# iteration (the mask does not depend on the metric).
openai_rag_rows = filtered_data[filtered_data['Model'] == 'OpenAI based RAG']
medcpt_rows = filtered_data[filtered_data['Model'] == 'MedCPT w GPT-4']

# Identify metric columns: everything except 'Model', restricted to
# numeric/bool dtypes so that a stray text or ID column in the CSV cannot
# break the Shapiro-Wilk test or the histograms below.
metric_columns = (
    filtered_data.drop(columns='Model')
    .select_dtypes(include=['number', 'bool'])
    .columns.tolist()
)

# Step 2: Initialize results storage (two dicts per metric, one per model)
shapiro_results = []

# Step 3: Perform Shapiro-Wilk test for normality for each metric
for metric in metric_columns:
    # Scores of each model for this metric, with missing values removed
    group1 = openai_rag_rows[metric].dropna()
    group2 = medcpt_rows[metric].dropna()

    # The test is only run with at least 3 observations; smaller groups are
    # recorded as NaN so they still appear in the results table.
    shapiro_g1_stat, shapiro_g1_p = shapiro(group1) if len(group1) >= 3 else (np.nan, np.nan)
    shapiro_g2_stat, shapiro_g2_p = shapiro(group2) if len(group2) >= 3 else (np.nan, np.nan)

    # Store results
    shapiro_results.append({
        'Metric': metric,
        'Model': 'OpenAI based RAG',
        'Shapiro-Wilk Statistic': shapiro_g1_stat,
        'P-Value': shapiro_g1_p
    })
    shapiro_results.append({
        'Metric': metric,
        'Model': 'MedCPT w GPT-4',
        'Shapiro-Wilk Statistic': shapiro_g2_stat,
        'P-Value': shapiro_g2_p
    })

# Step 4: Save results to a DataFrame and CSV
shapiro_df = pd.DataFrame(shapiro_results)
shapiro_df.to_csv('shapiro_wilk_results.csv', index=False)
print(shapiro_df)

# Step 5: Visualization (Histograms for Normality Check), one figure per metric
for metric in metric_columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(openai_rag_rows[metric].dropna(),
                 kde=True, color='blue', label='OpenAI based RAG', bins=20)
    sns.histplot(medcpt_rows[metric].dropna(),
                 kde=True, color='orange', label='MedCPT w GPT-4', bins=20)
    plt.title(f'Histogram of {metric}')
    plt.xlabel(metric)
    plt.ylabel('Frequency')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu, kruskal, ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load your data
data = pd.read_csv('combined_data.csv')

# Keep only the two models being compared
filtered_data = data[data['Model'].isin(['OpenAI based RAG', 'MedCPT w GPT-4'])]

# Metrics to compare between the two models
metrics = ['faithfulness', 'answer_relevancy', 'BLEU']

# Mann-Whitney U Test for non-parametric comparison between two groups
for metric in metrics:
    group1 = filtered_data[filtered_data['Model'] == 'OpenAI based RAG'][metric].dropna()
    group2 = filtered_data[filtered_data['Model'] == 'MedCPT w GPT-4'][metric].dropna()
    if len(group1) > 0 and len(group2) > 0:
        # alternative is spelled out because the default changed across SciPy
        # releases; an explicit two-sided test gives the same p-value everywhere.
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
    else:
        # An empty group cannot be tested; report NaN instead of raising.
        stat, p = (float('nan'), float('nan'))
    print(f'Mann-Whitney U Test for {metric}: Statistic={stat}, p-value={p}')

# ANOVA on an OLS fit with a heteroscedasticity-consistent (HC3) covariance.
# This is NOT Welch's ANOVA, so the printed label says what is computed.
# For a true Welch test on these two groups, use
# scipy.stats.ttest_ind(group1, group2, equal_var=False).
for metric in metrics:
    # Q("...") quotes the column name so metrics that are not valid Python
    # identifiers (e.g. 'ROUGE-1') are not parsed as arithmetic by the formula.
    model = ols(f'Q("{metric}") ~ Model', data=filtered_data).fit()
    # Variable name kept as `welch_anova` for compatibility with later cells.
    welch_anova = sm.stats.anova_lm(model, typ=2, robust='hc3')
    print(f'HC3-robust ANOVA for {metric}:\n', welch_anova)
