In [1]:
import pandas as pd

In [2]:
# Load the four per-metric result tables; each one supplies its own "<metric>_mean" column below.
csmpo_df = pd.read_csv("csmpo_df.csv")
csmso_df = pd.read_csv("csmso_df.csv")
mipo_df = pd.read_csv("mipo_df.csv")
miso_df = pd.read_csv("miso_df.csv")

# Query descriptors are taken from the CSMPO table only; its unnamed first
# column is exposed as "query_id".
descriptor_columns = ["Unnamed: 0", "query", "market_type", "query_level", "product"]
query_info = csmpo_df[descriptor_columns].rename(columns={"Unnamed: 0": "query_id"})

# The metric columns are attached as raw arrays (``.values``), so rows are
# paired by position rather than by index label.
# NOTE(review): this presumes all four CSVs list the queries in the same order -- confirm.
metric_columns = {
    "csmpo_mean": csmpo_df["csmpo_mean"].values,
    "csmso_mean": csmso_df["csmso_mean"].values,
    "mipo_mean": mipo_df["mipo_mean"].values,
    "miso_mean": miso_df["miso_mean"].values,
}

final_df = query_info.assign(**metric_columns)

In [3]:
# Display the assembled table (query descriptors plus the four *_mean metric columns).
final_df

Unnamed: 0,query_id,query,market_type,query_level,product,csmpo_mean,csmso_mean,mipo_mean,miso_mean
0,0,What is the best smartphone?,Commodity / Saturated,General,Smartphone,0.631322,0.722222,0.740741,1.0
1,1,What is the best smartphone under 1000 dollars?,Commodity / Saturated,General & Price,Smartphone,0.58627,0.333333,0.791667,0.25
2,2,What is the best smartphone under 1000 dollars...,Commodity / Saturated,General & Price & Feature,Smartphone,0.51654,0.0,0.875,0.333333
3,3,What is the best smartphone under 1000 dollars...,Commodity / Saturated,General & Price & Feature & Feature,Smartphone,0.404563,0.0,0.583333,0.0
4,4,What is the best laptop?,Commodity / Saturated,General,Laptop,0.236019,0.577778,0.359259,0.666667
5,5,What is the best laptop under 1500 dollars?,Commodity / Saturated,General & Price,Laptop,0.063828,0.0,0.125,0.5
6,6,What is the best laptop under 1500 dollars wit...,Commodity / Saturated,General & Price & Feature,Laptop,0.055556,0.0,0.333333,0.833333
7,7,What is the best laptop under 1500 dollars wit...,Commodity / Saturated,General & Price & Feature & Feature,Laptop,0.080556,0.0,0.291667,0.166667
8,8,What is the best smartwatch?,Commodity / Saturated,General,Smartwatch,0.536111,0.55,0.791667,0.666667
9,9,What is the best smartwatch under 800 dollars?,Commodity / Saturated,General & Price,Smartwatch,0.091852,0.0,0.708333,1.0


In [4]:
from scipy.stats import kruskal

def _non_empty_groups(results_df, grouping, metric):
    """Return the non-null ``metric`` values for each level of ``grouping``, skipping empty levels."""
    samples = []
    for level in results_df[grouping].unique():
        values = results_df.loc[results_df[grouping] == level, metric].dropna()
        if len(values) > 0:
            samples.append(values)
    return samples


def _summary_row(metric, grouping, H, p, alpha):
    """Format one Kruskal-Wallis result as a display-ready row of strings."""
    if pd.isna(p):
        # An undefined test must not fall through the ``p >= 0.001`` comparison,
        # which is False for NaN and would mislabel the result as "< 0.001".
        h_text = p_text = verdict = "n/a"
    else:
        h_text = f"{H:.2f}"
        p_text = f"{p:.3f}" if p >= 0.001 else "< 0.001"
        verdict = "Yes*" if p < alpha else "No"

    return {
        "Metric": metric,
        "Comparison": grouping.replace("_", " ").title(),
        "H-statistic": h_text,
        "p-value": p_text,
        "Significant": verdict,
    }


def create_statistical_summary_table(results_df, metrics=None, groupings=None, alpha=0.05):
    """Run a Kruskal-Wallis test for every (metric, grouping) pair and tabulate the results.

    For each pair, the rows of ``results_df`` are split by the distinct values
    of the grouping column, missing metric values are dropped, and the
    resulting samples are compared with ``scipy.stats.kruskal``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain every column named in ``metrics`` and ``groupings``.
    metrics : sequence of str, optional
        Metric columns to test. Defaults to ``csmpo_mean``, ``csmso_mean``,
        ``mipo_mean`` and ``miso_mean``.
    groupings : sequence of str, optional
        Grouping columns to compare across. Defaults to ``market_type`` and
        ``query_level``.
    alpha : float, optional
        Threshold below which a p-value is flagged ``"Yes*"``. Default 0.05.

    Returns
    -------
    pandas.DataFrame
        One row per tested pair with the string columns ``Metric``,
        ``Comparison``, ``H-statistic``, ``p-value`` and ``Significant``.
        The columns are present even when no pair could be tested.

    Notes
    -----
    * Pairs with fewer than two non-empty groups are omitted.
    * If every observation of a metric is identical the test is undefined;
      such pairs are reported with ``"n/a"`` instead of raising.
    * The p-values are not adjusted for multiple comparisons.
    """
    if metrics is None:
        metrics = ["csmpo_mean", "csmso_mean", "mipo_mean", "miso_mean"]
    if groupings is None:
        groupings = ["market_type", "query_level"]

    summary_columns = ["Metric", "Comparison", "H-statistic", "p-value", "Significant"]
    summary_data = []

    for metric in metrics:
        for grouping in groupings:
            groups = _non_empty_groups(results_df, grouping, metric)

            # The test needs at least two samples to compare.
            if len(groups) < 2:
                continue

            if pd.concat(groups).nunique() <= 1:
                # All observations tie, so the statistic is undefined; record
                # that explicitly rather than letting one constant metric
                # abort the whole table.
                H = p = float("nan")
            else:
                H, p = kruskal(*groups)

            summary_data.append(_summary_row(metric, grouping, H, p, alpha))

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(summary_data, columns=summary_columns)

In [5]:
# Kruskal-Wallis summary of each metric across market type and query level.
create_statistical_summary_table(final_df)

Unnamed: 0,Metric,Comparison,H-statistic,p-value,Significant
0,csmpo_mean,Market Type,2.14,0.343,No
1,csmpo_mean,Query Level,9.4,0.024,Yes*
2,csmso_mean,Market Type,4.98,0.083,No
3,csmso_mean,Query Level,15.66,0.001,Yes*
4,mipo_mean,Market Type,3.23,0.199,No
5,mipo_mean,Query Level,8.0,0.046,Yes*
6,miso_mean,Market Type,1.09,0.579,No
7,miso_mean,Query Level,15.62,0.001,Yes*
