In [287]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [288]:
# Read the per-model prediction tables (ChatGPT first, then Mistral) from CSV.
chatgptpreds, mistralpreds = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../predicted_data/chatgpt35_with_cc.csv",
        "../predicted_data/mistralpreds_with_cc.csv",
    )
)

In [289]:
# Keep only rows whose "rating" is not the -9999 sentinel, for both models.
# A boolean mask keeps exactly the rows that the index-based drop kept, because
# both frames come straight from read_csv and so carry unique row labels.
mistralpreds = mistralpreds[mistralpreds["rating"] != -9999]
chatgptpreds = chatgptpreds[chatgptpreds["rating"] != -9999]

In [290]:
# Take the 200 rows with the largest "error" for each model.
# NOTE: the variable names say "100" for historical reasons; 200 rows are kept.
chatgpt_worst100, mistral_worst100 = (
    preds.sort_values(by="error", ascending=False).head(200)
    for preds in (chatgptpreds, mistralpreds)
)

In [291]:
# Load the test set and discard rows carrying the -9999 sentinel in "rating".
XTest = pd.read_csv("../predicted_data/Xtest.csv").loc[
    lambda frame: frame["rating"] != -9999
]

In [292]:
# Numeric columns that are summarised and plotted against the test set below.
num_cols = [
    "temperature", "heartrate", "o2sat",
    "sbp", "dbp", "resprate",
    "acuity", "age_on_adm", "rating",
]
# Categorical column compared via value counts / chi-squared further down.
cat_cols = ["arrival_transport"]

In [293]:
# Summarise the numeric columns of each worst-error subset and of the test set
# with DataFrame.describe(). Every frame gets a source suffix on its column
# names so the three summaries can later sit side by side in one table.
chatgpt_worst100_stats = chatgpt_worst100.describe().add_suffix("_chatgpt")
mistral_worst100_stats = mistral_worst100.describe().add_suffix("_mistral")
XTest_stats = XTest.describe().add_suffix("_XTest")

# For every describe() statistic x of a subset column, derive three comparisons
# against the same column of XTest:
#   <col>_<model>_p        share of XTest rows whose value is greater than x
#   <col>_<model>_t        mean of (XTest value - x)
#   <col>_<model>_percent  that mean difference divided by the XTest mean
# NOTE: despite the "_p"/"_t" names these are plain descriptive quantities, not
# the p-value or t-statistic of a hypothesis test.
for col in num_cols:
    for stats, suffix in (
        (chatgpt_worst100_stats, "_chatgpt"),
        (mistral_worst100_stats, "_mistral"),
    ):
        subset_stat = stats[col + suffix]
        stats[col + suffix + "_p"] = subset_stat.apply(
            lambda x: (XTest[col] > x).sum() / len(XTest[col])
        )
        stats[col + suffix + "_t"] = subset_stat.apply(
            lambda x: (XTest[col] - x).mean()
        )
        stats[col + suffix + "_percent"] = subset_stat.apply(
            lambda x: (XTest[col] - x).mean() / XTest[col].mean()
        )

# Join the three summaries column-wise; they share the describe() row index.
all_stats = pd.concat(
    [chatgpt_worst100_stats, mistral_worst100_stats, XTest_stats], axis=1
)

In [294]:
# Sort the columns by name so that all columns derived from the same feature
# (which share the feature name as a prefix) end up next to each other.
all_stats = all_stats.reindex(columns=sorted(all_stats.columns))

In [295]:
# Persist the worst-prediction comparison table.
all_stats.to_csv(
    path_or_buf="../predicted_data/top200_worst_predictions_stats.csv"
)

In [296]:
# Take the 200 rows with the smallest "error" for each model.
chatgpt_top200, mistral_top200 = (
    preds.sort_values(by="error", ascending=True).head(200)
    for preds in (chatgptpreds, mistralpreds)
)

In [297]:
# Summarise the numeric columns of each best-error subset with describe() and
# tag the column names with the model they belong to.
chatgpt_top200_stats = chatgpt_top200.describe()
mistral_top200_stats = mistral_top200.describe()

chatgpt_top200_stats.columns = [
    col + "_chatgpt" for col in chatgpt_top200_stats.columns
]
mistral_top200_stats.columns = [
    col + "_mistral" for col in mistral_top200_stats.columns
]

# For every describe() statistic x of a subset column, derive three comparisons
# against the same column of XTest:
#   <col>_<model>_p        share of XTest rows whose value is greater than x
#   <col>_<model>_t        mean of (XTest value - x)
#   <col>_<model>_percent  that mean difference divided by the XTest mean
# These are descriptive quantities, not hypothesis-test p-values/t-statistics.
# Looping over (frame, suffix) pairs guarantees each derived column is stored
# on the frame of the model it describes (the ChatGPT "_t" column belongs on
# chatgpt_top200_stats, not on the Mistral frame).
for col in num_cols:
    for stats, suffix in (
        (chatgpt_top200_stats, "_chatgpt"),
        (mistral_top200_stats, "_mistral"),
    ):
        subset_stat = stats[col + suffix]
        stats[col + suffix + "_p"] = subset_stat.apply(
            lambda x: (XTest[col] > x).sum() / len(XTest[col])
        )
        stats[col + suffix + "_t"] = subset_stat.apply(
            lambda x: (XTest[col] - x).mean()
        )
        # percentage of deviation from XTest
        stats[col + suffix + "_percent"] = subset_stat.apply(
            lambda x: (XTest[col] - x).mean() / XTest[col].mean()
        )

# XTest_stats is the suffixed describe() table of the test set built earlier.
all_stats_top200 = pd.concat(
    [chatgpt_top200_stats, mistral_top200_stats, XTest_stats], axis=1
)

# order by column names for easy comparison
all_stats_top200 = all_stats_top200.reindex(sorted(all_stats_top200.columns), axis=1)

In [298]:
# Persist the best-prediction comparison table.
all_stats_top200.to_csv(
    path_or_buf="../predicted_data/top200_best_predictions_stats.csv"
)

In [299]:
# Trim outliers from XTest so the plots below are easier to read.
# A row is kept when, for every filtered column, its value lies within
# [Q1 - k*IQR, Q3 + k*IQR]. "rating", "acuity" and "age_on_adm" are not filtered.
# NOTE: the filter is applied column by column, so the quartiles of later
# columns are computed on rows that already survived the earlier columns.
# NOTE: XTest itself is overwritten, so every later cell sees the trimmed frame
# and re-running this cell trims again.
k = 2
outlier_col = [
    name for name in num_cols if name not in ("rating", "acuity", "age_on_adm")
]

for col in outlier_col:
    Q1 = XTest[col].quantile(0.25)
    Q3 = XTest[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - k * IQR
    upper_fence = Q3 + k * IQR
    # between() is inclusive on both ends by default.
    XTest = XTest[XTest[col].between(lower_fence, upper_fence)]

In [314]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def compare_distributions_violinplot(
    data1,
    data2,
    columns,
    data1_label="Data 1",
    data2_label="Data 2",
    title="Distributions of numerical features",
    model_name="Model",
    order_type="Best",
):
    """Draw, save and show one violin plot per column comparing two frames.

    For every name in ``columns`` a new 10x6 figure is created with two
    violins: the values of ``data1[col]`` and of ``data2[col]``. The y axis is
    switched to log scale when the larger of the two column maxima exceeds 30.
    Each figure is written to
    ``./{model_name}_{order_type}_{col}_violinplot.png`` in the current working
    directory, shown, and then closed.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames that both contain every column listed in ``columns``. Their row
        indexes do not need to match or be unique.
    columns : iterable of str
        Column names to plot.
    data1_label, data2_label : str
        Category labels shown on the x axis for ``data1`` and ``data2``.
    title : str
        Prefix of each figure title; the column name is appended.
    model_name, order_type : str
        Used only to build the output file name.
    """
    # Set the style of seaborn
    sns.set_theme(style="whitegrid")

    for col in columns:
        # One figure per column.
        fig, ax = plt.subplots(figsize=(10, 6))

        # apply log scale if max value is greater than 30
        needs_log = max(data1[col].max(), data2[col].max()) > 30

        # Stack the two samples in long form. Values are taken positionally
        # (to_numpy) so the frames' indexes are never aligned against each
        # other: aligning would pad to the union index with NaN and would
        # raise if either index held duplicate labels.
        combined_data = pd.concat(
            [
                pd.DataFrame({"Dataset": data1_label, col: data1[col].to_numpy()}),
                pd.DataFrame({"Dataset": data2_label, col: data2[col].to_numpy()}),
            ],
            ignore_index=True,
        )

        sns.violinplot(x="Dataset", y=col, data=combined_data, ax=ax)

        if needs_log:
            ax.set_yscale("log")

        ax.set_title(f"{title}: {col}")

        # Save before showing: some backends discard the figure on show().
        fig.savefig(f"./{model_name}_{order_type}_{col}_violinplot.png")

        plt.show()

        # Release the figure so a long column list does not accumulate
        # open figures in memory.
        plt.close(fig)

In [315]:
def compare_distributions_hist(
    data1,
    data2,
    columns,
    data1_label="Data 1",
    data2_label="Data 2",
    title="Distributions of numerical features",
    figsize=(10, 30),
):
    """Show overlaid histograms of two frames, one subplot row per column.

    For every name in ``columns`` the values of ``data1[col]`` (blue) and
    ``data2[col]`` (red) are drawn with ``seaborn.histplot`` including a KDE
    curve, on a shared axis titled with the column name. All axes use a linear
    scale; no log scaling is applied here.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames that both contain every column listed in ``columns``.
    columns : sequence of str
        Column names to plot; one subplot row is created per entry.
    data1_label, data2_label : str
        Legend labels for ``data1`` and ``data2``.
    title : str
        Figure-level title.
    figsize : tuple
        Size passed to ``plt.subplots``.
    """
    # Set the style of seaborn
    sns.set_theme(style="whitegrid")

    # squeeze=False always returns a 2-D array of axes, so indexing below also
    # works when only one column is requested (plt.subplots would otherwise
    # return a bare Axes object that cannot be subscripted).
    fig, axes = plt.subplots(
        nrows=len(columns), ncols=1, figsize=figsize, squeeze=False
    )
    axes = axes[:, 0]

    # Adjust the spacing between subplots
    fig.tight_layout(pad=5.0)

    # Set the title of the figure
    fig.suptitle(title, y=1.02)

    for i, col in enumerate(columns):
        sns.histplot(data1[col], ax=axes[i], label=data1_label, color="blue", kde=True)
        sns.histplot(data2[col], ax=axes[i], label=data2_label, color="red", kde=True)
        axes[i].set_title(col)
        axes[i].legend()

    # Show the plot
    plt.show()

In [None]:
# One violin plot per numeric feature: Mistral's worst-error predictions vs the
# test data. The subset size in the title is taken from the frame itself so the
# label cannot drift from the number of rows actually selected.
compare_distributions_violinplot(
    mistral_worst100,
    XTest,
    num_cols,
    data1_label="Mistral",
    data2_label="Test Data",
    title=f"Distributions of numerical features for Mistral worst {len(mistral_worst100)} predictions and test data",
    model_name="Mistral",
    order_type="worst",
)

In [None]:
# One violin plot per numeric feature: ChatGPT's worst-error predictions vs the
# test data. The subset size in the title is taken from the frame itself so the
# label cannot drift from the number of rows actually selected.
compare_distributions_violinplot(
    chatgpt_worst100,
    XTest,
    num_cols,
    data1_label="ChatGPT",
    data2_label="Test Data",
    title=f"Distributions of numerical features for ChatGPT worst {len(chatgpt_worst100)} predictions and test data",
    model_name="ChatGPT",
    order_type="worst",
)

In [None]:
# The 200 lowest-"error" rows per model, used for the "best" violin plots.
chatgpt_best, mistral_best = (
    preds.sort_values(by="error", ascending=True).head(200)
    for preds in (chatgptpreds, mistralpreds)
)

In [None]:
# One violin plot per numeric feature: Mistral's lowest-error predictions vs the
# test data. The subset size in the title is taken from the frame itself.
compare_distributions_violinplot(
    mistral_best,
    XTest,
    num_cols,
    data1_label="Mistral",
    data2_label="Test Data",
    title=f"Distributions of numerical features for Mistral best {len(mistral_best)} predictions and test data",
    model_name="Mistral",
    order_type="best",
)

In [None]:
# One violin plot per numeric feature: ChatGPT's lowest-error predictions vs the
# test data. The axis label and title name ChatGPT because the plotted frame is
# chatgpt_best; the subset size in the title is taken from the frame itself.
compare_distributions_violinplot(
    chatgpt_best,
    XTest,
    num_cols,
    data1_label="ChatGPT",
    data2_label="Test Data",
    title=f"Distributions of numerical features for ChatGPT best {len(chatgpt_best)} predictions and test data",
    model_name="ChatGPT",
    order_type="best",
)

In [None]:
# compare the arrival_transport distributions with chi squared test
from scipy.stats import chi2_contingency

# Category counts of arrival_transport in the test data.
# NOTE(review): XTest has been overwritten by the IQR outlier filter above, so
# these counts describe the trimmed test set - confirm that is intended.
XTest_counts = XTest["arrival_transport"].value_counts()

# Mistral: counts in the worst-error subset, a two-column contingency table
# (subset vs test data, categories missing on one side filled with 0), and the
# chi-squared statistic with its p-value.
mistral_worst100_counts = mistral_worst100["arrival_transport"].value_counts()
contingency_table_mistral = pd.concat(
    [mistral_worst100_counts, XTest_counts], axis=1, keys=["mistral", "XTest"]
).fillna(0)
chi2_mistral, p_mistral = chi2_contingency(contingency_table_mistral)[:2]

# ChatGPT: same procedure.
# NOTE(review): the subsets appear to be drawn from the same cases as XTest, so
# the two columns of each table are presumably not independent samples -
# interpret the p-values with care.
chatgpt_worst100_counts = chatgpt_worst100["arrival_transport"].value_counts()
contingency_table_chatgpt = pd.concat(
    [chatgpt_worst100_counts, XTest_counts], axis=1, keys=["chatgpt", "XTest"]
).fillna(0)
chi2_chatgpt, p_chatgpt = chi2_contingency(contingency_table_chatgpt)[:2]

In [None]:
# Report the chi-squared results; the subset size in each label is read from
# the frame so it always matches the number of rows that were tested.
print(
    f"Chi-squared test for Mistral worst {len(mistral_worst100)} predictions: chi2={chi2_mistral}, p={p_mistral}"
)
print(
    f"Chi-squared test for ChatGPT worst {len(chatgpt_worst100)} predictions: chi2={chi2_chatgpt}, p={p_chatgpt}"
)

In [None]:
# Print the share of each arrival_transport category (count / number of rows)
# for Mistral, ChatGPT and XTest. The subset size in each header is read from
# the frame so it always matches the rows actually selected.
print(f"Mistral worst {len(mistral_worst100)} predictions:")
print(mistral_worst100_counts / len(mistral_worst100))

print(f"ChatGPT worst {len(chatgpt_worst100)} predictions:")
print(chatgpt_worst100_counts / len(chatgpt_worst100))

print("X_Test")
print(XTest_counts / len(XTest))

In [None]:
# Chi-squared comparison of arrival_transport between each model's 200
# lowest-error rows and the test data (XTest_counts comes from the cell above).

# Mistral: counts, contingency table (missing categories filled with 0), test.
mistral_top200_counts = mistral_top200["arrival_transport"].value_counts()
contingency_table_mistral_top200 = pd.concat(
    [mistral_top200_counts, XTest_counts], axis=1, keys=["mistral", "XTest"]
).fillna(0)
chi2_mistral_top200, p_mistral_top200 = chi2_contingency(
    contingency_table_mistral_top200
)[:2]

# ChatGPT: same procedure.
chatgpt_top200_counts = chatgpt_top200["arrival_transport"].value_counts()
contingency_table_chatgpt_top200 = pd.concat(
    [chatgpt_top200_counts, XTest_counts], axis=1, keys=["chatgpt", "XTest"]
).fillna(0)
chi2_chatgpt_top200, p_chatgpt_top200 = chi2_contingency(
    contingency_table_chatgpt_top200
)[:2]

print(
    f"Chi-squared test for Mistral top 200 predictions: chi2={chi2_mistral_top200}, p={p_mistral_top200}"
)
print(
    f"Chi-squared test for ChatGPT top 200 predictions: chi2={chi2_chatgpt_top200}, p={p_chatgpt_top200}"
)

In [None]:
# Print the share of each arrival_transport category (count / number of rows)
# for the two best-prediction subsets and for XTest, each followed by a blank gap.
for header, shares in (
    ("Mistral top 200 predictions:", mistral_top200_counts / len(mistral_top200)),
    ("ChatGPT top 200 predictions:", chatgpt_top200_counts / len(chatgpt_top200)),
    ("X_Test", XTest_counts / len(XTest)),
):
    print(header)
    print(shares)
    print("\n")

In [None]:
# Check the average length (in characters) of chiefcomplaint and compare.
# The vectorised .str accessor returns NaN for a missing complaint instead of
# raising like len() would, and mean() then skips those rows.
chatgpt_top200["chiefcomplaint_len"] = chatgpt_top200["chiefcomplaint"].str.len()
mistral_top200["chiefcomplaint_len"] = mistral_top200["chiefcomplaint"].str.len()

print(
    f"ChatGPT top 200 average chiefcomplaint length: {chatgpt_top200['chiefcomplaint_len'].mean()}"
)
print(
    f"Mistral top 200 average chiefcomplaint length: {mistral_top200['chiefcomplaint_len'].mean()}"
)

In [None]:
# Get the average whitespace-separated word count of chiefcomplaint and compare.
# .str.split() with no pattern splits on runs of whitespace like str.split();
# .str.len() then counts the tokens. Missing complaints yield NaN instead of
# raising, and mean() skips them.
chatgpt_top200["chiefcomplaint_word_count"] = (
    chatgpt_top200["chiefcomplaint"].str.split().str.len()
)
mistral_top200["chiefcomplaint_word_count"] = (
    mistral_top200["chiefcomplaint"].str.split().str.len()
)

print(
    f"ChatGPT top 200 average chiefcomplaint word count: {chatgpt_top200['chiefcomplaint_word_count'].mean()}"
)
print(
    f"Mistral top 200 average chiefcomplaint word count: {mistral_top200['chiefcomplaint_word_count'].mean()}"
)

In [None]:
# Average chiefcomplaint length (in characters) for the worst-error subsets.
# The vectorised .str accessor returns NaN for a missing complaint instead of
# raising like len() would, and mean() then skips those rows. The subset size
# in each label is read from the frame so it matches the rows actually selected.
chatgpt_worst100["chiefcomplaint_len"] = chatgpt_worst100["chiefcomplaint"].str.len()
mistral_worst100["chiefcomplaint_len"] = mistral_worst100["chiefcomplaint"].str.len()

print(
    f"ChatGPT worst {len(chatgpt_worst100)} average chiefcomplaint length: {chatgpt_worst100['chiefcomplaint_len'].mean()}"
)
print(
    f"Mistral worst {len(mistral_worst100)} average chiefcomplaint length: {mistral_worst100['chiefcomplaint_len'].mean()}"
)

In [None]:
# Average whitespace-separated word count of chiefcomplaint for the worst-error
# subsets. .str.split() with no pattern splits on runs of whitespace like
# str.split(); .str.len() counts the tokens. Missing complaints yield NaN
# instead of raising. The subset size in each label is read from the frame.
chatgpt_worst100["chiefcomplaint_word_count"] = (
    chatgpt_worst100["chiefcomplaint"].str.split().str.len()
)
mistral_worst100["chiefcomplaint_word_count"] = (
    mistral_worst100["chiefcomplaint"].str.split().str.len()
)

print(
    f"ChatGPT worst {len(chatgpt_worst100)} average chiefcomplaint word count: {chatgpt_worst100['chiefcomplaint_word_count'].mean()}"
)
print(
    f"Mistral worst {len(mistral_worst100)} average chiefcomplaint word count: {mistral_worst100['chiefcomplaint_word_count'].mean()}"
)