In [None]:
# Load the cleaned cohort table that every later cell works on.
# NOTE(review): the file carries an .xls extension but is parsed as CSV here —
# confirm the on-disk format matches.
cleaned_data_path = "../datasets/vte_df_cleaned.xls"
df_cleaned = pd.read_csv(cleaned_data_path, low_memory=False)

### Duplicate Check

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df_cleaned.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {num_duplicates}")

### Numeric Column Statistics

In [None]:
# Summary statistics for the numeric columns, transposed so each variable is one row.
numeric_summary = df_cleaned[numeric_cols].describe()
numeric_summary.transpose()

### Creating the Classification Outcome


In [None]:
# Composite outcome: 1 when at least one of the three post-diagnosis flags equals 1,
# otherwise 0. A missing flag never compares equal to 1, so it counts as "no event".
# The result is stored as a categorical column.
vte_flag_cols = [
    "vte_i82_after-diagnosis_binary",
    "pe_i26_after-diagnosis_binary",
    "dvt_i82.4_after-diagnosis_binary",
]
any_vte_event = df_cleaned[vte_flag_cols].eq(1).any(axis=1)
df_cleaned["any_vte_result"] = any_vte_event.astype(int).astype("category")

### Baseline and Follow-up Characteristics by VTE and Mortality Outcomes


In [None]:
# VTE:
# Summary table stratified by the composite VTE outcome. The outcome column itself is
# removed from the categorical variables so it is only used for grouping.

categorical_cols_filtered = [col for col in category_cols if col != "any_vte_result"]
# Keep only variables that actually exist in the frame.
baseline_vars = [col for col in numeric_cols + categorical_cols_filtered if col in df_cleaned.columns]
categorical_cols = [col for col in categorical_cols_filtered if col in baseline_vars]
# Create VTE/no VTE statistics table (later divided into two tables: Table 1 for baseline features and Table 2 for follow-up features)
table1 = TableOne(
    df_cleaned,
    columns=baseline_vars,
    categorical=categorical_cols,
    groupby="any_vte_result",
    pval=True
)

# Mortality:
# Same construction, stratified by death within 18 months.

categorical_cols_table2 = [col for col in category_cols if col != "death_within_18_months"]
baseline_vars2 = [col for col in numeric_cols + categorical_cols_table2 if col in df_cleaned.columns]
# Apply the same presence filter as for table1, so `categorical=` never names a column
# that is absent from `columns=` / the frame.
# NOTE(review): presumably TableOne rejects categorical names missing from the data — confirm.
categorical_cols2 = [col for col in categorical_cols_table2 if col in baseline_vars2]
# Create Death/no Death statistics table (later divided into two tables: Table 3 for baseline features and Table 4 for follow-up features)
table2 = TableOne(
    df_cleaned,
    columns=baseline_vars2,
    categorical=categorical_cols2,
    groupby="death_within_18_months",
    pval=True
)

### Missing Values and IQR EDA Check

In [None]:
# Missing values check: percentage of missing entries per column.
missing_pct = df_cleaned.isnull().sum()/df_cleaned.shape[0]*100
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# filtered result (only columns with at least one missing value) is displayed explicitly.
display(missing_pct[missing_pct > 0])

# IQR check for numeric columns: (column name, Q3 - Q1 rounded to 2 decimals).
iqr_values = [(col, round(df_cleaned[col].quantile(0.75) - df_cleaned[col].quantile(0.25),2)) for col in numeric_cols]

# End the cell on an expression so the IQR values are rendered as a table.
pd.DataFrame(iqr_values, columns=["column", "IQR"])

### Correlation Analysis

In [None]:
# Numeric VS Numeric
#
# Spearman rank correlation for every pair in `numeric_pairs`. Pairs that are both
# significant and at least moderately strong are collected and drawn as scatter plots.

SPEARMAN_ALPHA = 0.05        # a pair is kept only if its p-value is <= this
SPEARMAN_MIN_ABS_RHO = 0.5   # ... and |rho| is >= this

significant_pairs = []

for x1, x2 in numeric_pairs:
    # Pairwise deletion: use only rows where both variables are present.
    df_pair = df_cleaned[[x1, x2]].dropna()

    if df_pair.shape[0] < 3:
        continue

    with warnings.catch_warnings():
        # Warnings from this call are silenced; if the result is NaN, both comparisons
        # below are False and the pair is simply skipped.
        warnings.simplefilter("ignore")
        corr, p = stats.spearmanr(df_pair[x1], df_pair[x2])

    if p <= SPEARMAN_ALPHA and abs(corr) >= SPEARMAN_MIN_ABS_RHO:
        significant_pairs.append((x1, x2, corr, p))

n = len(significant_pairs)
cols = 2
rows = (n + cols - 1) // cols  # ceiling division: enough rows to hold n panels

if n == 0:
    # With zero rows plt.subplots would raise, and the clean-up loop would have no
    # loop index to start from — report and skip plotting instead.
    print("No numeric pairs met the correlation thresholds; nothing to plot.")
else:
    # squeeze=False always yields a 2-D array of axes, so flatten() is safe for any grid size.
    fig, ax = plt.subplots(rows, cols, figsize=(7 * cols, 5 * rows), squeeze=False)
    ax = ax.flatten()

    for i, (x1, x2, corr, p) in enumerate(significant_pairs):
        sns.scatterplot(data=df_cleaned, x=x1, y=x2, ax=ax[i])
        ax[i].set_title(f'{x1} vs {x2}\nCorr: {corr:.3f}, p-value: {p:.3f}', color='red')
        ax[i].set_xlabel("")
        ax[i].set_ylabel("")
        ax[i].tick_params(labelsize=8)

    # Remove the unused trailing panel(s) when n does not fill the grid.
    for j in range(n, len(ax)):
        fig.delaxes(ax[j])

    plt.tight_layout()
    plt.show()

# Categorical vs Categorical

def cramers_v_with_pval(x, y):
    """Cramér's V and chi-square p-value for two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    tuple of (float, float)
        ``(cramers_v, p_value)``. Both are ``nan`` when the contingency table
        has fewer than two rows or fewer than two columns (including the empty
        table produced by all-missing input), because no association can be
        measured in that case.

    Notes
    -----
    The chi-square statistic comes from ``chi2_contingency`` with its default
    settings. NOTE(review): per the SciPy docs the default applies Yates'
    continuity correction to 2x2 tables — confirm this is intended.
    """
    table = pd.crosstab(x, y)
    r, k = table.shape
    # Check the table shape BEFORE running the test: chi2_contingency raises on an
    # empty table, and V is undefined when either variable has a single level.
    if min(r, k) <= 1:
        return np.nan, np.nan
    chi2, p, _, _ = chi2_contingency(table)
    n = table.sum().sum()
    cramers_v = np.sqrt(chi2 / (n * (min(r, k) - 1)))
    return cramers_v, p

# Collect categorical pairs whose association is both strong (V > 0.5) and
# significant (p <= 0.05); V is stored rounded to 3 decimals and p to 4.
results = []

for var1, var2 in combinations(category_cols, 2):
    v, p = cramers_v_with_pval(df_cleaned[var1], df_cleaned[var2])

    # Undefined association (degenerate table): nothing to record.
    if np.isnan(v):
        continue

    is_strong = v > 0.5
    is_significant = p <= 0.05
    if is_strong and is_significant:
        results.append((var1, var2, round(v, 3), round(p, 4)))


# Numerical VS Categorical
#
# For each (numeric, categorical) pair in `num_cat_pairs`: Mann-Whitney U when the
# categorical variable has exactly two observed levels, Kruskal-Wallis otherwise.
# Pairs with p <= 0.05 are drawn as box plots. The grid has 2 x 3 = 6 panels, so the
# loop stops once six significant pairs have been plotted.

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()
plot_index = 0

for num_var, cat_var in num_cat_pairs:
    # Pairwise deletion: only rows where both variables are present.
    temp_df = df_cleaned[[num_var, cat_var]].dropna()
    cats = temp_df[cat_var].unique()
    # Covers the empty frame too (zero observed levels).
    if len(cats) < 2:
        continue

    if len(cats) == 2:
        group1 = temp_df[temp_df[cat_var] == cats[0]][num_var]
        group2 = temp_df[temp_df[cat_var] == cats[1]][num_var]
        stat, p = stats.mannwhitneyu(group1, group2)
    else:
        groups = [temp_df[temp_df[cat_var] == cat][num_var] for cat in cats]
        stat, p = stats.kruskal(*groups)

    if p <= 0.05:
        try:
            if len(cats) == 2:
                if pd.api.types.is_numeric_dtype(temp_df[cat_var]):
                    binary = temp_df[cat_var]
                else:
                    # Two-level variables stored as category/object dtype are encoded
                    # as 0/1 (sorted level order) so they get a real point-biserial
                    # correlation instead of a ratio built from the U statistic.
                    binary, _ = pd.factorize(temp_df[cat_var], sort=True)
                corr, _ = stats.pointbiserialr(binary, temp_df[num_var])
            else:
                # NOTE(review): H / (H + n - k) is used as an effect-size proxy and is
                # labelled "correlation" in the title; it is not a standard correlation
                # coefficient — confirm this is the intended measure.
                corr = round(stat / (stat + (len(temp_df) - len(cats))), 3)
        except Exception:
            # Best effort: the plot is still drawn, just without the effect size.
            # (Exception rather than a bare except, so Ctrl-C still interrupts.)
            corr = None

        # Fall back to the raw column name when no display label is registered.
        x_name = display_names.get(cat_var, cat_var)
        y_name = display_names.get(num_var, num_var)

        sns.boxplot(data=temp_df, x=cat_var, y=num_var, ax=axes[plot_index])
        axes[plot_index].set_xlabel(x_name)
        axes[plot_index].set_ylabel(y_name)
        title = f"{y_name} vs {x_name}\np = {p:.3f}"
        if corr is not None:
            title += f", correlation ≈ {corr:.2f}"
        axes[plot_index].set_title(title, color='red')
        plot_index += 1

        if plot_index >= len(axes):
            break

# Remove panels that were never used.
for i in range(plot_index, len(axes)):
    fig.delaxes(axes[i])

fig.tight_layout()
plt.show()