# EDA of diabetes_012_health_indicators_BRFSS2015

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Relative path of the raw BRFSS 2015 diabetes health-indicator CSV.
raw_csv_path = "../data/raw/diabetes_012_health_indicators_BRFSS2015.csv"
df_raw = pd.read_csv(raw_csv_path)

## First glance at raw data

In [None]:
# (rows, columns) of the raw frame.
raw_shape = df_raw.shape
display(raw_shape)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in display() only appends a stray "None" to the cell output.
df_raw.info()

#### Observation
* There are no missing values.
* All columns contain numbers, although some should be interpreted as categories.

In [None]:
# Preview the first rows of the frame, `step` columns at a time.
step = 8
n_columns = len(df_raw.columns)

for start in range(0, n_columns, step):
    column_chunk = df_raw.iloc[:, start : start + step]
    display(column_chunk.head())

In [None]:
# Summary statistics for every numeric column of the raw frame.
numeric_raw = df_raw.select_dtypes(include="number")
display(numeric_raw.describe())

#### Observation
* `BMI` has a high maximum. Possibly an outlier.
* `MentHlth` and `PhysHlth` are positively (right) skewed. Both have more than 50% zeroes.

## Converted to intended datatype

### Data Dictionary – Diabetes Health Indicators

| Column                | Datatype   | Description                                                                 |
|-----------------------|------------|-----------------------------------------------------------------------------|
| `Diabetes_012`        | nominal    | 0 = no diabetes, 1 = prediabetes, 2 = diabetes                              |
| `HighBP`              | nominal    | 0 = no high blood pressure, 1 = high blood pressure                         |
| `HighChol`            | nominal    | 0 = no high cholesterol, 1 = high cholesterol                               |
| `CholCheck`           | nominal    | 0 = no cholesterol check in 5 years, 1 = yes                                |
| `BMI`                 | float      | Body Mass Index                                                             |
| `Smoker`              | nominal    | Smoked ≥100 cigarettes in lifetime: 0 = no, 1 = yes                         |
| `Stroke`              | nominal    | Ever told had a stroke: 0 = no, 1 = yes                                     |
| `HeartDiseaseorAttack`| nominal    | CHD or MI history: 0 = no, 1 = yes                                          |
| `PhysActivity`        | nominal    | Physical activity (last 30 days, non-job): 0 = no, 1 = yes                  |
| `Fruits`              | nominal    | Consumes fruit ≥1×/day: 0 = no, 1 = yes                                     |
| `Veggies`             | nominal    | Consumes vegetables ≥1×/day: 0 = no, 1 = yes                                |
| `HvyAlcoholConsump`   | nominal    | Heavy drinker: 0 = no, 1 = yes                                              |
| `AnyHealthcare`       | nominal    | Has health coverage: 0 = no, 1 = yes                                        |
| `NoDocbcCost`         | nominal    | Missed doctor due to cost (last 12 months): 0 = no, 1 = yes                 |
| `GenHlth`             | ordinal    | General health: 1 = excellent, ..., 5 = poor                                |
| `MentHlth`            | int        | Days of poor mental health (last 30 days), 0–30                             |
| `PhysHlth`            | int        | Days of poor physical health (last 30 days), 0–30                           |
| `DiffWalk`            | nominal    | Serious difficulty walking/stairs: 0 = no, 1 = yes                          |
| `Sex`                 | nominal    | 0 = female, 1 = male                                                        |
| `Age`                 | ordinal    | Age category: 1 = 18–24, ..., 13 = 80+                                      |
| `Education`           | ordinal    | Education level: 1 = none to kindergarten, ..., 6 = college grad           |
| `Income`              | ordinal    | Income level: 1 = < $10k, ..., 8 = ≥ $75k                                   |



### Conversion

In [None]:
# Columns whose numbers are class codes rather than measurements
# (the nominal and ordinal entries of the data dictionary).
cat_cols = [
    "Diabetes_012",  # target
    "HighBP",
    "HighChol",
    "CholCheck",
    "Smoker",
    "Stroke",
    "HeartDiseaseorAttack",
    "PhysActivity",
    "Fruits",
    "Veggies",
    "HvyAlcoholConsump",
    "AnyHealthcare",
    "NoDocbcCost",
    "GenHlth",  # ordinal
    "DiffWalk",
    "Sex",
    "Age",  # ordinal
    "Education",  # ordinal
    "Income",  # ordinal
]

# Cast to int before category so the category labels are integers
# (presumably the CSV stores these codes as floats -- confirm against the raw file).
for col in cat_cols:
    df_raw[col] = df_raw[col].astype("int").astype("category")

### Info, Describe, Overview

In [None]:
# Re-check the column dtypes now that the categorical columns have been converted.
df_raw.info()

In [None]:
# Summary statistics for the columns that are still numeric after the conversion.
numeric_converted = df_raw.select_dtypes(include="number")
display(numeric_converted.describe())

In [None]:
def normalized_entropy_cat(series: pd.Series) -> float:
    """
    Compute the normalized Shannon entropy of a categorical distribution.

    Only classes that actually occur in ``series`` are taken into account.
    A pandas categorical reports declared-but-unused categories with a
    relative frequency of 0, and ``0 * log2(0)`` evaluates to NaN, so those
    entries are dropped before the entropy is computed.

    Returns 0.0 if at most one class is present and 1.0 for a perfectly
    uniform distribution over the observed classes.
    """
    probs = series.value_counts(normalize=True)
    probs = probs[probs > 0]
    if len(probs) <= 1:
        # Also avoids dividing by log2(1) == 0 and returning -0.0.
        return 0.0
    entropy = -np.sum(probs * np.log2(probs))
    return float(entropy / np.log2(len(probs)))


def overview(df):
    """
    Display a per-column overview of the DataFrame including data types, counts, missing values,
    unique values, and some basic statistics.

    The resulting table has one row per column of ``df`` and these columns:

    * ``dtype``, ``total`` (non-null count), ``missing``, ``missing%``
    * ``n_uniques``, ``uniques%`` (relative to the row count), ``uniques`` (sorted values)
    * ``non-numeric``: values that ``pd.to_numeric`` cannot parse (includes missing values)
    * ``dev from mean``: (mean - min, max - mean) in standard deviations, numeric columns only
    * ``most/least freq``: ``{value: count}`` of the most and least frequent value,
      non-numeric columns only
    * ``norm entropy``: normalized Shannon entropy, categorical columns only

    The table is rendered with ``display``; nothing is returned.
    """
    from pandas.api.types import is_numeric_dtype

    def _non_numeric_values(col: pd.Series) -> list:
        """Unique entries of ``col`` that cannot be coerced to a number."""
        return list(col[pd.to_numeric(col, errors="coerce").isna()].unique())

    def _dev_from_mean(col: pd.Series):
        """Distance of min and max from the mean, in standard deviations."""
        if not is_numeric_dtype(col):
            return pd.NA
        mean, std = col.mean(), col.std()
        return (
            round((mean - col.min()) / std, 1),
            round((col.max() - mean) / std, 1),
        )

    def _freq_extremes(col: pd.Series):
        """Most and least frequent value of a non-numeric column with their counts."""
        if is_numeric_dtype(col):
            return pd.NA
        # value_counts() sorts by frequency, so the extremes sit at both ends.
        freq = col.value_counts()
        if freq.empty:
            return pd.NA
        return {freq.index[pos]: int(freq.iloc[pos]) for pos in (0, -1)}

    def _norm_entropy(col: pd.Series):
        """Rounded normalized entropy for categorical columns, NA otherwise."""
        if isinstance(col.dtype, pd.CategoricalDtype):
            return round(normalized_entropy_cat(col), 2)
        return pd.NA

    is_missing = df.isna()
    n_unique = df.nunique()
    columns = [df[name] for name in df.columns]

    display(
        pd.DataFrame(
            {
                "dtype": df.dtypes,
                "total": df.count(),
                "missing": is_missing.sum(),
                "missing%": is_missing.mean() * 100,
                "n_uniques": n_unique,
                "uniques%": n_unique / df.shape[0] * 100,
                "uniques": [sorted(col.unique()) for col in columns],
                "non-numeric": [_non_numeric_values(col) for col in columns],
                "dev from mean": [_dev_from_mean(col) for col in columns],
                "most/least freq": [_freq_extremes(col) for col in columns],
                "norm entropy": [_norm_entropy(col) for col in columns],
            }
        )
    )


# Render the per-column summary table for the converted frame.
overview(df_raw)

#### Observation
* `GenHlth`, `Age`, `Education`, `Income`, `Diabetes_012` are categories with more than two values. All other categories are binary (0/1).
* The target `Diabetes_012` is heavily imbalanced.
* `BMI` contains values further than 10 standard deviations from the mean. Check for outliers.
* `HighBP`, `HighChol`, `Smoker`, `PhysActivity`, `Fruits`, `Veggies`, `GenHlth`, `DiffWalk`, `Sex`, `Age`, `Education`, `Income` are no more than slightly unbalanced, all other categories are highly unbalanced.

### Crosstabs

#### Observation
* Most binary categories show a difference of more than 5 percentage points in the share of healthy people between their two values.
* Ordinal categories (`GenHlth`, `Age`, `Education`, `Income`) suggest a correlation between increasing values and state of health.

In [None]:
%%time

# One stacked bar chart per categorical feature, showing how the three
# Diabetes_012 classes are distributed within each value of that feature.
categorical_features = df_raw.select_dtypes(include=["object", "category"]).columns.tolist()

# Work on a copy that holds only the categorical columns (target included).
df_raw_copy = df_raw[categorical_features].copy()

# The target is the crosstab column, not a feature to plot.
categorical_features = [name for name in categorical_features if name != "Diabetes_012"]
feature_ct = len(categorical_features)

fig, axs = plt.subplots(feature_ct, 1, figsize=(14, 2 * feature_ct))
# With a single row plt.subplots returns a bare Axes; wrap it so zip() works.
axs = [axs] if feature_ct == 1 else axs

for ax, feature in zip(axs, categorical_features):
    ct = pd.crosstab(index=df_raw_copy[feature], columns=df_raw_copy["Diabetes_012"])

    # Normalise every row to shares; an empty row would give NaN, shown as 0.
    row_totals = ct.sum(axis=1)
    ct_ratio = ct.div(row_totals, axis=0).fillna(0)

    ct_ratio.plot(kind="barh", stacked=True, ax=ax, legend=False)

    # Print the share of class 0 (no diabetes) to the right of each bar.
    for bar_pos, (_, shares) in enumerate(ct_ratio.iterrows()):
        healthy_share = shares.get(0, 0)
        ax.text(1.01, bar_pos, f"{healthy_share:.1%}", va="center", fontsize=8)

    ax.set(
        xlabel="ratio",
        ylabel=feature,
        title=f"ratio of Diabetes_012 in {feature}",
    )
    ax.legend(title="Diabetes_012", loc="lower left")

fig.tight_layout()
plt.show()

# The working copy is no longer needed.
del df_raw_copy

### Pairplots

#### Observation
* No relevant differences are visible between the pairplots
* `MentHlth` and `PhysHlth` have more entries on day counts divisible by 5. Possibly just respondents rounding their estimates to "nice" numbers.

#### All columns

In [None]:
%%time
# Pairwise plots for the whole frame, coloured by the target class.
scatter_style = {"alpha": 0.2}
sns.pairplot(data=df_raw, hue="Diabetes_012", plot_kws=scatter_style)
plt.suptitle("Pairplots of all numeric features", y=1.02)
plt.show()

#### Pairplots Women

In [None]:
%%time
# Same pairwise plots restricted to rows with Sex == 0 (female).
women = df_raw.loc[df_raw["Sex"] == 0]
sns.pairplot(data=women, hue="Diabetes_012", plot_kws={"alpha": 0.2})
plt.suptitle("Pairplots of numeric features for women", y=1.02)
plt.show()

#### Pairplots Men

In [None]:
%%time
# Same pairwise plots restricted to rows with Sex == 1 (male).
men = df_raw.loc[df_raw["Sex"] == 1]
sns.pairplot(data=men, hue="Diabetes_012", plot_kws={"alpha": 0.2})
plt.suptitle("Pairplots of numeric features for men", y=1.02)
plt.show()

## Correlations

#### Observations
* There are notable correlations between 
  * `GenHlth`, `PhysHlth` and `DiffWalk`. These seem reasonable.
  * `Education` and `Income`. Also reasonable.
* There are lower, but still notable correlations between
  * `Diabetes_012`, `GenHlth`
  * `GenHlth`, `Diabetes_012`, `HighBP`, `Income`
  * `MentHlth`, `GenHlth`
  * `MentHlth`, `PhysHlth`
  * `Income`, `GenHlth`, `DiffWalk`, last one somewhat surprising

#### Conclusion
Relevant correlations are present. Appropriate feature selection must be conducted.

In [None]:
# Most columns of df_raw are `category` dtype at this point. How DataFrame.corr()
# treats non-numeric columns depends on the installed pandas version, so cast the
# integer codes to float explicitly to always correlate all columns.
# NOTE(review): this treats the class codes as numbers, which is questionable for
# the nominal three-class target `Diabetes_012` -- interpret those cells with care.
corr = df_raw.astype(float).corr()

# Only annotate cells whose absolute correlation reaches this threshold;
# weaker cells stay blank to keep the heatmap readable.
annot_threshold = 0.3
mask = np.abs(corr) < annot_threshold
annot = corr.round(2).astype(str)
annot[mask] = ""

plt.figure(figsize=(16, 13))
sns.heatmap(
    corr,
    annot=annot,
    fmt="",  # annotations are already formatted strings
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

plt.title("Correlations between all columns", y=1.02)
plt.show()