# EDA of diabetes_012_health_indicators_BRFSS2015

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw BRFSS 2015 diabetes health-indicators CSV; the path is relative,
# so it resolves against the notebook's current working directory.
df_raw = pd.read_csv("../data/raw/diabetes_012_health_indicators_BRFSS2015.csv")

## First glance at raw data

In [None]:
# Show the (n_rows, n_columns) tuple of the raw frame.
display(df_raw.shape)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): that would render an extra "None" below the report.
df_raw.info()

* There are no missing values.
* All columns contain numbers, although some should be interpreted as categories.

In [None]:
# Preview the first rows of the frame, `step` columns at a time.
step = 8

for i in range(0, df_raw.shape[1], step):
    # Positional slice of the next group of columns; the last group may be shorter.
    display(df_raw.iloc[:, i : i + step].head())

In [None]:
# Summary statistics of every numeric column.
display(df_raw.select_dtypes(include="number").describe())

* `BMI` has a high maximum. Possibly an outlier.
* `MentHlth` and `PhysHlth` are positively (right-)skewed. Both have more than 50% zeroes.

## Converted to intended datatype

### Data Dictionary – Diabetes Health Indicators

| Column                | Description                                                                 |
|-----------------------|-----------------------------------------------------------------------------|
| `Diabetes_012`        | 0 = no diabetes, 1 = prediabetes, 2 = diabetes                              |
| `HighBP`              | 0 = no high blood pressure, 1 = high blood pressure                         |
| `HighChol`            | 0 = no high cholesterol, 1 = high cholesterol                               |
| `CholCheck`           | 0 = no cholesterol check in 5 years, 1 = yes                                |
| `BMI`                 | Body Mass Index                                                             |
| `Smoker`              | Smoked ≥100 cigarettes in lifetime: 0 = no, 1 = yes                         |
| `Stroke`              | Ever told had a stroke: 0 = no, 1 = yes                                     |
| `HeartDiseaseorAttack`| CHD or MI history: 0 = no, 1 = yes                                          |
| `PhysActivity`        | Physical activity (last 30 days, non-job): 0 = no, 1 = yes                  |
| `Fruits`              | Consumes fruit ≥1×/day: 0 = no, 1 = yes                                     |
| `Veggies`             | Consumes vegetables ≥1×/day: 0 = no, 1 = yes                                |
| `HvyAlcoholConsump`   | Heavy drinker: 0 = no, 1 = yes                                              |
| `AnyHealthcare`       | Has health coverage: 0 = no, 1 = yes                                        |
| `NoDocbcCost`         | Missed doctor due to cost (last 12 months): 0 = no, 1 = yes                 |
| `GenHlth`             | General health: 1 = excellent, ..., 5 = poor                                |
| `MentHlth`            | Days of poor mental health (last 30 days), 0–30                             |
| `PhysHlth`            | Days of poor physical health (last 30 days), 0–30                           |
| `DiffWalk`            | Serious difficulty walking/stairs: 0 = no, 1 = yes                          |
| `Sex`                 | 0 = female, 1 = male                                                         |
| `Age`                 | Age category: 1 = 18–24, ..., 13 = 80+                                       |
| `Education`           | Education level: 1 = none to kindergarten, ..., 6 = college grad            |
| `Income`              | Income level: 1 = < $10k, ..., 8 = ≥ $75k                                   |


### Conversion

In [None]:
# Columns whose numbers are category codes (see the data dictionary).
# The columns not listed here -- BMI, MentHlth, PhysHlth -- are left as they are.
cat_cols = [
    "Diabetes_012", "HighBP", "HighChol", "CholCheck",
    "Smoker", "Stroke", "HeartDiseaseorAttack",
    "PhysActivity", "Fruits", "Veggies", "HvyAlcoholConsump",
    "AnyHealthcare", "NoDocbcCost", "GenHlth", "DiffWalk",
    "Sex", "Age", "Education", "Income",
]

# Cast to int first so that the resulting category labels are integers,
# then turn each column into a pandas categorical.
df_raw[cat_cols] = df_raw.loc[:, cat_cols].astype("int").astype("category")

### Info, Describe, Overview

In [None]:
# Print the column dtypes and non-null counts again, now that the conversion cell has run.
df_raw.info()

In [None]:
# Summary statistics of the columns that still have a numeric dtype.
display(df_raw.select_dtypes(include="number").describe())

In [None]:
def overview(df: pd.DataFrame) -> None:
    """
    Build and display a per-column overview of a DataFrame.

    The overview has one row per column of ``df`` and the following fields:

    * ``dtype``, ``total`` (non-null count), ``missing`` and ``missing%``
    * ``n_uniques``, ``uniques%`` (relative to the number of rows) and
      ``uniques``, the sorted list of distinct non-null values
    * ``non-numeric``: distinct non-null values that ``pd.to_numeric`` cannot
      parse
    * ``dev from mean``: numeric columns only; how many standard deviations
      the minimum and the maximum lie from the mean, as ``(below, above)``
    * ``most/least freq``: non-numeric columns only; the most and the least
      frequent value mapped to their counts
    * ``norm entropy``: categorical columns only; normalized Shannon entropy

    A field that does not apply to a column holds ``pd.NA``.

    Args:
        df: DataFrame to summarize.

    Returns:
        None. The table is shown with IPython's ``display``; ``print`` is
        used instead when IPython is not installed.
    """
    from pandas.api.types import is_numeric_dtype

    try:
        # Same function that Jupyter injects as the ``display`` builtin.
        from IPython.display import display
    except ImportError:
        display = print

    def normalized_entropy_cat(series: pd.Series) -> float:
        """
        Compute the normalized Shannon entropy of a categorical distribution.

        Returns 0 if at most one class is present and 1 for a perfectly
        uniform distribution. The normalization uses every class reported by
        ``value_counts`` (for a categorical dtype this includes declared but
        unobserved categories).
        """
        shares = series.value_counts(normalize=True)
        n_classes = len(shares)
        # Zero-share classes contribute nothing; dropping them avoids log2(0).
        observed = shares[shares > 0]
        if n_classes <= 1 or len(observed) <= 1:
            return 0.0
        entropy = -(observed * np.log2(observed)).sum()
        return float(entropy / np.log2(n_classes))

    def non_numeric_values(col: pd.Series) -> list:
        """List distinct non-null values that cannot be parsed as numbers."""
        coerced = pd.to_numeric(col, errors="coerce")
        # Missing values also coerce to NaN; they are already counted under
        # "missing" and must not be reported as non-numeric.
        return list(col[col.notna() & coerced.isna()].unique())

    def deviation_from_mean(col: pd.Series):
        """Return (mean - min, max - mean) in units of the standard deviation."""
        if not is_numeric_dtype(col):
            return pd.NA
        mean, std = col.mean(), col.std()
        return (
            round((mean - col.min()) / std, 1),
            round((col.max() - mean) / std, 1),
        )

    def most_and_least_frequent(col: pd.Series):
        """Map the most and the least frequent value to their counts."""
        if is_numeric_dtype(col):
            return pd.NA
        counts = col.value_counts()  # sorted by descending frequency
        if counts.empty:
            return pd.NA
        return {counts.index[pos]: int(counts.iloc[pos]) for pos in (0, -1)}

    def entropy_or_na(col: pd.Series):
        """Rounded normalized entropy for categorical columns, else pd.NA."""
        if isinstance(col.dtype, pd.CategoricalDtype):
            return round(normalized_entropy_cat(col), 2)
        return pd.NA

    na_mask = df.isna()
    n_unique = df.nunique()
    columns = [df[name] for name in df.columns]

    summary = pd.DataFrame(
        {
            "dtype": df.dtypes,
            "total": df.count(),
            "missing": na_mask.sum(),
            "missing%": na_mask.mean() * 100,
            "n_uniques": n_unique,
            "uniques%": n_unique / df.shape[0] * 100,
            # NaN is excluded (as in n_uniques) because it does not sort reliably.
            "uniques": [sorted(col.dropna().unique()) for col in columns],
            "non-numeric": [non_numeric_values(col) for col in columns],
            "dev from mean": [deviation_from_mean(col) for col in columns],
            "most/least freq": [most_and_least_frequent(col) for col in columns],
            "norm entropy": [entropy_or_na(col) for col in columns],
        }
    )
    display(summary)


# Display the per-column overview for df_raw.
overview(df_raw)

* `Diabetes_012`, `GenHlth`, `Age`, `Education`, `Income` are categories with more than two values. All other categories are binary (0/1).
* The target `Diabetes_012` is heavily imbalanced.
* `BMI` contains values further than 10 standard deviations from the mean. Check for outliers.
* `HighBP`, `HighChol`, `Smoker`, `Sex` are only slightly imbalanced; all other categories are highly imbalanced.