# Step 2 — Exploratory Data Analysis (EDA): Univariate Patterns

This step examines the Medication sheet one variable at a time to:

- Identify numeric fields and plot their distributions.
- Review categorical fields important for medication-error analysis.
- Count occurrences of `Pattern Specifics`, focusing on the most frequent categories.

This mirrors the **Univariate EDA** step from the loan assignment, adapted here to **clinical medication-error patterns**.

In [None]:
# STEP 2 – Exploratory Data Analysis (EDA) – Univariate Patterns

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# ---------------------------------------------------------
# Load data (assumes file already placed in repository or Colab)
# ---------------------------------------------------------
# Prefer the 'Medication' sheet of the Excel workbook; if the workbook file is
# not present, fall back to the CSV file named after that sheet.
workbook_path = 'Krista 240726 Final.xlsx'
csv_fallback_path = 'Krista 240726 Final.xlsx - Medication.csv'

try:
    med = pd.read_excel(workbook_path, sheet_name='Medication')
except FileNotFoundError:
    # Only a missing workbook triggers the fallback; other read errors propagate.
    med = pd.read_csv(csv_fallback_path)

# ---------------------------------------------------------
# 2.1 Identify numeric columns and plot histograms
# ---------------------------------------------------------
# Select every numeric dtype (int32, float32, ...), not just int64/float64,
# so that narrower numeric columns are not silently left out of the EDA.
num_cols = med.select_dtypes(include="number").columns.tolist()
print("Numeric columns detected:", num_cols)

if num_cols:
    # Lay the histograms out on a grid with three plots per row.
    n_cols = 3
    n_rows = (len(num_cols) + n_cols - 1) // n_cols  # ceiling division

    # Scale the figure height with the number of rows so each subplot keeps a
    # readable size no matter how many numeric columns there are.
    plt.figure(figsize=(15, 4 * n_rows))

    # Subplot positions are 1-based, hence enumerate(..., 1).
    for i, col in enumerate(num_cols, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(med[col], bins=30, kde=True)
        plt.title(f"Distribution of {col}")
        plt.xlabel(col)
        plt.ylabel("Count")

    plt.tight_layout()
    plt.show()
else:
    print("No numeric columns found to plot.")

# ---------------------------------------------------------
# 2.2 Categorical fields: Certificate, Branch, Medication, Pattern, Pattern Specifics
# ---------------------------------------------------------
# Columns of interest for the medication-error analysis; any that are absent
# from the loaded sheet are ignored rather than raising a KeyError.
candidate_cat_cols = [
    "Certificate",
    "Branch",
    "Medication 1",
    "Pattern",
    "Pattern Specifics",
]

cat_cols = [c for c in candidate_cat_cols if c in med.columns]
print("Categorical columns used for EDA:", cat_cols)

for col in cat_cols:
    # For high-cardinality columns, show top 15 categories.
    # value_counts() excludes missing values and sorts by descending frequency.
    value_counts = med[col].value_counts().head(15)

    # A column holding only missing values has nothing to plot; skip it
    # instead of handing seaborn an empty dataset.
    if value_counts.empty:
        print(f"Skipping {col}: no non-missing values to plot.")
        continue

    # Cast labels to strings so numeric codes are treated as plain category
    # names and the bars keep their frequency order (seaborn sorts numeric
    # category values by value, not by count).
    labels = value_counts.index.astype(str)

    plt.figure(figsize=(8, 4))
    sns.barplot(x=value_counts.values, y=labels, orient='h')
    plt.title(f"Top {len(value_counts)} categories for {col}")
    plt.xlabel("Count")
    plt.ylabel(col)
    plt.tight_layout()
    plt.show()
