# DS 2023 Final Project – Epilepsy Infographic: EDA Setup

This notebook prepares is based upon the https://www.kaggle.com/code/ukveteran/compedjma-epilepsy/input dataset for exploratory data analysis (EDA) and visualization. This data is composed of a trial on an experimental drug called Progabide.



In [1]:
# Imports
import os
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 3)

## 1. Load `epilepsy_enriched.csv`

In [2]:
file_path = "epilepsy.csv"

if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Could not find {file_path} in the current directory. "
        "Make sure the enriched CSV is in the same folder as this notebook."
    )

df_raw = pd.read_csv(file_path)

print("Loaded epilepsy_enriched.csv")
print("Shape:", df_raw.shape)
df_raw.head()

Loaded epilepsy_enriched.csv
Shape: (236, 7)


Unnamed: 0.1,Unnamed: 0,treatment,base,age,seizure.rate,period,subject
0,1,placebo,11,31,5,1,1
1,110,placebo,11,31,3,2,1
2,112,placebo,11,31,3,3,1
3,114,placebo,11,31,3,4,1
4,2,placebo,11,30,3,1,2


## 2. Standardize column names

We:

- Strip whitespace  
- Make names lowercase  
- Replace spaces with underscores  
- Keep a record of original names

In [3]:
original_columns = df_raw.columns.tolist()
print("Original columns:", original_columns)

df = df_raw.copy()
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_")
)

print("\nStandardized columns:", df.columns.tolist())
df.head()

Original columns: ['Unnamed: 0', 'treatment', 'base', 'age', 'seizure.rate', 'period', 'subject']

Standardized columns: ['unnamed:_0', 'treatment', 'base', 'age', 'seizure.rate', 'period', 'subject']


Unnamed: 0,unnamed:_0,treatment,base,age,seizure.rate,period,subject
0,1,placebo,11,31,5,1,1
1,110,placebo,11,31,3,2,1
2,112,placebo,11,31,3,3,1
3,114,placebo,11,31,3,4,1
4,2,placebo,11,30,3,1,2


## 3. Ensure core variables and data types

We expect (or hope) to see:

- `subject` (patient ID) – integer  
- `period` (follow-up period index) – integer  
- `treatment` (e.g., drug vs placebo) – category  
- `base` – numeric baseline seizure count  
- `seizure.rate` or `seizure_rate` – numeric seizure count per period  
- `change` – numeric difference from baseline  
- `pct_change` – numeric relative change from baseline  
- `responder_50` – optional boolean flag (≥50% reduction)


In [4]:
# Standardize seizure-rate column name
if "seizure.rate" in df.columns and "seizure_rate" not in df.columns:
    df = df.rename(columns={"seizure.rate": "seizure_rate"})

# Subject & period as ints where present
if "subject" in df.columns:
    df["subject"] = df["subject"].astype(int)

if "period" in df.columns:
    df["period"] = df["period"].astype(int)

# Treatment as category where present
if "treatment" in df.columns:
    df["treatment"] = df["treatment"].astype("category")

# Convert key numeric columns safely
num_cols = ["base", "seizure_rate", "change", "pct_change"]
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

df.dtypes

unnamed:_0         int64
treatment       category
base               int64
age                int64
seizure_rate       int64
period             int64
subject            int64
dtype: object

## 4. Structural checks: missingness & basic summary

In [5]:
print("=== Missing values per column ===")
print(df.isna().sum())

print("\n=== Numeric columns summary ===")
display(df.describe(include="number"))

print("\n=== Categorical columns summary ===")
display(df.describe(include="category"))

=== Missing values per column ===
unnamed:_0      0
treatment       0
base            0
age             0
seizure_rate    0
period          0
subject         0
dtype: int64

=== Numeric columns summary ===


Unnamed: 0,unnamed:_0,base,age,seizure_rate,period,subject
count,236.0,236.0,236.0,236.0,236.0,236.0
mean,251.86,31.22,28.339,8.263,2.5,30.0
std,185.969,26.705,6.261,12.356,1.12,17.066
min,1.0,6.0,18.0,0.0,1.0,1.0
25%,60.5,12.0,23.0,2.75,1.75,15.0
50%,232.5,22.0,28.0,4.0,2.5,30.0
75%,413.25,41.0,32.0,9.0,3.25,45.0
max,593.0,151.0,42.0,102.0,4.0,59.0



=== Categorical columns summary ===


Unnamed: 0,treatment
count,236
unique,2
top,Progabide
freq,124


## 5. Trial structure: subjects, periods, treatment

We verify:

- Number of unique subjects  
- How many periods each subject has  
- Subjects per treatment group  
- Whether `(subject, period)` is unique

In [6]:
# Unique subjects
if "subject" in df.columns:
    print("Unique subjects:", df["subject"].nunique())

# Treatment groups
if {"subject", "treatment"}.issubset(df.columns):
    print("\nSubjects per treatment group:")
    print(df.groupby("treatment")["subject"].nunique())

# Periods per subject
if {"subject", "period"}.issubset(df.columns):
    periods_per_subject = df.groupby("subject")["period"].nunique()
    print("\nPeriods per subject (summary):")
    print(periods_per_subject.describe())

# Uniqueness of (subject, period)
if {"subject", "period"}.issubset(df.columns):
    dup_pairs = df.duplicated(subset=["subject", "period"]).sum()
    if dup_pairs == 0:
        print("\nAll (subject, period) combinations are unique.")
    else:
        print(f"\nWarning: {dup_pairs} duplicated (subject, period) rows found.")

Unique subjects: 59

Subjects per treatment group:
treatment
Progabide    31
placebo      28
Name: subject, dtype: int64

Periods per subject (summary):
count    59.0
mean      4.0
std       0.0
min       4.0
25%       4.0
50%       4.0
75%       4.0
max       4.0
Name: period, dtype: float64

All (subject, period) combinations are unique.


  print(df.groupby("treatment")["subject"].nunique())


## 6. Consistency checks for enriched variables

We check that the enriched columns behave as expected:

1. `base + change ≈ seizure_rate`  
2. `pct_change ≈ (seizure_rate − base) / base` (for rows with `base != 0`)  
3. If `responder_50` exists, it should match `pct_change <= -0.5`

In [7]:
tolerance = 1e-6

# 1. base + change vs seizure_rate
if {"base", "change", "seizure_rate"}.issubset(df.columns):
    recon = df["base"] + df["change"]
    diff = (recon - df["seizure_rate"]).abs()
    max_diff = diff.max()
    print(f"Max |base + change - seizure_rate| = {max_diff:.6g}")
    if max_diff < tolerance:
        print("✓ base + change is consistent with seizure_rate within tolerance.")
    else:
        print("⚠ base + change differs from seizure_rate beyond tolerance.")

# 2. pct_change vs recomputed
if {"base", "seizure_rate", "pct_change"}.issubset(df.columns):
    mask = df["base"] != 0
    recomputed_pct = (df.loc[mask, "seizure_rate"] - df.loc[mask, "base"]) / df.loc[mask, "base"]
    diff_pct = (recomputed_pct - df.loc[mask, "pct_change"]).abs()
    max_diff_pct = diff_pct.max()
    print(f"\nMax |recomputed_pct - pct_change| (nonzero base) = {max_diff_pct:.6g}")
    if max_diff_pct < tolerance:
        print("✓ pct_change is consistent with base and seizure_rate within tolerance.")
    else:
        print("⚠ pct_change differs from recomputed values beyond tolerance.")

# 3. responder_50 vs pct_change <= -0.5
if {"pct_change", "responder_50"}.issubset(df.columns):
    expected_flag = df["pct_change"] <= -0.5
    mismatches = (expected_flag != df["responder_50"]).sum()
    print(f"\nResponder flag mismatches (pct_change <= -0.5 vs responder_50): {mismatches}")
    if mismatches == 0:
        print("✓ responder_50 matches pct_change <= -0.5 for all rows.")
    else:
        print("⚠ Some rows have responder_50 inconsistent with pct_change threshold.")

## 7. Save `epilepsy_enriched_clean.csv`

We now save a cleaned, standardized version of the enriched dataset.  


In [8]:
clean_path = "epilepsy_enriched_clean.csv"
df.to_csv(clean_path, index=False)
print(f"Final cleaned enriched dataset saved to {clean_path}")

Final cleaned enriched dataset saved to epilepsy_enriched_clean.csv


# Now our code begins in the EDA Folder