In [None]:
# Mount Google Drive at /content/drive so the raw heartAttack_dirty CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Location of the raw (uncleaned) dataset inside the mounted Drive.
# Adjust this if your copy lives elsewhere, e.g. a file kept in
# "My Drive/datasets/heartAttack_dirty.csv" needs that folder added to the path.
RAW_FILE = "/content/drive/MyDrive/heartAttack_dirty(2).csv"
RAW_FILE  # bare last expression: the cell output echoes the path in use


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/heartAttack_dirty(2).csv'

In [None]:
#read & quick peek

import pandas as pd

# Load the raw CSV into `df`; the cells below clean this frame step by step.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row
# index that was saved into the CSV — confirm whether it should be kept.
df = pd.read_csv(RAW_FILE)
print(df.shape)
df.head()


(120259, 28)


Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,60.0,Male,Rural,Middle,0.0,1.0,,0.0,83.0,0.0,...,62.0,173.0,48.0,121.0,101.0,Normal,0.0,0.0,0.0,0.0
1,53.0,Female,urban,Low,0.0,0.0,208.0,0.0,106.0,1.0,...,76.0,70.0,58.0,83.0,138.0,Normal,1.0,0.0,1.0,0.0
2,62.0,Female,Urban,Low,0.0,0.0,231.0,1.0,112.0,1.0,...,74.0,118.0,69.0,130.0,171.0,Abnormal,0.0,1.0,0.0,1.0
3,73.0,Male,Urban,Low,1.0,0.0,202.0,0.0,82.0,1.0,...,65.0,98.0,52.0,85.0,146.0,Normal,0.0,1.0,1.0,0.0
4,52.0,Male,Urban,Middle,0.0,0.0,231.0,1.0,81.0,1.0,...,71.0,129.0,34.0,148.0,191.0,Normal,0.0,1.0,1.0,0.0


In [None]:
#Helper functions (text cleaning, range checks, casing)

import numpy as np
import pandas as pd

def strip_normalize_text(s):
    """Return ``s`` with outer whitespace removed and inner runs collapsed.

    Values that ``pd.isna`` reports as missing are handed back untouched.
    Anything else is passed through ``str`` first, so a non-string input
    comes back as a string.
    """
    if not pd.isna(s):
        # split() without a separator breaks on any whitespace run and
        # discards empty pieces at both ends
        tokens = str(s).split()
        return " ".join(tokens)
    return s

def clip_or_nan(series, lo=None, hi=None):
    """Replace values of ``series`` that lie outside ``[lo, hi]`` with NaN.

    Despite the name, nothing is clipped to the bounds: offending values are
    blanked out. Both bounds are inclusive (a value equal to ``lo`` or ``hi``
    is kept), and existing NaNs stay NaN because they compare False against
    either bound.

    Parameters
    ----------
    series : pd.Series
        Values to check.
    lo, hi : scalar or None
        Lower / upper bound. ``None`` disables that side of the check.

    Returns
    -------
    pd.Series
        A new Series with the same index; the input is not modified.
    """
    invalid = pd.Series(False, index=series.index)
    if lo is not None:
        invalid |= series < lo
    if hi is not None:
        invalid |= series > hi
    # mask() builds a new Series and upcasts integer data to float when it has
    # to insert NaN. Writing NaN into an integer Series by item assignment is
    # deprecated upcasting in recent pandas versions.
    return series.mask(invalid)

def standardize_case(df, col, mode="title"):
    """Normalise the letter case of the string values in ``df[col]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to update. The column is overwritten on this same object.
    col : str
        Column to normalise. If it is not in ``df`` the frame is returned
        unchanged.
    mode : {"title", "lower", "upper"}
        Case conversion to apply to each string value.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported conversions. An unrecognised
        mode used to be ignored silently, which hid typos.
    """
    converters = {"title": str.title, "lower": str.lower, "upper": str.upper}
    if mode not in converters:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of {sorted(converters)}"
        )
    if col not in df.columns:
        return df
    convert = converters[mode]
    # Only real strings are converted; NaN/None and other non-string values pass through
    df[col] = df[col].apply(lambda x: convert(x) if isinstance(x, str) else x)
    return df


In [None]:
#Config: domain rules (valid ranges)

# Plausible (low, high) bounds per numeric column. Both ends count as valid;
# readings strictly outside a column's bounds are replaced with NaN later on.
RANGE_RULES = dict(
    age=(0, 100),
    sleep_hours=(0, 24),
    blood_pressure_systolic=(50, 300),
    blood_pressure_diastolic=(20, 200),
    cholesterol_level=(50, 500),
    cholesterol_hdl=(10, 150),
    cholesterol_ldl=(0, 400),
    triglycerides=(0, 1500),
    waist_circumference=(30, 300),
    fasting_blood_sugar=(40, 600),
)


In [None]:
#Remove duplicates + normalize all text columns

# Drop rows that exactly repeat an earlier row, renumber the index, report the count.
# NOTE(review): the frame still carries the "Unnamed: 0" column from the CSV, so two
# rows only count as duplicates when that value matches as well — confirm this is intended.
before_rows = df.shape[0]
df = df.drop_duplicates().reset_index(drop=True)
removed_dups = before_rows - df.shape[0]
print("Duplicates removed:", removed_dups)

# Trim and collapse whitespace in every object-dtype column
obj_cols = list(df.select_dtypes(include=["object"]).columns)
for c in obj_cols:
    df[c] = df[c].apply(strip_normalize_text)

# Title-case the key categoricals. standardize_case hands the frame back
# untouched when a column is absent, so no membership test is needed here.
for col in ("gender", "region", "smoking_status", "dietary_habits", "EKG_results"):
    df = standardize_case(df, col, "title")


Duplicates removed: 3


In [None]:
#Alcohol rule: keep literal ‘None’ as a real category (not missing)

if "alcohol_consumption" in df.columns:
    # Every spelling of "does not drink" collapses onto the label "None"; the
    # remaining entries map to themselves and simply list the expected levels.
    canon_map = {
        "None": "None",
        "No Alcohol": "None",
        "Nil": "None",
        "Non-Drinker": "None",
        "Occasional": "Occasional",
        "Moderate": "Moderate",
        "Heavy": "Heavy"
    }

    df["alcohol_consumption"] = (
        df["alcohol_consumption"]
        # tidy whitespace, then Title Case real strings only
        .apply(strip_normalize_text)
        .apply(lambda x: x.title() if isinstance(x, str) else x)
        .replace(canon_map)
        # Every missing entry becomes the string "None", so this column reaches
        # the imputation step with nothing left to fill.
        # NOTE(review): this also relabels genuinely missing answers as "None".
        # Presumably it is needed because read_csv parses the text None as
        # missing by default — confirm against the raw file.
        .fillna("None")
    )



In [None]:
#Convert negatives to NaN + apply range rules

# Treat every negative reading in a numeric column as invalid -> NaN.
# num_cols is reused by the imputation and type-tidying cells further down.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
neg_counts = {}
for c in num_cols:
    neg_mask = df[c] < 0
    if neg_mask.any():
        neg_counts[c] = int(neg_mask.sum())
        # mask() upcasts an integer column to float when it inserts NaN; writing
        # NaN into an integer column through .loc is deprecated upcasting in
        # recent pandas versions.
        df[c] = df[c].mask(neg_mask)
print("Negative values set to NaN:", neg_counts)

# Apply the per-column domain bounds: values outside (lo, hi) -> NaN.
# The count per column is the number of NaNs this step newly introduced.
range_out_counts = {}
for c, (lo, hi) in RANGE_RULES.items():
    if c not in df.columns:
        continue
    cleaned = clip_or_nan(df[c], lo, hi)
    range_out_counts[c] = int(cleaned.isna().sum() - df[c].isna().sum())
    df[c] = cleaned
print("Out-of-range set to NaN:", range_out_counts)


Negative values set to NaN: {'cholesterol_level': 371, 'cholesterol_ldl': 13}
Out-of-range set to NaN: {'age': 604, 'sleep_hours': 373, 'blood_pressure_systolic': 1, 'blood_pressure_diastolic': 1, 'cholesterol_level': 1, 'cholesterol_hdl': 2, 'cholesterol_ldl': 0, 'triglycerides': 0, 'waist_circumference': 6, 'fasting_blood_sugar': 1}


In [None]:
#Impute missing values (categorical → ‘Unknown’ / numeric → median)

# Categorical imputation: the listed columns get an explicit "Unknown" label,
# every other text column gets its most frequent observed value.
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
cat_imputed = {}
for c in cat_cols:
    miss = int(df[c].isna().sum())
    if miss > 0:
        if c in {"smoking_status", "alcohol_consumption", "region", "dietary_habits"}:
            fill_value = "Unknown"
        else:
            # mode() is empty when the column has no observed value at all;
            # fall back to "Unknown" instead of failing on the first-element lookup
            modes = df[c].mode(dropna=True)
            fill_value = modes.iloc[0] if len(modes) else "Unknown"
        df[c] = df[c].fillna(fill_value)
        cat_imputed[c] = miss

# Numeric imputation (median). num_cols is the list built in the negatives/range cell.
# NOTE(review): per the printed summary that list includes the heart_attack target,
# so a missing label gets imputed too — confirm that is wanted rather than dropping the row.
num_imputed = {}
for c in num_cols:
    miss = int(df[c].isna().sum())
    if miss > 0:
        fill_value = df[c].median()
        observed = set(df[c].dropna().unique())
        # A perfectly balanced 0/1 column has median 0.5, which is not a valid
        # flag value; use the most frequent value there instead (ties -> 0).
        if observed and observed <= {0, 1} and fill_value not in (0, 1):
            fill_value = df[c].mode(dropna=True).iloc[0]
        df[c] = df[c].fillna(fill_value)
        num_imputed[c] = miss

print("Categorical imputed:", cat_imputed)
print("Numeric imputed (median):", num_imputed)


Categorical imputed: {'gender': 1, 'region': 1, 'income_level': 1, 'smoking_status': 9619, 'physical_activity': 1, 'dietary_habits': 1, 'air_pollution_exposure': 1, 'stress_level': 1, 'EKG_results': 1}
Numeric imputed (median): {'age': 605, 'hypertension': 1, 'diabetes': 1, 'cholesterol_level': 10067, 'obesity': 1, 'waist_circumference': 7, 'family_history': 1, 'sleep_hours': 9924, 'blood_pressure_systolic': 2, 'blood_pressure_diastolic': 2, 'fasting_blood_sugar': 2, 'cholesterol_hdl': 3, 'cholesterol_ldl': 14, 'triglycerides': 1, 'previous_heart_disease': 1, 'medication_usage': 1, 'participated_in_free_screening': 1, 'heart_attack': 1}


In [None]:
#Final type tidying (binary flags to int, set categories)

# Numeric columns whose observed values are only 0 and/or 1 are flags: store them as ints
for c in num_cols:
    uniq = set(df[c].dropna().unique())
    # a subset of {0, 1} has at most two members, so no separate size check is
    # needed; 0.0 and 1.0 compare and hash equal to 0 and 1
    if uniq <= {0, 1}:
        df[c] = df[c].astype(int)

# Turn the key text columns into pandas categoricals, skipping any that are absent
for c in [
    name
    for name in ("gender", "region", "smoking_status", "dietary_habits", "EKG_results", "alcohol_consumption")
    if name in df.columns
]:
    df[c] = df[c].astype("category")

print("Dtypes set. Done.")


Dtypes set. Done.


In [None]:
#Save, download, and quick sanity check

from google.colab import files

# Write the cleaned frame to a CSV in the current working directory.
# index=False keeps the pandas row index out of the file.
CLEAN_FILE = "heartAttack_clean.csv"
df.to_csv(CLEAN_FILE, index=False)
print("Saved:", CLEAN_FILE, "with shape:", df.shape)

# Sanity check: after imputation every column is expected to have 0 missing values.
# Only the 10 columns with the highest missing counts are printed; the file has
# already been written by this point, so this check is informational only.
missing_summary = df.isna().sum().sort_values(ascending=False).head(10)
print("Top missing counts (should be 0):")
print(missing_summary)

# Hand the saved file to the Colab download helper
files.download(CLEAN_FILE)


Saved: heartAttack_clean.csv with shape: (120256, 28)
Top missing counts (should be 0):
age                    0
gender                 0
region                 0
income_level           0
hypertension           0
diabetes               0
cholesterol_level      0
obesity                0
waist_circumference    0
family_history         0
dtype: int64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Note:** Replace `YOUR_GITHUB_REPO_URL` with the URL of your GitHub repository and `YOUR_COMMIT_MESSAGE` with a brief description of your changes.