In [19]:
# %%
import os, sys, warnings, logging, random, joblib
import numpy as np, pandas as pd
from pathlib import Path

# Suppress every warning category so cell output only shows the log lines.
warnings.simplefilter("ignore")

# Reproducibility: one shared seed applied to NumPy's global RNG and to the
# standard-library RNG.
SEED = 42
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(SEED)

In [20]:

# Paths
# If the kernel's working directory is named "notebook" (case-insensitive),
# the project root is its parent; otherwise the working directory is the root.
_cwd = Path.cwd()
ROOT = _cwd if _cwd.name.lower() != "notebook" else _cwd.parent
_data_dir = ROOT / "data"
RAW = _data_dir / "raw"
PROC = _data_dir / "processed"
# Create the output directory up front; a no-op when it already exists.
PROC.mkdir(parents=True, exist_ok=True)

# Basic logging: INFO and above, timestamped, written to stdout.
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
    handlers=[_stdout_handler],
)


In [21]:
# Locate both raw CSV files and fail fast if either one is absent.
train_path, test_path = RAW / "Training.csv", RAW / "Testing.csv"

assert train_path.exists(), f"Training file not found: {train_path}"
assert test_path.exists(),  f"Testing  file not found: {test_path}"

# Column mask that keeps everything except columns whose name starts with
# "Unnamed" (these may appear if the CSV was saved with indices).
_keep_named = lambda frame: ~frame.columns.str.contains('^Unnamed')

train_df = pd.read_csv(train_path).loc[:, _keep_named]
test_df = pd.read_csv(test_path).loc[:, _keep_named]

for _shape_message in (
    f"Train shape : {train_df.shape}",
    f"Test  shape : {test_df.shape}",
):
    logging.info(_shape_message)


2025-08-27 21:47:35,732 | INFO | Train shape : (4920, 133)
2025-08-27 21:47:35,737 | INFO | Test  shape : (42, 133)


In [22]:
EXPECTED_COLS = 133  # 132 symptoms + 1 prognosis

logging.info(f"Actual train_df columns count: {train_df.shape[1]}")

assert train_df.shape[1] == EXPECTED_COLS, f"Unexpected columns count: {train_df.shape[1]}"

# The test set must carry every training column: later cells index test_df
# with the training column names and would otherwise fail with a KeyError.
missing_in_test = train_df.columns.difference(test_df.columns)
assert missing_in_test.empty, f"Test set is missing columns: {list(missing_in_test)}"

# Per-column overview: non-null count, number of distinct values (where
# describe() reports one) and the percentage of missing entries.
summary = (
    train_df.describe(include="all")
            .T[["count", "unique"]]
            .assign(
                pct_missing=lambda d: 100 - (d["count"] / len(train_df) * 100)
            )
)

display(summary.head())
logging.info(f"Any NA in train_df? {train_df.isna().values.any()}")

# A missing target cannot be imputed. Filling the whole frame with 0 would
# write the integer 0 into the prognosis column alongside the label values,
# so stop here with a clear message instead.
assert train_df["prognosis"].notna().all(), "Missing prognosis labels in train_df"
assert test_df["prognosis"].notna().all(), "Missing prognosis labels in test_df"

# Fill missing symptom values with 0 (assume missing means symptom absent).
# Only the non-target columns are touched, and assignment is used instead of
# inplace=True so the operation is explicit about what it modifies.
train_feature_cols = train_df.columns.difference(["prognosis"])
if train_df[train_feature_cols].isna().values.any():
    train_df[train_feature_cols] = train_df[train_feature_cols].fillna(0)
    logging.info("Filled missing values with 0 in train_df.")

test_feature_cols = test_df.columns.difference(["prognosis"])
if test_df[test_feature_cols].isna().values.any():
    test_df[test_feature_cols] = test_df[test_feature_cols].fillna(0)
    logging.info("Filled missing values with 0 in test_df.")

2025-08-27 21:47:35,786 | INFO | Actual train_df columns count: 133


Unnamed: 0,count,unique,pct_missing
itching,4920.0,,0.0
skin_rash,4920.0,,0.0
nodal_skin_eruptions,4920.0,,0.0
continuous_sneezing,4920.0,,0.0
shivering,4920.0,,0.0


2025-08-27 21:47:36,883 | INFO | Any NA in train_df? False


In [23]:
from sklearn.preprocessing import LabelEncoder

# Guard against re-running this cell on an already-encoded frame: fitting a
# second time would learn the integer codes as "classes" and overwrite the
# saved encoder, losing the mapping back to the original label values.
if pd.api.types.is_numeric_dtype(train_df["prognosis"]):
    raise RuntimeError(
        "train_df['prognosis'] is already numeric; reload the raw data before "
        "re-running this cell so the saved LabelEncoder keeps the original classes."
    )

# Fit on the training labels only, then apply the same mapping to the test set.
le = LabelEncoder()
train_df["prognosis"] = le.fit_transform(train_df["prognosis"])
test_df["prognosis"]  = le.transform(test_df["prognosis"])

# Persist the fitted encoder next to the processed data.
joblib.dump(le, PROC / "label_encoder.pkl")

logging.info(f"Encoded target classes: {len(le.classes_)}")


2025-08-27 21:47:36,963 | INFO | Encoded target classes: 41


In [24]:
# Columns added by this notebook's feature engineering. They are excluded from
# symptom_cols so that re-running this cell does not count them as symptoms.
ENGINEERED_COLS = ["symptom_count", "rare_symptom_flag", "fever_cough"]
# Text encodings of a symptom value and their 0/1 equivalents.
SYMPTOM_VALUE_MAP = {"present": 1, "absent": 0, "yes": 1, "no": 0}
# A symptom is "rare" when it is present in fewer than this fraction of training rows.
RARE_SYMPTOM_THRESHOLD = 0.01

# Convert symptom strings to 0/1 integer if necessary
symptom_cols = train_df.columns.difference(["prognosis", *ENGINEERED_COLS])

for col in symptom_cols:
    # Convert when either frame holds the column as non-numeric, so a text
    # column that only appears in the test set is not left unconverted.
    both_numeric = (
        pd.api.types.is_numeric_dtype(train_df[col])
        and pd.api.types.is_numeric_dtype(test_df[col])
    )
    if not both_numeric:
        train_df[col] = train_df[col].replace(SYMPTOM_VALUE_MAP).astype(int)
        test_df[col] = test_df[col].replace(SYMPTOM_VALUE_MAP).astype(int)

# Symptom count feature: number of symptoms present in each row.
train_df["symptom_count"] = train_df[symptom_cols].sum(axis=1)
test_df["symptom_count"] = test_df[symptom_cols].sum(axis=1)

# Rare symptom flag (symptoms with <1% frequency). Frequencies come from the
# training frame only and the same symptom list is applied to the test frame.
symptom_freq = train_df[symptom_cols].mean()
rare_symptoms = symptom_freq[symptom_freq < RARE_SYMPTOM_THRESHOLD].index.tolist()

train_df["rare_symptom_flag"] = train_df[rare_symptoms].any(axis=1).astype(int)
test_df["rare_symptom_flag"] = test_df[rare_symptoms].any(axis=1).astype(int)

In [25]:
# Interaction term example: flag rows where fever and cough co-occur.
# Skipped entirely unless both columns are among the symptom columns.
if all(_name in symptom_cols for _name in ("fever", "cough")):
    for _frame in (train_df, test_df):
        _frame["fever_cough"] = _frame["fever"] & _frame["cough"]

logging.info(f"Feature engineering done. Train shape now: {train_df.shape}")

# %% [markdown]
# # 6. Train/Validation Split

# %%
from sklearn.model_selection import train_test_split

# Separate the encoded target from the feature matrix.
y = train_df["prognosis"]
X = train_df.drop(columns=["prognosis"])

# Hold out 20% of the rows for validation, stratified on the target and
# seeded so the split is repeatable.
_split_options = dict(test_size=0.2, stratify=y, random_state=SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **_split_options)

logging.info(f"Train size: {X_train.shape}, Valid size: {X_valid.shape}")


2025-08-27 21:47:37,143 | INFO | Feature engineering done. Train shape now: (4920, 135)
2025-08-27 21:47:37,198 | INFO | Train size: (3936, 134), Valid size: (984, 134)


In [26]:
# Write each split, plus the processed test frame, to the processed-data
# directory as CSV without the index column.
_outputs = {
    "X_train.csv": X_train,
    "X_valid.csv": X_valid,
    "y_train.csv": y_train,
    "y_valid.csv": y_valid,
    "test_processed.csv": test_df,
}
for _filename, _table in _outputs.items():
    _table.to_csv(PROC / _filename, index=False)

logging.info("Processed data saved to data/processed/ directory")

2025-08-27 21:47:37,639 | INFO | Processed data saved to data/processed/ directory
