In [None]:
# tune_decision_tree.py
import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

In [None]:
# ---------------- CONFIG ----------------
# Working directory of the tuning step; every input and output path is built from it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
BASE_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook; only read when the processed CSV does not exist yet.
RAW_XLSX = os.path.join(BASE_PATH, "heart_disease.xlsx")
# Feature-engineered dataset that ensure_processed() loads or creates.
PROCESSED_CSV = os.path.join(BASE_PATH, "heart_processed.csv")

In [None]:
# Output artefacts of the tuning run, all written into BASE_PATH.
OUT_BEST_MODEL = os.path.join(BASE_PATH, "decision_tree_best.pkl")  # joblib dump of the best estimator
OUT_CV_RESULTS = os.path.join(BASE_PATH, "gridsearch_cv_results.csv")  # full GridSearchCV cv_results_ table
OUT_BEST_PARAMS = os.path.join(BASE_PATH, "best_params.txt")  # best CV score and parameters
OUT_REPORT = os.path.join(BASE_PATH, "decision_tree_tuned_report.txt")  # test-set metrics report
OUT_FI_PNG = os.path.join(BASE_PATH, "feature_importances.png")  # feature-importance bar chart
OUT_TREE_PNG = os.path.join(BASE_PATH, "decision_tree_best.png")  # rendering of the fitted tree
# Seed shared by the train/test split, the CV shuffling and the tree itself.
RANDOM_STATE = 42
# Fraction of rows held out as the test set.
TEST_SIZE = 0.2
# ----------------------------------------

In [None]:
def ensure_processed():
    """Return the feature-engineered heart-disease table, building it if missing.

    When ``PROCESSED_CSV`` exists it is read and returned as-is. Otherwise the
    ``"Heart_disease"`` sheet of ``RAW_XLSX`` is loaded, cleaned, one-hot
    encoded, written to ``PROCESSED_CSV`` and returned.

    Cleaning applied when building from the raw sheet:
      * missing ``oldpeak`` values are filled with the column median;
      * zeros in ``trestbps`` / ``chol`` are treated as missing and filled
        with the median of the remaining (non-zero) values;
      * ``sex`` is mapped to 1 (Male) / 0 (Female);
      * ``fbs`` / ``exang`` are coerced to 0/1 integers;
      * ``target`` is 1 where ``num > 0`` and 0 otherwise, and ``num`` is dropped;
      * ``cp``, ``restecg``, ``slope`` and ``thal`` (those present) are one-hot
        encoded with the first level dropped.

    Returns:
        pandas.DataFrame: the processed dataset.
    """
    if os.path.exists(PROCESSED_CSV):
        print("Found processed CSV:", PROCESSED_CSV)
        return pd.read_csv(PROCESSED_CSV)

    def _flag_to_int(flags):
        """Coerce a boolean-like column (bools, numbers or text) to 0/1 integers."""
        if flags.dtype == bool:
            return flags.astype(int)
        if pd.api.types.is_numeric_dtype(flags):
            # Already numeric: non-zero means True, missing means False.
            return flags.fillna(0).ne(0).astype(int)
        # Compare on upper-cased text so real True/False objects sitting in a
        # mixed object column are recognised alongside strings such as 'TURE'.
        # Anything unrecognised (including missing values) falls back to 0.
        as_text = flags.astype(str).str.strip().str.upper()
        return (
            as_text.map({'TRUE': 1, 'TURE': 1, '1': 1, 'FALSE': 0, '0': 0})
            .fillna(0)
            .astype(int)
        )

    print("Processed CSV not found. Generating from raw Excel...")
    df = pd.read_excel(RAW_XLSX, sheet_name="Heart_disease")
    df['oldpeak'] = df['oldpeak'].fillna(df['oldpeak'].median())
    for col in ['trestbps', 'chol']:
        # Drop the zeros first so they cannot pull the median down.
        without_zeros = df[col].replace(0, np.nan)
        df[col] = without_zeros.fillna(without_zeros.median())
    df['sex'] = df['sex'].map({'Male':1, 'Female':0})
    for col in ['fbs','exang']:
        df[col] = _flag_to_int(df[col])
    df['target'] = df['num'].apply(lambda x: 1 if x>0 else 0)
    cat_cols = [c for c in ['cp','restecg','slope','thal'] if c in df.columns]
    df_processed = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    if 'num' in df_processed.columns:
        df_processed = df_processed.drop(columns=['num'])
    df_processed.to_csv(PROCESSED_CSV, index=False)
    print("Saved processed CSV to:", PROCESSED_CSV)
    return df_processed

In [None]:
def main():
    """Tune a decision tree with grid search and save the model, metrics and plots.

    Steps:
      1. Load (or build) the processed table via ``ensure_processed()``.
      2. Make a stratified train/test split using ``TEST_SIZE`` / ``RANDOM_STATE``.
      3. Run a 5-fold stratified ``GridSearchCV`` over the tree hyper-parameters,
         scored by ROC-AUC.
      4. Save the CV results table, the best parameters and the best estimator.
      5. Score the best estimator on the held-out test rows and write a report.
      6. Save a feature-importance bar chart and a rendering of the tree.

    Raises:
        RuntimeError: if the processed data has no ``target`` column.
    """
    df = ensure_processed()
    if 'target' not in df.columns:
        raise RuntimeError("Processed data missing 'target' column.")
    X = df.drop(columns=['target'])
    y = df['target']
    # Plain list of names: some scikit-learn releases reject a pandas Index
    # for plot_tree(feature_names=...).
    feature_names = list(X.columns)

    # train/test split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )
    print("Train/test shapes:", X_train.shape, X_test.shape)

    # parameter grid (sane but not huge)
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 3, 5, 7, 9, 12],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'max_features': [None, 'sqrt', 'log2']
    }

    # Stratified CV
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    base_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

    grid = GridSearchCV(
        estimator=base_clf,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )

    print("Starting GridSearchCV... (this may take a few minutes)")
    grid.fit(X_train, y_train)
    print("GridSearchCV done.")

    # Save CV results
    cv_results = pd.DataFrame(grid.cv_results_)
    cv_results.to_csv(OUT_CV_RESULTS, index=False)
    print("Saved CV results to:", OUT_CV_RESULTS)

    # Best estimator & params
    best = grid.best_estimator_
    best_params = grid.best_params_
    best_score = grid.best_score_
    with open(OUT_BEST_PARAMS, "w", encoding="utf-8") as f:
        f.write(f"Best ROC-AUC (CV): {best_score:.5f}\n")
        f.write("Best params:\n")
        for k,v in best_params.items():
            f.write(f"{k}: {v}\n")
    print("Saved best params to:", OUT_BEST_PARAMS)

    # Save best model
    joblib.dump(best, OUT_BEST_MODEL)
    print("Saved best model to:", OUT_BEST_MODEL)

    # Evaluate on test set
    y_pred = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:,1] if hasattr(best, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else float("nan")
    cm = confusion_matrix(y_test, y_pred)

    # Save evaluation report. The title contains a non-ASCII dash, so the
    # encoding is fixed explicitly instead of depending on the locale default.
    with open(OUT_REPORT, "w", encoding="utf-8") as f:
        f.write("Decision Tree — Tuned Model Evaluation\n\n")
        f.write(f"Test shape: {X_test.shape}\n\n")
        f.write(f"Accuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1-score: {f1:.4f}\nROC-AUC: {roc_auc:.4f}\n\n")
        f.write("Confusion Matrix:\n")
        f.write(np.array2string(cm))
        f.write("\n\nClassification Report:\n")
        f.write(classification_report(y_test, y_pred, zero_division=0))
    print("Saved evaluation report to:", OUT_REPORT)

    # Feature importances plot
    fi = pd.Series(best.feature_importances_, index=feature_names).sort_values(ascending=False)
    plt.figure(figsize=(10,6))
    fi.head(20).plot(kind='bar')
    plt.title("Top 20 Feature Importances (Decision Tree - tuned)")
    plt.ylabel("Importance")
    plt.tight_layout()
    plt.savefig(OUT_FI_PNG)
    plt.close()
    print("Saved feature importances to:", OUT_FI_PNG)

    # Save tree visualization (may be large). This stays best-effort, but the
    # figure is always released, even when plotting or saving fails.
    tree_fig = None
    try:
        tree_fig = plt.figure(figsize=(20,12))
        plot_tree(best, feature_names=feature_names, class_names=["No Disease","Disease"],
                  filled=True, rounded=True, fontsize=8)
        plt.tight_layout()
        plt.savefig(OUT_TREE_PNG)
        print("Saved tree visualization to:", OUT_TREE_PNG)
    except Exception as e:
        print("Could not save tree visualization:", e)
    finally:
        if tree_fig is not None:
            plt.close(tree_fig)

    # Quick console summary
    print("\n=== Test set performance (tuned model) ===")
    print(f"Accuracy: {acc:.4f}  Precision: {prec:.4f}  Recall: {rec:.4f}  F1: {f1:.4f}  ROC-AUC: {roc_auc:.4f}")
    print("Done. All outputs in:", BASE_PATH)

In [None]:
# Run the tuning pipeline when this code is executed as the main module
# (which is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

In [None]:
# feature_engineering_heart.py
import pandas as pd
import numpy as np
import os

In [None]:
# === PATHS ===
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base_path = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook that the feature-engineering steps below start from.
file_path = os.path.join(base_path, "heart_disease.xlsx")

In [None]:
# === LOAD DATA ===
# Read the "Heart_disease" sheet of the raw workbook; the cells below modify `df` in place.
df = pd.read_excel(file_path, sheet_name="Heart_disease")

In [None]:
# === HANDLE MISSING VALUES ===
# Impute gaps in oldpeak with the median of the observed values
oldpeak_median = df['oldpeak'].median()
df['oldpeak'] = df['oldpeak'].fillna(oldpeak_median)

In [None]:
# === FIX ANOMALIES ===
# A reading of 0 for trestbps or chol is clinically impossible: treat it as missing,
# then fill with the median of the remaining values.
for col in ['trestbps', 'chol']:
    zero_free = df[col].replace(0, np.nan)
    df[col] = zero_free.fillna(zero_free.median())

In [None]:
# === CONVERT CATEGORICALS ===
# Sex → binary (Male = 1, Female = 0).
# Series.map turns any label missing from the dict into NaN.
df['sex'] = df['sex'].map({'Male':1, 'Female':0})

In [None]:
# Boolean columns (fbs, exang) → integers
# Values are compared as upper-cased text so that real True/False objects sharing an
# object column with strings such as 'TURE' / 'FALSE' are recognised too; looking them
# up against string keys directly would silently turn every real True into 0.
# Numeric columns are left untouched.
for col in ['fbs','exang']:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)
    elif not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col].astype(str).str.strip().str.upper()
            .map({'TRUE':1, 'TURE':1, '1':1, 'FALSE':0, '0':0})
            .fillna(0)      # unrecognised or missing values count as 0
            .astype(int)
        )

In [None]:
# Target → binary: 1 when num is above zero (disease present), otherwise 0
df['target'] = (df['num'] > 0).astype('int64')

In [None]:
# One-hot encode categorical features.
# drop_first=True omits the first level of each encoded column.
categorical_cols = ['cp', 'restecg', 'slope', 'thal']
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Drop original "num": the binary `target` column derived from it replaces it,
# and keeping it would leak the label into the features.
df_processed = df_processed.drop(columns=['num'])

In [None]:
# === SAVE PROCESSED DATASET ===
# Written without the index so the CSV contains only the data columns.
processed_path = os.path.join(base_path, "heart_processed.csv")
df_processed.to_csv(processed_path, index=False)

In [None]:
# Console summary of the feature-engineering step.
print("Feature Engineering completed.")
print("Processed dataset saved to:", processed_path)
print("Final shape:", df_processed.shape)
print("Columns:", df_processed.columns.tolist()[:10], "...")  # preview: first 10 column names only

In [None]:
# model_evaluation_heart.py
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, roc_curve, auc
)
from sklearn.tree import plot_tree

In [None]:
# === PATHS ===
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
BASE_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Feature-engineered dataset to evaluate on.
PROCESSED_CSV = os.path.join(BASE_PATH, "heart_processed.csv")
# Same file name the tuning step uses for its saved best estimator.
MODEL_PATH = os.path.join(BASE_PATH, "decision_tree_best.pkl")

In [None]:
# === OUTPUT FILES ===
# All evaluation artefacts are written into BASE_PATH.
EVAL_TXT = os.path.join(BASE_PATH, "decision_tree_final_evaluation.txt")  # metrics report
CM_PNG = os.path.join(BASE_PATH, "confusion_matrix_final.png")  # confusion-matrix heatmap
ROC_PNG = os.path.join(BASE_PATH, "roc_curve_final.png")  # ROC curve
FI_PNG = os.path.join(BASE_PATH, "feature_importances_final.png")  # feature-importance chart
TREE_PNG = os.path.join(BASE_PATH, "decision_tree_final.png")  # rendering of the tree

In [None]:
# === LOAD MODEL & DATA ===
print("Loading model and data...")
df = pd.read_csv(PROCESSED_CSV)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files you produced yourself or otherwise trust.
model = joblib.load(MODEL_PATH)

In [None]:
# === PREPARE FEATURES ===
# decision_tree_best.pkl is written by the tuning step, which fits it on the training
# part of a stratified split (test_size=0.2, random_state=42). Scoring the model on
# every row would mostly measure data it has already seen, so the same split is
# rebuilt here and only the held-out rows are kept in X / y for the evaluation below.
# Assumes heart_processed.csv still holds the same rows, in the same order, as when
# the model was tuned — TODO confirm if the CSV has been regenerated since.
from sklearn.model_selection import train_test_split

X_all = df.drop(columns=['target'])
y_all = df['target']
X_seen, X, y_seen, y = train_test_split(
    X_all, y_all, test_size=0.2, stratify=y_all, random_state=42
)

In [None]:
# === MAKE PREDICTIONS ===
# Hard class predictions for every row of X.
y_pred = model.predict(X)
# Second predict_proba column, or None when the model exposes no predict_proba.
y_proba = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else None

In [None]:
# === METRICS ===
acc = accuracy_score(y, y_pred)
# Precision, recall and F1 share one call shape; zero_division=0 yields 0 instead of a
# warning when a score is undefined.
prec, rec, f1 = (
    scorer(y, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
# ROC-AUC needs probability scores; report NaN when none are available.
if y_proba is not None:
    roc_auc = roc_auc_score(y, y_proba)
else:
    roc_auc = float("nan")

In [None]:
# Confusion matrix of true labels vs predictions; reused by the report and the heatmap.
cm = confusion_matrix(y, y_pred)

In [None]:
# === SAVE METRICS REPORT ===
# Assemble the report as an ordered list of text chunks, then write them in one go.
report_chunks = [
    "=== Decision Tree Final Evaluation ===\n\n",
    f"Accuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1-score: {f1:.4f}\nROC-AUC: {roc_auc:.4f}\n\n",
    "Confusion Matrix:\n",
    np.array2string(cm),
    "\n\nClassification Report:\n",
    classification_report(y, y_pred, zero_division=0),
]
with open(EVAL_TXT, "w") as f:
    f.writelines(report_chunks)
print("Saved evaluation metrics to:", EVAL_TXT)

In [None]:
# === CONFUSION MATRIX PLOT ===
# Explicit figure/axes handles instead of the implicit pyplot "current figure".
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title("Confusion Matrix — Decision Tree")
cm_ax.set_xlabel("Predicted Label")
cm_ax.set_ylabel("True Label")
cm_fig.tight_layout()
cm_fig.savefig(CM_PNG)
plt.close(cm_fig)
print("Saved confusion matrix plot:", CM_PNG)

In [None]:
# === ROC CURVE ===
# Only drawn when probability scores are available.
if y_proba is not None:
    fpr, tpr, _ = roc_curve(y, y_proba)
    roc_auc_val = auc(fpr, tpr)
    roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
    roc_ax.plot(fpr, tpr, lw=2, label=f"ROC curve (area = {roc_auc_val:.3f})")
    # Diagonal reference line of a random classifier.
    roc_ax.plot([0,1], [0,1], linestyle='--', lw=1, color='grey')
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve — Decision Tree")
    roc_ax.legend(loc="lower right")
    roc_fig.tight_layout()
    roc_fig.savefig(ROC_PNG)
    plt.close(roc_fig)
    print("Saved ROC curve:", ROC_PNG)

In [None]:
# === FEATURE IMPORTANCES ===
# Importances labelled by feature name, largest first; `importances` is reused by the
# console summary further down.
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
top_importances = importances.head(15)
fi_fig, fi_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_importances, y=top_importances.index, ax=fi_ax)
fi_ax.set_title("Top 15 Feature Importances — Decision Tree")
fi_ax.set_xlabel("Importance")
fi_ax.set_ylabel("Feature")
fi_fig.tight_layout()
fi_fig.savefig(FI_PNG)
plt.close(fi_fig)
print("Saved feature importance chart:", FI_PNG)

In [None]:
# === DECISION TREE STRUCTURE ===
plt.figure(figsize=(22,12))
# feature_names is passed as a plain list: some scikit-learn releases validate this
# parameter strictly and reject a pandas Index.
plot_tree(model, feature_names=list(X.columns), class_names=["No Disease", "Disease"],
          filled=True, rounded=True, fontsize=8)
plt.title("Decision Tree Structure — Final Model")
plt.tight_layout()
plt.savefig(TREE_PNG)
plt.close()
print("Saved decision tree visualization:", TREE_PNG)

In [None]:
# === SUMMARY IN CONSOLE ===
print("\n=== Model Performance Summary ===")
# One line per metric, each formatted to four decimals.
for metric_label, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1 Score", f1),
    ("ROC-AUC", roc_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")
print("\nFeature Importances (Top 10):")
print(importances.head(10))
print("\nAll evaluation files are saved in:", BASE_PATH)

In [None]:
# train_decision_tree.py
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
)

In [None]:
# ----------------- CONFIG -----------------
# Working directory of the baseline training step; all paths below are built from it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
BASE_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook; only read when the processed CSV does not exist yet.
RAW_XLSX = os.path.join(BASE_PATH, "heart_disease.xlsx")
# Feature-engineered dataset that ensure_processed() loads or creates.
PROCESSED_CSV = os.path.join(BASE_PATH, "heart_processed.csv")

In [None]:
# Output artefacts of the baseline run, all written into BASE_PATH.
MODEL_OUT = os.path.join(BASE_PATH, "decision_tree_baseline.pkl")  # joblib dump of the fitted tree
REPORT_OUT = os.path.join(BASE_PATH, "decision_tree_evaluation.txt")  # test-set metrics report
CM_PNG = os.path.join(BASE_PATH, "confusion_matrix_baseline.png")  # confusion-matrix image
ROC_PNG = os.path.join(BASE_PATH, "roc_curve_baseline.png")  # ROC curve
TREE_PNG = os.path.join(BASE_PATH, "decision_tree_baseline.png")  # rendering of the tree

In [None]:
# Seed used for the train/test split and for the decision tree.
RANDOM_STATE = 42
# Fraction of rows held out as the test set.
TEST_SIZE = 0.2
# ------------------------------------------

In [None]:
def ensure_processed():
    """Return the processed heart-disease table, creating it from the raw Excel if missing.

    When ``PROCESSED_CSV`` exists it is read and returned as-is. Otherwise the
    ``"Heart_disease"`` sheet of ``RAW_XLSX`` is loaded, cleaned, one-hot
    encoded, written to ``PROCESSED_CSV`` and returned.

    Cleaning applied when building from the raw sheet:
      * missing ``oldpeak`` values are filled with the column median;
      * zeros in ``trestbps`` / ``chol`` are treated as missing and filled
        with the median of the remaining (non-zero) values;
      * ``sex`` is mapped to 1 (Male) / 0 (Female);
      * ``fbs`` / ``exang`` are coerced to 0/1 integers;
      * ``target`` is 1 where ``num > 0`` and 0 otherwise, and ``num`` is dropped;
      * ``cp``, ``restecg``, ``slope`` and ``thal`` (those present) are one-hot
        encoded with the first level dropped.

    Returns:
        pandas.DataFrame: the processed dataset.
    """
    if os.path.exists(PROCESSED_CSV):
        print(f"Found processed file: {PROCESSED_CSV}")
        return pd.read_csv(PROCESSED_CSV)

    def _flag_to_int(flags):
        """Coerce a boolean-like column (bools, numbers or text) to 0/1 integers."""
        if flags.dtype == bool:
            return flags.astype(int)
        if pd.api.types.is_numeric_dtype(flags):
            # Already numeric: non-zero means True, missing means False.
            return flags.fillna(0).ne(0).astype(int)
        # Compare on upper-cased text so real True/False objects sitting in a
        # mixed object column are recognised alongside strings such as 'TURE'.
        # Anything unrecognised (including missing values) falls back to 0.
        as_text = flags.astype(str).str.strip().str.upper()
        return (
            as_text.map({'TRUE': 1, 'TURE': 1, '1': 1, 'FALSE': 0, '0': 0})
            .fillna(0)
            .astype(int)
        )

    print("Processed CSV not found — creating from raw Excel (light feature engineering)...")
    df = pd.read_excel(RAW_XLSX, sheet_name="Heart_disease")
    # Basic fixes similar to previous step
    df['oldpeak'] = df['oldpeak'].fillna(df['oldpeak'].median())
    for col in ['trestbps', 'chol']:
        # Drop the zeros first so they cannot pull the median down.
        without_zeros = df[col].replace(0, np.nan)
        df[col] = without_zeros.fillna(without_zeros.median())
    df['sex'] = df['sex'].map({'Male':1, 'Female':0})
    for col in ['fbs','exang']:
        df[col] = _flag_to_int(df[col])
    df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
    cat_cols = [c for c in ['cp','restecg','slope','thal'] if c in df.columns]
    df_processed = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    if 'num' in df_processed.columns:
        df_processed = df_processed.drop(columns=['num'])
    df_processed.to_csv(PROCESSED_CSV, index=False)
    print("Saved processed CSV to:", PROCESSED_CSV)
    return df_processed

In [None]:
def train_and_evaluate(df):
    """Fit a baseline decision tree and save the model, a report and diagnostic plots.

    The data is split into stratified train/test parts (``TEST_SIZE``,
    ``RANDOM_STATE``), a default ``DecisionTreeClassifier`` is fitted on the
    training part and scored on the test part. The function writes the model
    (``MODEL_OUT``), a text report (``REPORT_OUT``), a confusion-matrix image
    (``CM_PNG``), a ROC curve (``ROC_PNG``) and a rendering of the tree
    (``TREE_PNG``), then prints a one-line metrics summary.

    Args:
        df: processed table containing a binary ``target`` column; every
            other column is used as a feature.

    Raises:
        ValueError: if ``df`` has no ``target`` column.
    """
    # prepare X and y
    if 'target' not in df.columns:
        raise ValueError("No 'target' column found in processed data.")
    X = df.drop(columns=['target'])
    y = df['target']

    # stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
    )

    print("Train/test sizes:", X_train.shape, X_test.shape)

    # baseline Decision Tree
    clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)

    # predictions & probs
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else None

    # metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else float("nan")

    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # save model
    joblib.dump(clf, MODEL_OUT)

    # write evaluation report. The title contains a non-ASCII dash, so the
    # encoding is fixed explicitly instead of depending on the locale default.
    with open(REPORT_OUT, "w", encoding="utf-8") as f:
        f.write("Decision Tree — Baseline Evaluation\n")
        f.write(f"Train shape: {X_train.shape}\nTest shape: {X_test.shape}\n\n")
        f.write(f"Accuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1-score: {f1:.4f}\nROC-AUC: {roc_auc:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(classification_report(y_test, y_pred, zero_division=0))
    print("Saved model to:", MODEL_OUT)
    print("Saved evaluation report to:", REPORT_OUT)

    # plot confusion matrix
    plt.figure(figsize=(5,4))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ["No Disease", "Disease"], rotation=45)
    plt.yticks(tick_marks, ["No Disease", "Disease"])
    # Cells darker than half the maximum count get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(CM_PNG)
    plt.close()
    print("Saved confusion matrix to:", CM_PNG)

    # ROC curve
    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc_val = auc(fpr, tpr)
        plt.figure(figsize=(6,5))
        plt.plot(fpr, tpr, lw=2, label=f'ROC curve (area = {roc_auc_val:.3f})')
        plt.plot([0,1], [0,1], linestyle='--', lw=1)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.savefig(ROC_PNG)
        plt.close()
        print("Saved ROC curve to:", ROC_PNG)
    else:
        print("No probability estimates available; skipped ROC curve.")

    # Save a visual of the decision tree (may be large). This stays best-effort,
    # but the figure is always released, even when plotting or saving fails.
    tree_fig = None
    try:
        tree_fig = plt.figure(figsize=(18,10))
        # Plain list of names: some scikit-learn releases reject a pandas Index here.
        plot_tree(clf, feature_names=list(X.columns), class_names=["No","Yes"], filled=True, rounded=True, fontsize=8)
        plt.tight_layout()
        plt.savefig(TREE_PNG)
        print("Saved decision tree visualization to:", TREE_PNG)
    except Exception as e:
        print("Could not save tree visualization:", e)
    finally:
        if tree_fig is not None:
            plt.close(tree_fig)

    # Print summary to console
    print("\n=== Metrics summary ===")
    print(f"Accuracy: {acc:.4f}  Precision: {prec:.4f}  Recall: {rec:.4f}  F1: {f1:.4f}  ROC-AUC: {roc_auc:.4f}")

In [None]:
# Load (or build) the processed data, then train and evaluate the baseline tree.
if __name__ == "__main__":
    df_proc = ensure_processed()
    train_and_evaluate(df_proc)

In [None]:
# decision_tree_assignment.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

In [None]:
# === PATHS ===
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base_path = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook this script starts from.
file_path = os.path.join(base_path, "heart_disease.xlsx")

In [None]:
# === LOAD DATA ===
# Read the "Heart_disease" sheet of the raw workbook; the cells below modify `df` in place.
df = pd.read_excel(file_path, sheet_name="Heart_disease")

In [None]:
# === CLEANING ===
# Convert boolean-like columns to 0/1 integers.
# Values are compared as upper-cased text so that real True/False objects sharing an
# object column with strings such as 'TURE' / 'FALSE' are recognised; looking them up
# against string keys directly would turn them into NaN. Unrecognised or missing values
# become 0, as in the other preprocessing steps of this notebook. Numeric columns are
# left untouched.
for col in ['fbs', 'exang']:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)
    elif not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col].astype(str).str.strip().str.upper()
            .map({'TRUE':1, 'TURE':1, '1':1, 'FALSE':0, '0':0})  # fix odd strings
            .fillna(0)
            .astype(int)
        )

In [None]:
# Sex mapping (Male = 1, Female = 0).
# Series.map turns any label missing from the dict into NaN.
df['sex'] = df['sex'].map({'Male':1, 'Female':0})

In [None]:
# Target binary: 1 when num is above zero, otherwise 0
df['target'] = (df['num'] > 0).astype('int64')

In [None]:
# One-hot encode categorical vars.
# drop_first=True omits the first level of each encoded column.
ohe_cols = ['cp','restecg','slope','thal']
df_processed = pd.get_dummies(df, columns=ohe_cols, drop_first=True)

In [None]:
# Drop original num column: `target` was derived from it, so keeping it
# would leak the label into the features.
df_processed = df_processed.drop(columns=['num'])

In [None]:
# === TRAIN/TEST SPLIT ===
# Features are every column except the label.
X = df_processed.drop(columns=['target'])
y = df_processed['target']

In [None]:
# Hold out 20% of the rows; stratify=y keeps the class proportions the same in both
# parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# === BASELINE MODEL ===
# Decision tree with default hyper-parameters; only the seed is fixed.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = clf.predict(X_test)
# Second predict_proba column, used as the score for ROC-AUC.
y_proba = clf.predict_proba(X_test)[:,1]

In [None]:
# === EVALUATION ===
# The label-based scores share one call shape; ROC-AUC is added last because it is
# computed from the probability scores instead of the hard predictions.
metrics = {
    label: scorer(y_test, y_pred)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
}
metrics["ROC-AUC"] = roc_auc_score(y_test, y_proba)

In [None]:
# Confusion Matrix of true test labels vs predictions; plotted as a heatmap further down.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# === SAVE RESULTS ===
# Processed dataset
# NOTE(review): this is the same heart_processed.csv that the feature-engineering step
# writes, so it is overwritten here — by a table that went through this script's
# cleaning only (no oldpeak / trestbps / chol imputation). Confirm this is intended.
processed_path = os.path.join(base_path, "heart_processed.csv")
df_processed.to_csv(processed_path, index=False)

In [None]:
# Model: persist the fitted baseline tree with joblib.
model_path = os.path.join(base_path, "decision_tree_model.pkl")
joblib.dump(clf, model_path)

In [None]:
# Metrics report: header, one "name: value" line per metric, then the per-class report.
report_path = os.path.join(base_path, "decision_tree_report.txt")
report_lines = ["=== Decision Tree Evaluation ===\n"]
report_lines += [f"{k}: {v:.4f}\n" for k, v in metrics.items()]
report_lines += ["\nClassification Report:\n", classification_report(y_test, y_pred)]
with open(report_path, "w") as f:
    f.writelines(report_lines)

In [None]:
# Confusion matrix plot, drawn on explicit figure/axes handles
cm_fig, cm_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix")
cm_path = os.path.join(base_path, "confusion_matrix.png")
cm_fig.savefig(cm_path)
plt.close(cm_fig)

In [None]:
# Decision Tree visualization
plt.figure(figsize=(20,10))
# feature_names is passed as a plain list: some scikit-learn releases validate this
# parameter strictly and reject a pandas Index.
plot_tree(clf, feature_names=list(X.columns), class_names=["No Disease","Disease"],
          filled=True, rounded=True, fontsize=8)
tree_path = os.path.join(base_path, "decision_tree.png")
plt.savefig(tree_path)
plt.close()

In [None]:
# Console summary listing every file this script wrote.
print("All done! Files saved in:", base_path)
print("Processed CSV:", processed_path)
print("Model:", model_path)
print("Report:", report_path)
print("Plots:", cm_path, "and", tree_path)

In [None]:
# eda_heart_disease.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# === PATHS ===
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base_path = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook explored below; the plots are saved next to it in base_path.
file_path = os.path.join(base_path, "heart_disease.xlsx")

In [None]:
# === LOAD DATA ===
# Read the "Heart_disease" sheet of the raw workbook for exploration.
df = pd.read_excel(file_path, sheet_name="Heart_disease")

In [None]:
# === BASIC INFO ===
print("Shape:", df.shape)
print("\n--- Info ---")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing values per column:\n", df.isnull().sum())
print("\nDuplicate rows:", df.duplicated().sum())
print("\nDescriptive statistics:\n", df.describe().T)

In [None]:
# === TARGET DISTRIBUTION ===
# Row count for each distinct value of the raw `num` column.
print("\nTarget distribution:\n", df['num'].value_counts())

In [None]:
# === HISTOGRAMS ===
# Columns treated as numeric in the plots below; the boxplot and correlation cells
# reuse this list.
numeric_cols = ['age','trestbps','chol','thalch','oldpeak']
# One 20-bin histogram per column.
df[numeric_cols].hist(bins=20, figsize=(12,8))
plt.suptitle("Histograms of Numeric Features")
plt.savefig(os.path.join(base_path, "histograms.png"))
plt.close()

In [None]:
# === BOXPLOTS ===
# One panel per numeric column on a 2x3 grid; slots without a column stay empty.
box_fig = plt.figure(figsize=(12, 8))
for i, col in enumerate(numeric_cols, 1):
    box_ax = box_fig.add_subplot(2, 3, i)
    sns.boxplot(y=df[col], ax=box_ax)
    box_ax.set_title(f"Boxplot of {col}")
box_fig.tight_layout()
box_fig.savefig(os.path.join(base_path, "boxplots.png"))
plt.close(box_fig)

In [None]:
# === CORRELATION MATRIX ===
# Pairwise correlations between the numeric features and the raw `num` label.
corr = df[[*numeric_cols, 'num']].corr()
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
corr_fig.savefig(os.path.join(base_path, "correlation_matrix.png"))
plt.close(corr_fig)

In [None]:
# Closing message followed by the list of plot files, one per line.
print("\nEDA completed. Plots saved in:", base_path)
print("- histograms.png", "- boxplots.png", "- correlation_matrix.png", sep="\n")

In [None]:
# feature_engineering_heart.py
import pandas as pd
import numpy as np
import os

In [None]:
# === PATHS ===
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
base_path = r"D:\DATA SCIENCE\ASSIGNMENTS\13 decision tree\Decision Tree"
# Raw Excel workbook that the feature-engineering steps below start from.
file_path = os.path.join(base_path, "heart_disease.xlsx")

In [None]:
# === LOAD DATA ===
# Read the "Heart_disease" sheet of the raw workbook; the cells below modify `df` in place.
df = pd.read_excel(file_path, sheet_name="Heart_disease")

In [None]:
# === HANDLE MISSING VALUES ===
# Impute gaps in oldpeak with the median of the observed values
oldpeak_median = df['oldpeak'].median()
df['oldpeak'] = df['oldpeak'].fillna(oldpeak_median)

In [None]:
# === FIX ANOMALIES ===
# A reading of 0 for trestbps or chol is clinically impossible: treat it as missing,
# then fill with the median of the remaining values.
for col in ['trestbps', 'chol']:
    zero_free = df[col].replace(0, np.nan)
    df[col] = zero_free.fillna(zero_free.median())

In [None]:
# === CONVERT CATEGORICALS ===
# Sex → binary (Male = 1, Female = 0).
# Series.map turns any label missing from the dict into NaN.
df['sex'] = df['sex'].map({'Male':1, 'Female':0})

In [None]:
# Boolean columns (fbs, exang) → integers
# Values are compared as upper-cased text so that real True/False objects sharing an
# object column with strings such as 'TURE' / 'FALSE' are recognised too; looking them
# up against string keys directly would silently turn every real True into 0.
# Numeric columns are left untouched.
for col in ['fbs','exang']:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)
    elif not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col].astype(str).str.strip().str.upper()
            .map({'TRUE':1, 'TURE':1, '1':1, 'FALSE':0, '0':0})
            .fillna(0)      # unrecognised or missing values count as 0
            .astype(int)
        )

In [None]:
# Target → binary: 1 when num is above zero (disease present), otherwise 0
df['target'] = (df['num'] > 0).astype('int64')

In [None]:
# One-hot encode categorical features.
# drop_first=True omits the first level of each encoded column.
categorical_cols = ['cp', 'restecg', 'slope', 'thal']
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Drop original "num": the binary `target` column derived from it replaces it,
# and keeping it would leak the label into the features.
df_processed = df_processed.drop(columns=['num'])

In [None]:
# === SAVE PROCESSED DATASET ===
# Written without the index so the CSV contains only the data columns.
processed_path = os.path.join(base_path, "heart_processed.csv")
df_processed.to_csv(processed_path, index=False)

In [None]:
# Console summary of the feature-engineering step.
print("Feature Engineering completed.")
print("Processed dataset saved to:", processed_path)
print("Final shape:", df_processed.shape)
print("Columns:", df_processed.columns.tolist()[:10], "...")  # preview: first 10 column names only