# In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Function
def preprocess_data(X, categorical_features=None, numerical_features=None):
    """
    Build an unfitted preprocessing transformer for a tabular dataset.

    Numerical columns are mean-imputed and then standardized. Categorical
    columns are imputed with their most frequent value and then one-hot
    encoded; categories not seen while fitting are ignored at transform time.

    Parameters
    ----------
    X : object
        Not used. Kept so existing callers that pass the dataset keep working;
        the transformer is built from the column selections alone.
    categorical_features : column selector, optional
        Columns routed through the categorical pipeline. ``None`` (default)
        selects no columns.
    numerical_features : column selector, optional
        Columns routed through the numerical pipeline. ``None`` (default)
        selects no columns.

    Returns
    -------
    ColumnTransformer
        The transformer, NOT yet fitted: call ``fit`` / ``fit_transform`` on
        the training data. No ``remainder`` is configured, so columns listed
        in neither selection follow ColumnTransformer's default handling
        (dropped, per the scikit-learn documentation).
    """
    # Use a fresh list per call instead of a shared mutable default: the
    # selection object is stored inside the returned ColumnTransformer.
    if categorical_features is None:
        categorical_features = []
    if numerical_features is None:
        numerical_features = []

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    return preprocessor

# Dataset Preparation Function
def prepare_data(X, y, test_size=0.2, random_state=42, categorical_features=None, numerical_features=None):
    """
    Split the dataset and preprocess the feature matrices.

    Parameters
    ----------
    X : features, indexable by the given column selections
    y : target values aligned with ``X``
    test_size : passed to ``train_test_split`` (default 0.2)
    random_state : seed passed to ``train_test_split`` (default 42)
    categorical_features : column selector, optional
        Columns to impute and one-hot encode. ``None`` selects no columns.
    numerical_features : column selector, optional
        Columns to impute and standardize. ``None`` selects no columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The two ``X`` entries are the
        transformer's output; the two ``y`` entries are only split, not
        transformed.
    """
    # Normalise here (not only in preprocess_data) so no None selection is
    # ever forwarded and no mutable default is shared between calls.
    if categorical_features is None:
        categorical_features = []
    if numerical_features is None:
        numerical_features = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    preprocessor = preprocess_data(X, categorical_features, numerical_features)

    # Fit on the training split only; the test split is transformed with the
    # statistics learned from training data.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test

# Decision Tree Model Class
class DecisionTreeModel:
    """
    Thin wrapper around ``DecisionTreeClassifier`` with evaluation helpers.

    Parameters
    ----------
    max_depth : passed to ``DecisionTreeClassifier`` (``None`` by default)
    random_state : passed to ``DecisionTreeClassifier``. The default ``None``
        keeps the previous unseeded behaviour; pass an int for reproducible
        trees.
    """

    def __init__(self, max_depth=None, random_state=None):
        # The underlying estimator is exposed as ``self.model``.
        self.model = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)

    def train(self, X_train, y_train):
        """Fit the wrapped classifier on the training data. Returns None."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        return self.model.predict(X_test)

    def evaluate(self, y_test, y_pred):
        """
        Compare predictions with the true labels.

        Returns a ``(report, cm)`` tuple: the output of
        ``classification_report`` and the output of ``confusion_matrix``.
        """
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        return report, cm

    def plot_confusion_matrix(self, cm):
        """
        Draw ``cm`` as an annotated seaborn heatmap and show it.

        Cells are annotated with integer formatting (``fmt='d'``). Drawing
        goes through the pyplot interface and ends with ``plt.show()``.
        """
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()

# Usage Example Function
def run_decision_tree_model(X, y, categorical_features=None, numerical_features=None, max_depth=None):
    """
    Train and evaluate a decision tree end to end.

    Splits and preprocesses the data with ``prepare_data`` (using its default
    ``test_size`` and ``random_state``), fits a ``DecisionTreeModel``, prints
    the classification report, and displays the confusion-matrix plot.

    Parameters
    ----------
    X : features
    y : target values
    categorical_features : column selector, optional
        Columns treated as categorical. ``None`` selects no columns.
    numerical_features : column selector, optional
        Columns treated as numerical. ``None`` selects no columns.
    max_depth : maximum tree depth forwarded to ``DecisionTreeModel``

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    # Replace None with fresh lists so no mutable default is shared between
    # calls and no None selection is forwarded downstream.
    if categorical_features is None:
        categorical_features = []
    if numerical_features is None:
        numerical_features = []

    X_train, X_test, y_train, y_test = prepare_data(X, y, categorical_features=categorical_features, numerical_features=numerical_features)

    dt_model = DecisionTreeModel(max_depth=max_depth)
    dt_model.train(X_train, y_train)

    y_pred = dt_model.predict(X_test)

    report, cm = dt_model.evaluate(y_test, y_pred)
    print("Classification Report:\n", report)

    dt_model.plot_confusion_matrix(cm)

# Sample usage with custom dataset
# df = pd.read_csv('your_dataset.csv')
# X = df.drop('target_column', axis=1)
# y = df['target_column']
# categorical_features = ['col1', 'col2']
# numerical_features = ['col3', 'col4']
# run_decision_tree_model(X, y, categorical_features, numerical_features, max_depth=5)
