# Decision Tree Classifier

In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [8]:
# Load the data, tune a decision tree with a cross-validated grid search,
# and score the best model on a hold-out validation split.
from sklearn.model_selection import KFold  # imported here so this cell runs on its own

# Seeds for every stochastic step. The split and CV seeds keep the values this
# cell has always used, so the train/validation partition and folds are unchanged.
SPLIT_SEED = 42
CV_SEED = 0
# The tree itself also needs a seed: scikit-learn permutes the candidate features
# at each split, so an unseeded tree can change between runs on tied splits.
MODEL_SEED = 42

# Read the training data (indexed by "Id") and drop exact duplicate rows.
health_data = (
    pd.read_csv("../data/data_train.csv", delimiter=",", index_col="Id")
    .drop_duplicates()
)

selected_features = ['HighBP', 'HighChol', 'BMI',
       'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 
       'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income',
       'ExtraMedTest', 'ExtraAlcoholTest']
X = health_data[selected_features].copy()
y = health_data.Status

# Hold out 20% of the rows for the final check; the grid search only sees the rest.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
kf = KFold(n_splits=10, shuffle=True, random_state=CV_SEED)

# Single-step pipeline. The step name 'classifier' is what the
# 'classifier__' prefixes in the parameter grid below refer to.
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier(random_state=MODEL_SEED))
])

# Hyper-parameter grid: 6 * 3 * 3 * 4 = 216 candidates, each fitted on 10 folds.
param_grid = {
    'classifier__max_depth': [None, 5, 8, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_leaf_nodes': [None, 10, 50, 100],
}

# Exhaustive search scored by F1, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1, scoring='f1')

# Fit every candidate on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning hyper-parameters.
print("Best parameters:", grid_search.best_params_)

# Score the refitted best model on the untouched validation split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_valid)
f1 = f1_score(y_valid, y_pred)
print("F1 Score:", f1)

# Result recorded from an earlier run, made before the classifier seed was fixed;
# re-run this cell to refresh it:
# Best parameters: {'classifier__max_depth': 10, 'classifier__max_leaf_nodes': 50, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
# F1 Score: 0.8695585764472278

Best parameters: {'classifier__max_depth': 10, 'classifier__max_leaf_nodes': 50, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
F1 Score: 0.8695585764472278
