In [None]:
# DECISION TREES 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
file_path = 'Resources/Stroke_dataset.csv'
df_stroke_cleaned = pd.read_csv(file_path)

# Step 2: Handle missing values for 'bmi' using mean imputation
imputer = SimpleImputer(strategy='mean')
df_stroke_cleaned['bmi'] = imputer.fit_transform(df_stroke_cleaned[['bmi']])

# Step 3: Encode categorical variables
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    df_stroke_cleaned[col] = encoder.fit_transform(df_stroke_cleaned[col])

# Step 4: Define features and target variable
X = df_stroke_cleaned.drop(columns=['id', 'stroke'])  # Exclude 'id' column
y = df_stroke_cleaned['stroke']

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Step 6: Train a Decision Tree Classifier
clf = DecisionTreeClassifier(max_depth=5, random_state=42)  # Limit depth to avoid overfitting
clf.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)

# Display the classification report
print(report)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1458
           1       0.20      0.01      0.03        75

    accuracy                           0.95      1533
   macro avg       0.58      0.51      0.50      1533
weighted avg       0.91      0.95      0.93      1533

