In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("../datasets/healthcare-dataset-stroke-data.csv")

# Display first 5 rows
print("First 5 rows:")
print(df.head())

# Basic info
print("\nDataset Info:")
print(df.info())

# Check missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check duplicates
print("\nNumber of Duplicates:", df.duplicated().sum())

# Class distribution
print("\nClass Distribution (stroke column):")
print(df['stroke'].value_counts(normalize=True))


First 5 rows:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  

Dataset Info:
<class 'pandas.core.frame.D

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset again
df = pd.read_csv("../datasets/healthcare-dataset-stroke-data.csv")

# 1. Handle Missing Values
# Fill missing BMI values with median
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# 2. Remove Duplicates
df = df.drop_duplicates()

# 3. Encode Categorical Variables
cat_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# 4. Normalize/Scale Numerical Values
scaler = StandardScaler()
num_cols = ['age', 'avg_glucose_level', 'bmi']

df[num_cols] = scaler.fit_transform(df[num_cols])

# Final check
print("After Cleaning & Preprocessing:")
print(df.head())
print("\nDataset shape:", df.shape)
print("\nMissing values:", df.isnull().sum().sum())


After Cleaning & Preprocessing:
      id  gender       age  hypertension  heart_disease  ever_married  \
0   9046       1  1.051434             0              1             1   
1  51676       0  0.786070             0              0             1   
2  31112       1  1.626390             0              1             1   
3  60182       0  0.255342             0              0             1   
4   1665       0  1.582163             1              0             1   

   work_type  Residence_type  avg_glucose_level       bmi  smoking_status  \
0          2               1           2.706375  1.005086               1   
1          3               0           2.121559 -0.098981               2   
2          2               0          -0.005028  0.472536               2   
3          2               1           1.437358  0.719327               3   
4          3               0           1.501184 -0.631531               2   

   stroke  
0       1  
1       1  
2       1  
3       1  
4     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Separate features and target
X = df.drop("stroke", axis=1)   # Features
y = df["stroke"]                # Target

# 2. Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Train SVM Model
svm_model = SVC(kernel="rbf", random_state=42)  # RBF kernel (default)
svm_model.fit(X_train, y_train)

# 4. Predictions
y_pred = svm_model.predict(X_test)

# 5. Evaluation
print("🔹 SVM Model Performance")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


🔹 SVM Model Performance
Accuracy: 0.9510763209393346

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Dictionary to store model results
results = {}

# 1. Support Vector Machine
svm_model = SVC(kernel="rbf", random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
results["SVM (RBF)"] = accuracy_score(y_test, y_pred_svm)

# Different Types of Decision Trees

# Decision Tree - Gini
dt_gini = DecisionTreeClassifier(criterion="gini", random_state=42)
dt_gini.fit(X_train, y_train)
y_pred_gini = dt_gini.predict(X_test)
results["Decision Tree (Gini)"] = accuracy_score(y_test, y_pred_gini)

# Decision Tree - Entropy
dt_entropy = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_entropy.fit(X_train, y_train)
y_pred_entropy = dt_entropy.predict(X_test)
results["Decision Tree (Entropy)"] = accuracy_score(y_test, y_pred_entropy)

# Decision Tree - Log Loss
dt_log = DecisionTreeClassifier(criterion="log_loss", random_state=42)
dt_log.fit(X_train, y_train)
y_pred_log = dt_log.predict(X_test)
results["Decision Tree (Log Loss)"] = accuracy_score(y_test, y_pred_log)

# Tree-Based Ensemble Models

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
results["Random Forest"] = accuracy_score(y_test, y_pred_rf)

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
results["Gradient Boosting"] = accuracy_score(y_test, y_pred_gb)

# ------------------- Convert to Table -------------------
comparison_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
print(comparison_df)


                      Model  Accuracy
0                 SVM (RBF)  0.951076
1      Decision Tree (Gini)  0.910959
2   Decision Tree (Entropy)  0.913894
3  Decision Tree (Log Loss)  0.913894
4             Random Forest  0.948141
5         Gradient Boosting  0.947162
