In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [51]:
df = pd.read_csv('stroke_data.csv')
# print(df.head())

In [52]:
df.isnull().sum()

sex                  3
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [53]:
print(df['sex'].unique())

[ 1.  0. nan]


In [54]:
df = df.dropna(subset=['sex'])

In [55]:
df.isnull().sum()

sex                  0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [56]:
df.value_counts('stroke')

stroke
1    20460
0    20447
Name: count, dtype: int64

In [57]:
print("BEFORE REMOVING NEGATIVE AGE : \n", df['age'].count())
negativeAge = df[df['age']<0]

print("\n\nNEGATIVE AGE VALUEsS : ")
print(negativeAge['age'].count())

df = df[df['age']>=0]
print("\n\nAFTER REMOVING NEGATIVE AGE :\n",df['age'].count())

BEFORE REMOVING NEGATIVE AGE : 
 40907


NEGATIVE AGE VALUEsS : 
58


AFTER REMOVING NEGATIVE AGE :
 40849


In [58]:
X = df.drop("stroke", axis = 1)
Y = df["stroke"]

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [60]:
#Standardize the data, it'll help PCA to work better
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [61]:
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, Y_train)

# Predict on the test data
y_pred_xgb = xgb_model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [62]:
# Accuracy
accuracy = accuracy_score(Y_test, y_pred_xgb)
print(f"Accuracy: {accuracy:.4f}")

# Classification report (Precision, Recall, F1-Score)
print("Classification Report:")
print(classification_report(Y_test, y_pred_xgb))

Accuracy: 0.5665
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.98      0.69      6121
           1       0.89      0.15      0.26      6134

    accuracy                           0.57     12255
   macro avg       0.71      0.57      0.48     12255
weighted avg       0.71      0.57      0.48     12255



In [63]:
# Calculate AUC-ROC score
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class
auc_xgb = roc_auc_score(Y_test, y_prob_xgb)
print(f"AUC-ROC Score: {auc_xgb:.4f}")

AUC-ROC Score: 0.7616
