In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the stroke dataset CSV into a DataFrame and preview its first rows.
csv_path = '/content/sample_data/stroke_prediction_dataset.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Remove the columns this notebook treats as irrelevant for modelling.
irrelevant_columns = [
    "Patient ID",
    "Patient Name",
    "Marital Status",
    "Work Type",
    "Residence Type",
    "Dietary Habits",
    "Symptoms",
]
data = data.drop(columns=irrelevant_columns)

In [None]:
# Encode the Diagnosis target as integers: 'Stroke' -> 0, 'No Stroke' -> 1.
# NOTE(review): with this encoding the label 1 means "No Stroke", so any
# "positive class" metric computed on label 1 describes the No-Stroke class.
# Confirm this is intended.
#
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on the `data['Diagnosis']` selection is chained assignment: pandas warns about
# it and, under Copy-on-Write, it no longer updates `data` at all.
# infer_objects() yields a numeric dtype without relying on replace()'s
# deprecated silent downcasting.
data['Diagnosis'] = (
    data['Diagnosis']
    .replace({'Stroke': 0, 'No Stroke': 1})
    .infer_objects()
)

In [None]:
# Encode Gender as integers: 'Male' -> 0, 'Female' -> 1.
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment (warned about by pandas and a
# no-op on `data` under Copy-on-Write). infer_objects() yields a numeric dtype
# without relying on replace()'s deprecated silent downcasting.
data['Gender'] = (
    data['Gender']
    .replace({'Male': 0, 'Female': 1})
    .infer_objects()
)

In [None]:
# Encode Smoking Status as integers:
#   'Non-smoker' -> 0, 'Formerly Smoked' -> 1, 'Currently Smokes' -> 2.
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment (warned about by pandas and a
# no-op on `data` under Copy-on-Write). infer_objects() yields a numeric dtype
# without relying on replace()'s deprecated silent downcasting.
data['Smoking Status'] = (
    data['Smoking Status']
    .replace({'Non-smoker': 0, 'Formerly Smoked': 1, 'Currently Smokes': 2})
    .infer_objects()
)

In [None]:
# Encode Alcohol Intake as integers:
#   'Never' -> 0, 'Rarely' -> 1, 'Social Drinker' -> 2, 'Frequent Drinker' -> 3.
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment (warned about by pandas and a
# no-op on `data` under Copy-on-Write). infer_objects() yields a numeric dtype
# without relying on replace()'s deprecated silent downcasting.
data['Alcohol Intake'] = (
    data['Alcohol Intake']
    .replace({'Never': 0, 'Rarely': 1, 'Social Drinker': 2, 'Frequent Drinker': 3})
    .infer_objects()
)

In [None]:
# Encode Physical Activity as integers: 'Low' -> 0, 'Moderate' -> 1, 'High' -> 2.
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment (warned about by pandas and a
# no-op on `data` under Copy-on-Write). infer_objects() yields a numeric dtype
# without relying on replace()'s deprecated silent downcasting.
data['Physical Activity'] = (
    data['Physical Activity']
    .replace({'Low': 0, 'Moderate': 1, 'High': 2})
    .infer_objects()
)

In [None]:
# Encode Family History of Stroke as integers: 'Yes' -> 0, 'No' -> 1.
# NOTE(review): 'Yes' maps to 0 here, the reverse of the usual 1 = present
# convention. Confirm this is intended.
#
# The result is assigned back instead of using replace(..., inplace=True) on the
# column selection, which is chained assignment (warned about by pandas and a
# no-op on `data` under Copy-on-Write). infer_objects() yields a numeric dtype
# without relying on replace()'s deprecated silent downcasting.
data['Family History of Stroke'] = (
    data['Family History of Stroke']
    .replace({'Yes': 0, 'No': 1})
    .infer_objects()
)

In [None]:
# Split the 'Blood Pressure Levels' text on '/' into two integer columns
# (first part -> 'Systolic BP', second part -> 'Diastolic BP'), drop the text
# column, then min-max scale both new columns.
bp_text = data['Blood Pressure Levels']
for part_index, bp_column in enumerate(['Systolic BP', 'Diastolic BP']):
    # Bind part_index as a default argument so each lambda keeps its own index.
    data[bp_column] = bp_text.apply(lambda reading, i=part_index: int(reading.split('/')[i]))
data.drop(columns='Blood Pressure Levels', inplace=True)

# Min-max scaling: (value - column min) / (column max - column min).
for bp_column in ['Systolic BP', 'Diastolic BP']:
    lowest = data[bp_column].min()
    highest = data[bp_column].max()
    data[bp_column] = (data[bp_column] - lowest) / (highest - lowest)


In [None]:
# Bucket Age into five ordinal bands and replace the Age column with them:
#   0: Age <= 20
#   1: 20 < Age <= 32
#   2: 32 < Age <= 48
#   3: 48 < Age <= 64
#   4: Age > 64
# pd.cut builds the band codes directly. The earlier version seeded the new
# column with the `np.zeros` function object (not an array of zeros) and then
# needed an astype(str).astype(int) round trip to get an integer dtype back.
age_edges = [-np.inf, 20, 32, 48, 64, np.inf]
# right=True makes each bin upper-inclusive; labels=False returns the bin index.
age_codes = pd.cut(data['Age'], bins=age_edges, right=True, labels=False)
# astype(int) raises if any Age is missing, so bad rows cannot pass silently.
data.insert(2, 'age_band', age_codes.astype(int))
data.drop(columns="Age", inplace=True)

In [None]:
# Normalize three numeric columns by dividing each by its largest absolute value.
column = 'Average Glucose Level'
column2 = 'Body Mass Index (BMI)'
column3 = 'Stress Levels'
for scaled_column in (column, column2, column3):
    peak = data[scaled_column].abs().max()
    data[scaled_column] = data[scaled_column] / peak

# view normalized data
# display(data[column], data[column2],data[column3])

In [None]:
# Parse the 'Cholesterol Levels' text into integer 'HDL' and 'LDL' columns.
# Each value is split on ', ' into fields, and the integer after ': ' in a field
# is kept (first field -> HDL, second field -> LDL).
def _cholesterol_field(text, position):
    """Return the integer following ': ' in the `position`-th ', '-separated field."""
    field = text.split(', ')[position]
    return int(field.split(': ')[1])


cholesterol_text = data['Cholesterol Levels']
data['HDL'] = cholesterol_text.apply(_cholesterol_field, args=(0,))
data['LDL'] = cholesterol_text.apply(_cholesterol_field, args=(1,))

# The original text column is no longer needed once both parts are extracted.
data.drop(columns='Cholesterol Levels', inplace=True)

# Min-max scaling: (value - column min) / (column max - column min).
for lipid_column in ['HDL', 'LDL']:
    lowest = data[lipid_column].min()
    highest = data[lipid_column].max()
    data[lipid_column] = (data[lipid_column] - lowest) / (highest - lowest)


In [None]:
# Preview the first five rows of the processed frame.
data.head(n=5)

Unnamed: 0,Gender,age_band,Hypertension,Heart Disease,Average Glucose Level,Body Mass Index (BMI),Smoking Status,Alcohol Intake,Physical Activity,Stroke History,Family History of Stroke,Stress Levels,Diagnosis,Systolic BP,Diastolic BP,HDL,LDL
0,0,3,0,1,0.65455,0.55925,0,2,1,0,0,0.348,0,0.555556,0.96,0.76,0.561538
1,0,4,0,0,0.91865,0.81425,0,0,0,0,1,0.173,0,0.622222,0.62,0.66,0.076923
2,0,1,1,1,0.945,0.508,1,1,2,0,0,0.731,0,0.711111,0.74,0.58,0.269231
3,0,4,0,0,0.92645,0.6875,0,3,1,0,1,0.535,1,0.933333,0.42,0.8,0.592308
4,0,3,1,1,0.8867,0.7265,2,1,0,0,0,0.684,0,0.344444,0.7,0.7,0.061538


# XGBoost Starts

In [None]:
!pip install category_encoders scikit-optimize



In [None]:
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score, f1_score
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

Step #1: Split the dataset into features (X) and target (y)

In [None]:
# Separate the label from the inputs: y is the 'Diagnosis' column and X is every
# other column of the processed frame.
df = data
target_column = 'Diagnosis'
y = df[target_column]
X = df.drop(columns=[target_column])

Step #2: Build a pipeline for training

In [None]:
# Two-step pipeline: a TargetEncoder step named 'encoder' followed by an
# XGBClassifier step named 'clf' (seeded with random_state=8).
encoder_step = ('encoder', TargetEncoder())
# The objective function can be customized through XGBClassifier's `objective` parameter.
classifier_step = ('clf', XGBClassifier(random_state=8))
estimators = [encoder_step, classifier_step]
pipe = Pipeline(steps=estimators)

Step #3: Set up hyperparameter tuning

In [None]:
# Hyperparameter ranges for the search. The 'clf__' prefix routes each entry to
# the pipeline step named 'clf'.
search_space = dict(
    clf__max_depth=Integer(2, 8),
    clf__learning_rate=Real(0.001, 1.0, prior='log-uniform'),
    clf__subsample=Real(0.5, 1.0),
    clf__colsample_bytree=Real(0.5, 1.0),
    clf__colsample_bylevel=Real(0.5, 1.0),
    clf__colsample_bynode=Real(0.5, 1.0),
    clf__reg_alpha=Real(0.0, 10.0),
    clf__reg_lambda=Real(0.0, 10.0),
    clf__gamma=Real(0.0, 10.0),
)

# Bayesian search over the pipeline: 10 iterations, each scored by ROC AUC on a
# shuffled, stratified 10-fold split. All seeds are fixed to 8.
tuning_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)
model = BayesSearchCV(
    pipe,
    search_space,
    cv=tuning_cv,
    n_iter=10,
    scoring='roc_auc',
    random_state=8,
)

Step #4: Train the XGBoost model

In [None]:
# Nested cross-validation in a single pass. For each of the 10 outer folds,
# `model` (the BayesSearchCV) is fitted on the training part and then predicts
# the held-out part.
# Previously cross_val_predict ran this whole nested CV once and the loop below
# ran it a second time with the same splits, doubling the number of fits.
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)

# Sorted class labels. Passing them to confusion_matrix keeps every fold's
# matrix the same shape even if a fold's predictions miss a class.
class_labels = np.sort(y.unique())

# Out-of-fold class probabilities, one row per sample in the original row order
# (the same layout cross_val_predict(..., method='predict_proba') returns).
y_pred_cv = np.zeros((len(X), len(class_labels)))

confusion_matrices = []
for train_index, test_index in outer_cv.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred_cv[test_index] = model.predict_proba(X_test_fold)
    y_pred_fold = model.predict(X_test_fold)

    confusion_matrices.append(
        confusion_matrix(y_test_fold, y_pred_fold, labels=class_labels)
    )

# Convert probabilities to binary predictions (index of the most probable class)
y_pred_binary = np.argmax(y_pred_cv, axis=1)

# Merge confusion matrices by summing the per-fold counts
merged_confusion_matrix = np.sum(confusion_matrices, axis=0)

# Print or visualize the merged confusion matrix
print("Merged Confusion Matrix:")
print(merged_confusion_matrix)

Merged Confusion Matrix:
[[2436 5032]
 [2490 5042]]


Step #5: Evaluate the model

In [None]:
print(merged_confusion_matrix)
# Confusion-matrix layout (scikit-learn): rows are true labels, columns are
# predicted labels. Label 1 is treated as the positive class below.
# NOTE: the Diagnosis column is encoded 'Stroke' -> 0, 'No Stroke' -> 1, so
# these positive-class metrics describe the "No Stroke" class.
true_negatives = merged_confusion_matrix[0, 0]
true_positives = merged_confusion_matrix[1, 1]
false_positives = merged_confusion_matrix[0, 1]
false_negatives = merged_confusion_matrix[1, 0]

# Calculate accuracy
accuracy = (true_positives + true_negatives) / np.sum(merged_confusion_matrix)

# Calculate precision
precision = true_positives / (true_positives + false_positives)

# Calculate recall
recall = true_positives / (true_positives + false_negatives)

# Calculate F1 score
f1 = 2 * (precision * recall) / (precision + recall)

# Calculate specificity
specificity = true_negatives / (true_negatives + false_positives)

# Calculate false positive rate
fpr = false_positives / (true_negatives + false_positives)

# ROC AUC from the out-of-fold probabilities in y_pred_cv, so each sample is
# scored by a model that never saw it during fitting.
# The earlier version called model.predict_proba on every fold, but `model` is
# only the estimator fitted on the last fold's training split. It had already
# trained on nine of the ten "test" folds, which leaks labels into the score.
y_true = y.to_numpy()
y_pred_prob = y_pred_cv[:, 1]  # column 1 = probability of label 1

# Calculate ROC AUC
roc_auc = roc_auc_score(y_true, y_pred_prob)

# Print or use the calculated metrics as needed
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Specificity:", specificity)
print("False Positive Rate:", fpr)
print("ROC AUC:", roc_auc)


[[2436 5032]
 [2490 5042]]
Accuracy: 0.49853333333333333
Precision: 0.5004963271788763
Recall: 0.6694105151354222
F1 Score: 0.5727592866068385
Specificity: 0.32619175147295126
False Positive Rate: 0.6738082485270488
ROC AUC: 0.5237049648690493
