In [78]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error,mean_squared_error,precision_score,confusion_matrix,accuracy_score
from sklearn.feature_selection import mutual_info_classif

In [79]:
df=pd.read_csv("healthcare-dataset-stroke-data_cleaned.csv",index_col=False)

df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [80]:
X=df.drop(columns="stroke")
y=df['stroke']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)



In [81]:
numericalCols=[]
categoricalCols=[]

for col in X.columns:
    if X[col].dtype in ["int64","float64"]:
        numericalCols.append(col)
    elif X[col].dtype=="object":
        categoricalCols.append(col)

  
print(numericalCols)
print(categoricalCols)

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [82]:
numericalPipeline=Pipeline(steps=[
    ('minMaxScaler',MinMaxScaler())
])

categoricalPipeline=Pipeline(steps=[
    ("oneHotEncoder",OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

preprocessor=ColumnTransformer(
    transformers=[
        ('num',numericalPipeline,numericalCols),
        ('cat',categoricalPipeline,categoricalCols)
    ]
)




In [83]:

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=6,          
        min_samples_split=2,      
        min_samples_leaf=10,       
        max_features=None,        
        criterion='entropy',        
        splitter='best',          
        random_state=42,          
        class_weight="balanced"         
    ),
    "SVM":SVC(
         kernel='rbf',
        C=1.0,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=42
    )
    
}

for name,model in models.items():
        pipeline=Pipeline(steps=[
                ('preprocessor',preprocessor),
                ('classificationModel',model)
        
        ])

        

        pipeline.fit(X_train,y_train)

        

        y_pred=pipeline.predict(X_test)
        print(name)
        print("Accuracy: ",accuracy_score(y_test,y_pred))
        print("Mean Absolute Error: ",mean_absolute_error(y_test,y_pred))
        print("Mean Squared Error: ",mean_squared_error(y_test,y_pred))
        print("Confusion Matirx: \n",confusion_matrix(y_test,y_pred))
        print("-------------------------------------------------")
    



LogisticRegression
Accuracy:  0.9460285132382892
Mean Absolute Error:  0.0539714867617108
Mean Squared Error:  0.0539714867617108
Confusion Matirx: 
 [[929   0]
 [ 53   0]]
-------------------------------------------------
DecisionTree
Accuracy:  0.7474541751527495
Mean Absolute Error:  0.2525458248472505
Mean Squared Error:  0.2525458248472505
Confusion Matirx: 
 [[699 230]
 [ 18  35]]
-------------------------------------------------
SVM
Accuracy:  0.7464358452138493
Mean Absolute Error:  0.2535641547861507
Mean Squared Error:  0.2535641547861507
Confusion Matirx: 
 [[697 232]
 [ 17  36]]
-------------------------------------------------


### We'll be using SVM for our prediction 

In [84]:


finalPipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ("model",SVC(kernel='rbf',C=1.0,gamma='scale',class_weight='balanced',probability=True,random_state=42))
])

finalPipeline.fit(X_train,y_train)


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [85]:
import joblib
joblib.dump(finalPipeline,"model.joblib")

['model.joblib']