In [None]:
#importation
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from xgboost import XGBClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Notebook outline: data processing -> machine learning -> test/outcome

# Display settings: show every column and at most 100 rows of a DataFrame.
# The full option names are spelled out because set_option() interprets its
# first argument as a regular expression; a partial pattern such as
# "display.max.columns" only works while it happens to match exactly one option.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [None]:
# Read the dataset CSV into `df`, then build a pandas-profiling ProfileReport
# of it; the report object is the last expression so the notebook displays it.
DATASET_CSV = "C:....\\dataset.csv"
df = pd.read_csv(DATASET_CSV)
profile_report = pp.ProfileReport(df)
profile_report

 ##### 1- unbalanced data
 ##### 2- delete id column
 ##### 3- unbalanced data
 ##### 4- delete or replace NaN values
 ##### 5- deal with multicollinearity

In [None]:
# Clean-up and integer encoding of the categorical columns.
# Every result is assigned back to `df`. Calling
# `df[col].replace(..., inplace=True)` would mutate a temporary Series
# (chained assignment), which pandas warns about and which does not update
# `df` once Copy-on-Write is active.

# Drop the row identifier; errors="ignore" keeps the cell re-runnable after
# the column is already gone.
df = df.drop(columns="id", errors="ignore")

# Discard the rows whose gender is "Other" so that gender becomes binary.
df = df.loc[df["gender"] != "Other"].copy()

# Explicit category -> integer code tables, one per column.
category_codes = {
    "gender": {"Male": 0, "Female": 1},
    "ever_married": {"No": 0, "Yes": 1},
    "work_type": {
        "children": 0,
        "Private": 1,
        "Never_worked": 2,
        "Self-employed": 3,
        "Govt_job": 4,
    },
    "Residence_type": {"Rural": 0, "Urban": 1},
}
for column, codes in category_codes.items():
    # astype() makes the column numeric on every pandas version and fails
    # loudly if a category outside the table is still present.
    # NOTE(review): assumes these columns contain no missing values - confirm.
    df[column] = df[column].replace(codes).astype("int64")

# smoking_status is encoded with LabelEncoder, which assigns one integer code
# per distinct value.
df["smoking_status"] = LabelEncoder().fit_transform(df["smoking_status"])

# Preview the first rows only instead of dumping the whole frame.
df.head()

In [None]:
# Check the bmi column for outliers with a box plot.
df["bmi"].plot(kind="box", figsize=(10, 8))
plt.show()

# Fill the missing bmi values with the column mean.
# The result is assigned back to the column: `df["bmi"].fillna(..., inplace=True)`
# acts on a temporary Series (chained assignment) and does not update `df`
# once pandas Copy-on-Write is active.
# NOTE(review): the author observed many outliers in bmi, so the mean may be a
# poor fill value; the median is less sensitive to outliers - confirm the choice.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

# Check that no null values remain.
df.info()

In [None]:
# Pairwise correlation between all columns of df, drawn as an annotated
# heatmap so strongly correlated features stand out.
corrmat = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corrmat, annot=True, cmap="RdYlGn", ax=corr_ax)

In [None]:
# Feature importance: rank the independent variables with an extra-trees model.
# The feature matrix and the target are built from `df` inside this cell, so it
# does not depend on variables created by later cells and runs on a fresh kernel.
feature_frame = df.drop(columns="stroke")
target = df["stroke"]

# A fixed seed keeps the importance ranking reproducible between runs.
model = ExtraTreesClassifier(random_state=0)
model.fit(feature_frame, target)

# Importances labelled by column name; plot the ten largest.
feat = pd.Series(model.feature_importances_, index=feature_frame.columns)
feat.nlargest(10).plot(kind="barh")
plt.show()

In [None]:
# Train/test split of the original data.
# train_test_split is already imported in the first cell of the notebook.
# Features are selected by name (every column except the target) rather than
# by position, so the split does not depend on the column count or order.
# NOTE(review): assumes `stroke` is the only non-feature column left in df.
X = df.drop(columns="stroke")
Y = df["stroke"]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=0
)

In [None]:
# Fix the class imbalance by oversampling the minority class with SMOTE.
# The sampling strategy can be changed as required.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Generate the resampled features and target from the full dataset.
oversampled_X, oversampled_Y = sm.fit_resample(df.drop('stroke', axis=1), df['stroke'])
DFoversampled = pd.concat([pd.DataFrame(oversampled_X), pd.DataFrame(oversampled_Y)], axis=1)
# NOTE(review): DF is resampled from the complete dataset. Scores measured on
# a train/test split of DF are optimistic, because synthetic rows derived from
# the test rows end up in the training part.
DF = DFoversampled

# Check the class balance after resampling. Labels and counts are listed in
# the same order (stroke == 0 first, stroke == 1 second).
class_labels = ["Non stroke", "Stroke"]
class_counts = [(DF["stroke"] == 0).sum(), (DF["stroke"] == 1).sum()]
plt.bar(class_labels, class_counts)
plt.ylabel("Number of rows")
plt.show()

In [None]:
# Build two datasets (unbalanced and balanced) so the models can be compared.
# NOTE(review): features are "every column except stroke"; this assumes
# `stroke` is the only non-feature column in df and DF.

# Unbalanced data: plain split of the original frame (also used for the
# one-class SVM).
XU = df.drop(columns="stroke")  # U = X_Unbalanced
YU = df["stroke"]
XU_train, XU_test, YU_train, YU_test = train_test_split(XU, YU, test_size=0.33, random_state=0)

# Balanced data: XB / YB hold the fully oversampled frame.
XB = DF.drop(columns="stroke")  # B = X_Balanced
YB = DF["stroke"]

# For evaluation, SMOTE is applied to the training part only and the test part
# is the untouched held-out data. Splitting the already oversampled DF would
# put synthetic points interpolated from test rows into the training set (and
# synthetic rows into the test set), which inflates every score.
XB_train, YB_train = SMOTE(sampling_strategy='minority', random_state=42).fit_resample(XU_train, YU_train)
XB_test, YB_test = XU_test, YU_test

In [None]:
# Candidate classifiers. Each entry pairs an unfitted estimator ("model") with
# the name used when reporting its result ("label").
# Estimators with internal randomness get a fixed random_state so that their
# scores are reproducible between runs.
models = [
    {"model": LogisticRegression(), "label": "LogisticRegression"},
    {"model": SVC(), "label": "SVC"},
    {"model": KNeighborsClassifier(), "label": "KNeighborsClassifier"},
    {"model": DecisionTreeClassifier(random_state=0), "label": "DecisionTreeClassifier"},
    {"model": RandomForestClassifier(random_state=0), "label": "RandomForestClassifier"},
    {"model": XGBClassifier(random_state=0), "label": "XGBClassifier"},
]

In [None]:
def plot_conf_matrix(model,Y_test,Y_pred):
    """Draw the confusion matrix of a set of predictions as a heatmap.

    Parameters
    ----------
    model : object
        The estimator that produced ``Y_pred``. Only its ``str()`` form is
        used, as part of the figure title.
    Y_test : array-like
        True labels.
    Y_pred : array-like
        Predicted labels, aligned with ``Y_test``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    cm = confusion_matrix(Y_test, Y_pred)
    fig, ax = plt.subplots(figsize=(10, 3))
    # A separator keeps the model name readable in the title,
    # e.g. "Confusion Matrix: SVC()".
    ax.set_title(f"Confusion Matrix: {model}")
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_ylabel("Actual Values")
    ax.set_xlabel("Predicted Values")
    plt.show()
def get_score(model,X_train,X_test,Y_train,Y_test):
    """Fit ``model``, report its test-set metrics and return its accuracy.

    The model is fitted on the training data, used to predict the test data,
    and then its confusion matrix is plotted and the classification report and
    F1 score are printed.

    Parameters
    ----------
    model : estimator
        Unfitted object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, Y_train
        Training features and labels.
    X_test, Y_test
        Test features and labels.

    Returns
    -------
    float
        Accuracy of the predictions on the test set.

    Notes
    -----
    ``f1_score`` is called with its default settings, which expect a binary
    target.
    """
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    plot_conf_matrix(model, Y_test, Y_pred)
    print(classification_report(Y_test, Y_pred))
    print("F1_SCORE :", f1_score(Y_test, Y_pred))
    # Accuracy is computed from the predictions already made. For classifiers
    # this is the value model.score() returns, without predicting the test set
    # a second time.
    return accuracy_score(Y_test, Y_pred)

In [None]:
# Stratified k-fold (suited to unbalanced data) on the unbalanced data, for
# all models. One accuracy value per fold is appended to each list.
Score_logistic = []
Score_svm = []
Score_KN = []
Score_DT = []
Score_RF = []
Score_xgb = []

# Each score list is paired with the estimator class it belongs to; a fresh,
# unfitted estimator is created for every fold.
cv_runs = [
    (Score_logistic, LogisticRegression),
    (Score_svm, SVC),
    (Score_KN, KNeighborsClassifier),
    (Score_DT, DecisionTreeClassifier),
    (Score_RF, RandomForestClassifier),
    (Score_xgb, XGBClassifier),
]

# StratifiedKFold is already imported in the second cell of the notebook.
folds = StratifiedKFold(n_splits=10)
for train_index, test_index in folds.split(XU, YU):
    X_train, X_test = XU.iloc[train_index], XU.iloc[test_index]
    Y_train, Y_test = YU.iloc[train_index], YU.iloc[test_index]
    for fold_scores, estimator_class in cv_runs:
        fold_scores.append(get_score(estimator_class(), X_train, X_test, Y_train, Y_test))

# Mean accuracy per model over the folds, shown as the cell output.
pd.Series(
    {estimator_class.__name__: sum(fold_scores) / len(fold_scores)
     for fold_scores, estimator_class in cv_runs},
    name="mean_accuracy",
)

In [None]:
# Balanced data: evaluate every entry of `models` on the balanced split and
# collect one {"model", "Acc :"} record per estimator, then print the records.
acc = [
    {
        "model": entry["label"],
        "Acc :": get_score(entry["model"], XB_train, XB_test, YB_train, YB_test),
    }
    for entry in models
]
for record in acc:
    print(record)

In [None]:
# One-class SVM on the unbalanced data.
# OneClassSVM is an outlier detector: it is fitted without labels, predicts
# +1 (inlier) or -1 (outlier) and has no score() method, so it cannot be passed
# through get_score(). Its predictions are converted to the 0/1 coding of the
# target before the metrics are computed.
# NOTE(review): outliers are mapped to class 1 (stroke) and inliers to class 0;
# this is the assumed intent - confirm.
one_class_svm = OneClassSVM(kernel='rbf', gamma=0.001, nu=0.02)
one_class_svm.fit(XU_train)
YU_pred = (one_class_svm.predict(XU_test) == -1).astype(int)

plot_conf_matrix(one_class_svm, YU_test, YU_pred)
print(classification_report(YU_test, YU_pred))
print("F1_SCORE :", f1_score(YU_test, YU_pred))
print(accuracy_score(YU_test, YU_pred))

In [None]:
# Author's conclusion: balanced data is much better and the best model is XGBoost.
# Train the final model on the whole balanced dataset and save it.
import pickle

model = XGBClassifier().fit(XB, YB)
# The context manager closes the file handle even if pickling fails.
with open(r"C:\wamp64\www\....pkl", "wb") as model_file:
    pickle.dump(model, model_file)
