In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as mis
import os 
import seaborn as sns
import plotly.express as px


import warnings
warnings.filterwarnings("ignore")

# metrics
from sklearn.metrics import confusion_matrix , mean_squared_error , classification_report

# preprocessing
from sklearn.model_selection import train_test_split , cross_val_score , cross_val_predict , GridSearchCV , RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , StandardScaler
from sklearn.impute import SimpleImputer


## pipeline
from sklearn.pipeline import Pipeline ,FeatureUnion
from sklearn_features.transformers import DataFrameSelector


#ensemble
from sklearn.ensemble import RandomForestClassifier


#xgboost
import xgboost as xgp

# ANN Neural Network
from keras import layers , Sequential 
from tensorflow.keras.layers import Dense


## Load the CSV file

In [None]:
# Location of the stroke dataset. The STROKE_DATA_PATH environment variable
# overrides the default so the notebook is not tied to one machine's layout.
path = os.environ.get("STROKE_DATA_PATH", r"D:\ML_data\full_data.csv")

# Fail fast: every later cell needs `df`, so carrying on without it would only
# surface as a confusing NameError further down the notebook.
if not os.path.exists(path):
    raise FileNotFoundError(f"Sorry , your file path not found: {path}")

df = pd.read_csv(path)
print(df.head())

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

# Gender: This feature denotes the gender of the individual, categorized as either male or female.

# Age: The age of the individual in years.

# Hypertension: Indicates whether the individual has hypertension (high blood pressure) or not. It's binary, where 1 represents the presence of hypertension and 0 represents the absence.

# Heart Disease: Denotes whether the individual has heart disease or not. Similarly binary, with 1 indicating the presence and 0 indicating the absence.

# Ever Married: This feature indicates whether the individual has ever been married or not, categorized as "Yes" or "No".

# Work Type: Describes the type of work the individual is engaged in, such as Private, Self-employed, etc.

# Residence Type: Denotes whether the individual resides in an urban or rural area.

# Average Glucose Level: The average glucose level measured in the blood.

# BMI (Body Mass Index): Body mass index is a value derived from the mass and height of an individual. It's a measure of body fat based on height and weight.

# Smoking Status: Indicates the smoking status of the individual, categorized as "formerly smoked", "never smoked", or "smokes".

# Stroke: This is the target variable, indicating whether the individual has had a stroke (1) or not (0).


# Summary:

# The dataset contains information on 4,981 individuals (4,733 without a stroke and 248 with a stroke).
# There are both categorical and numerical features.
# Categorical features include Gender, Ever Married, Work Type, Residence Type, and Smoking Status.
# Numerical features include Age, Average Glucose Level, and BMI.
# Target variable (dependent variable) is Stroke, which we aim to predict based on other features.
# Understanding these features can help in various analyses such as predicting the likelihood of stroke based on demographic and health-related factors. Additionally, it can be used to understand correlations between different factors and occurrences of strokes.






In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is a row.
df.describe().T

In [None]:
# Summary restricted to object-dtype columns, transposed so each feature is a row.
df.describe(include=object).T

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Grand total of missing cells: per-column null counts summed once more.
print('Missing values: ', df.isnull().sum().sum())

In [None]:
# missingno matrix view of the frame, used here to look for missing values.
mis.matrix(df)

In [None]:
# missingno bar view of the frame, used here to look for missing values.
mis.bar(df)

# EDA (Exploratory data analysis)

In [None]:
# Class balance of the target column.
df["stroke"].value_counts()

In [None]:
# Derive the class counts from the data instead of hard-coding them, so the
# chart cannot drift out of sync with the loaded file.
stroke_counts = df["stroke"].value_counts().sort_index()
counts = stroke_counts.tolist()
labels = [str(value) for value in stroke_counts.index]

plt.figure(figsize=(10,8))
plt.pie(counts, labels=labels, autopct="%1.1f%%")
plt.axis("equal")  # keep the pie circular
plt.title("Distribution of stroke status")
plt.show()

In [None]:
# Derive the class counts from the data instead of hard-coding them, and give
# each bar its class label on the x axis.
stroke_counts = df["stroke"].value_counts().sort_index()
counts = stroke_counts.tolist()
labels = [str(value) for value in stroke_counts.index]

plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=counts)
plt.title("Barplot of stroke status")
plt.xlabel("Stroke Status")
plt.ylabel("Counts")
plt.show()

In [None]:
# Frequency of each gender category.
df["gender"].value_counts()

In [None]:
# Derive the category counts and labels from the data instead of hard-coding
# them; value_counts() orders categories from most to least frequent.
gender_counts = df["gender"].value_counts()
counts = gender_counts.tolist()
labels = gender_counts.index.tolist()

plt.figure(figsize=(10,8))
plt.pie(counts , labels=labels , autopct="%1.1f%%")
plt.axis("equal")  # keep the pie circular
plt.title("Distribution of gender status")
plt.show()

In [None]:
# Derive the category counts and labels from the data instead of hard-coding
# them; value_counts() orders categories from most to least frequent.
gender_counts = df["gender"].value_counts()
counts = gender_counts.tolist()
labels = gender_counts.index.tolist()

plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y=counts)
plt.title("Barplot of gender status")
plt.xlabel("Gender")
plt.ylabel("Counts")
plt.show()

In [None]:
plt.figure(figsize=(10, 6))

# A single scatter is enough: the previous version drew the identical
# age/bmi points three times on top of each other, which added nothing but
# three misleading legend entries.
sns.scatterplot(x=df["age"], y=df["bmi"])

plt.title("Scatterplot of Age vs BMI")
plt.xlabel("Age")
plt.ylabel("BMI")
plt.show()

In [None]:
# Two stacked scatter panels, both coloured by the stroke label.
panel_fig, (glucose_ax, age_ax) = plt.subplots(2, 1)

# Top panel: BMI against average glucose level.
sns.scatterplot(x=df['avg_glucose_level'], y=df['bmi'], hue=df['stroke'], ax=glucose_ax)
glucose_ax.set_title('Stroke Sample Distribution Based On Bmi And Glucose Level')

# Bottom panel: BMI against age.
sns.scatterplot(x=df['age'], y=df['bmi'], hue=df['stroke'], ax=age_ax)
age_ax.set_title('Stroke Sample Distribution Based On Bmi And Age')

panel_fig.tight_layout()
plt.show()

In [None]:
# 4x2 grid of count plots, one panel per categorical / binary column.
fig, axes = plt.subplots(4, 2, figsize=(16, 16))
sns.set_style('whitegrid')
fig.suptitle("Count plot for various categorical features")

# Columns listed in the order the panels are filled (row by row).
count_columns = [
    'gender', 'hypertension',
    'heart_disease', 'ever_married',
    'work_type', 'Residence_type',
    'smoking_status', 'stroke',
]
for panel, column in zip(axes.flat, count_columns):
    sns.countplot(ax=panel, data=df, x=column)


plt.show()

In [None]:
# Plotly Express box plot of avg_glucose_level (placed on the x axis),
# rendered at 800x300.
fig = px.box(data_frame = df,
            x = "avg_glucose_level",
            width = 800,
            height = 300)

# Apply the "plotly_dark" template before showing the figure.
fig.update_layout({"template":"plotly_dark"})
fig.show()

In [None]:
plt.figure(figsize = (10,6))
# Correlation is only defined for numeric columns. Restricting the frame first
# avoids the error recent pandas versions raise when object (string) columns
# such as gender or work_type are passed to corr().
cr = df.select_dtypes(include="number").corr()
sns.heatmap(cr,cmap="viridis", annot = True)
plt.show()

In [None]:
# 2x2 grid of histograms; each spec is (column, number of bins, colour) and the
# panels are filled row by row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
histogram_specs = [
    ("age", 70, "b"),
    ("bmi", 100, "r"),
    ("heart_disease", 6, "g"),
    ("avg_glucose_level", 100, "orange"),
]
for panel, (column, n_bins, colour) in zip(axes.ravel(), histogram_specs):
    df.plot(kind="hist", y=column, bins=n_bins, color=colour, ax=panel)
plt.show()

In [None]:
# Second 2x2 grid of histograms: hypertension, stroke, bmi and avg_glucose_level.
# NOTE(review): bmi and avg_glucose_level were already plotted in the previous
# histogram cell (bmi with a different bin count) — confirm the repeat is intended.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
df.plot(kind="hist", y="hypertension", bins=70, color="b", ax=axes[0][0])
df.plot(kind="hist", y="stroke", bins=100, color="r", ax=axes[0][1])
df.plot(kind="hist", y="bmi", bins=6, color="g", ax=axes[1][0])
df.plot(kind="hist", y="avg_glucose_level", bins=100, color="orange", ax=axes[1][1])
plt.show()

In [None]:
# Display the numeric columns of the frame.
# NOTE(review): `_get_numeric_data` is underscore-prefixed (private pandas API);
# `df.select_dtypes(include="number")` is the public alternative — confirm and switch.
df._get_numeric_data()

In [None]:
# Side-by-side scatter plots: age vs. avg_glucose_level (left) and
# avg_glucose_level vs. bmi (right), both semi-transparent.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
df.plot(kind='scatter', x='age', y='avg_glucose_level', alpha=0.5, color='green', ax=axes[0], title="Age vs. avg_glucose_level")
df.plot(kind='scatter', x='avg_glucose_level', y='bmi', alpha=0.5, color='red', ax=axes[1], title="avg_glucose_level vs. bmi")
plt.show()

# Data Summary ( Check for missing values )

In [None]:
# Compact textual summary of the frame: size, column names, total number of
# missing cells and the number of distinct values per column.
n_rows, n_cols = df.shape
total_missing = df.isnull().sum().values.sum()

print("Rows     : ", n_rows)
print("Columns  : ", n_cols)
print("\nFeatures : \n", df.columns.tolist())
print("\nMissing values :  ", total_missing)
print("\nUnique values :  \n", df.nunique())

In [None]:
# Side-by-side scatter plots: age vs. avg_glucose_level (left) and
# bmi vs. avg_glucose_level (right).
# NOTE(review): the left panel repeats the earlier "Age vs. avg_glucose_level"
# scatter and the right panel is the earlier glucose/bmi scatter with the axes
# swapped — confirm this cell is still needed.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
df.plot(kind='scatter', x='age', y='avg_glucose_level', alpha=0.5, color='green', ax=axes[0], title="Age vs. avg_glucose_level")
df.plot(kind='scatter', x='bmi', y='avg_glucose_level', alpha=0.5, color='red', ax=axes[1], title="bmi vs. avg_glucose_level")
plt.show()

In [None]:
sns.set(style="ticks");
# Two colours, one per stroke class.
pal = ["#FA5858", "#58D3F7"]

sns.pairplot(df, hue="stroke", palette=pal);
# pairplot builds a grid of subplots, so plt.title() would only label the last
# panel. suptitle() titles the whole figure; y > 1 keeps it clear of the top row.
plt.suptitle("stroke", y=1.02);


In [None]:
# Box plot of the bmi column; the trailing semicolon hides the returned Axes repr.
plt.figure(figsize=(10,7))
sns.boxplot(data=df,x=df["bmi"]);

In [None]:
# Count of rows per work_type category, drawn with the 'cool' palette.
plt.figure(figsize=(10,5))
sns.countplot(data=df,x='work_type',palette='cool');

In [None]:
# Count of rows per Residence_type category, drawn with the 'cool' palette.
plt.figure(figsize=(10,5))
sns.countplot(data=df,x='Residence_type',palette='cool');

In [None]:
# Count of rows per smoking_status category, drawn with the 'cool' palette.
plt.figure(figsize=(10,5))
sns.countplot(data=df,x='smoking_status',palette='cool');

In [None]:
# Histogram grid produced by DataFrame.hist, 25 bins per panel.
df.hist(bins=25 , figsize=(15,10))
plt.show()

In [None]:
# Violin plot of avg_glucose_level grouped by age on the x axis and split by
# stroke (0 drawn in blue, 1 in orange); cut=0 stops each violin at the
# observed data range.
# NOTE(review): age looks like a continuous variable, so this may draw one
# violin per distinct age value — consider binning age first. Also confirm
# that `scale='count'` is still accepted by the installed seaborn version.
plt.figure(figsize=(18, 9))
ax = sns.violinplot(x='age', y='avg_glucose_level', hue='stroke', data=df, scale='count', palette={0: "b", 1: "orange"}, cut=0)
ax.set_xlabel('Age', fontsize=14)
ax.set_ylabel('Glucose level', fontsize=14)
ax.set_title('Age, Glucose vs Stroke', fontsize=20)
plt.show()

In [None]:
# Pairwise correlation matrix of the numeric columns.
# NOTE(review): `_get_numeric_data` is underscore-prefixed (private pandas API).
df._get_numeric_data().corr()

In [None]:
# Annotated heatmap of the numeric-column correlation matrix.
plt.figure(figsize=(10,6))
sns.heatmap(df._get_numeric_data().corr() , annot=True , cmap="Blues")
plt.show()

In [None]:
plt.figure(figsize=(10,6))
# A box plot needs a numeric variable to summarise; work_type on its own is a
# string category and has no quartiles. Show the spread of bmi within each
# work type instead.
# NOTE(review): bmi was chosen as the numeric axis — swap in another numeric
# column (e.g. age or avg_glucose_level) if that was the intended comparison.
sns.boxplot(data=df, x="work_type", y="bmi")

In [None]:
# Pair plot of the frame coloured by the stroke label.
# NOTE(review): an equivalent pair plot (with a custom palette) is drawn in an
# earlier cell — confirm both are needed.
sns.pairplot(df , hue="stroke")
plt.show()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# One density-normalised histogram with a KDE overlay per numeric column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement and matches the 'Density' y-axis label used below.
for i in df.select_dtypes(include="number").columns:
    sns.histplot(df[i], kde=True, stat="density")
    plt.title(f'Distribution of {i}')
    plt.xlabel(i)
    plt.ylabel('Density')
    plt.show()

In [None]:
# Passes the whole DataFrame positionally to seaborn's barplot.
# NOTE(review): what gets drawn depends on how the installed seaborn version
# interprets a wide-form frame with mixed dtypes — confirm this is the intended plot.
plt.figure(figsize=(10,8))
sns.barplot(df)
plt.show()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=["stroke"])
# Target vector: the stroke label.
y = df["stroke"]

In [None]:
# 80/20 split with a fixed seed. The target is heavily imbalanced, so stratify
# on y to keep the stroke / no-stroke ratio the same in both partitions;
# without it the small positive class can be under-represented in the test set.
X_train ,X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , shuffle=True ,
                                                      random_state=123 , stratify=y)

In [None]:
# Report the shape of each partition produced by the train/test split.
split_parts = (
    ("X Train", X_train),
    ("X test", X_test),
    ("y train", y_train),
    ("y test", y_test),
)
for part_name, part in split_parts:
    print(f"{part_name} shape is == {part.shape}")

In [None]:
# Split the feature columns by dtype. The dtypes are read from X_train itself
# (the frame the pipelines are fitted on) rather than from df, and
# select_dtypes("number") covers every integer/float width instead of a
# hand-written list of four dtype names.
col_nums = X_train.select_dtypes(include="number").columns.tolist()

# Everything that is not numeric is treated as categorical (original column order kept).
col_catg = [col for col in X_train.columns if col not in col_nums]

# Report each non-empty group.
if col_nums:
    print(f"colunms nums is \n {col_nums}")

if col_catg:
    print(f"colunms catg is \n {col_catg}")



In [None]:
# Numeric pipeline: select the numeric columns, fill gaps with the column
# median, then standardise.
# Each step must be its own (name, transformer) tuple; the previous version had
# misplaced parentheses that nested all three steps into one malformed tuple.
pip_num = Pipeline(steps=[
    ("select" , DataFrameSelector(col_nums)),
    ("simple" , SimpleImputer(strategy="median")),
    ("scaler" , StandardScaler())
])

In [None]:
# Categorical pipeline: select the categorical columns, replace gaps with the
# literal "missing", then one-hot encode.
# Two fixes compared with the previous version:
#   * each step is now its own (name, transformer) tuple (the parentheses were
#     misplaced, nesting the steps inside one another);
#   * LabelEncoder is meant for target labels and cannot run as a feature step
#     in a Pipeline, so OneHotEncoder is used, as the step name implies.
#     handle_unknown="ignore" keeps transform() from failing on categories that
#     were not present when the encoder was fitted.
pip_catg = Pipeline(steps=[
    ("Selector" , DataFrameSelector(col_catg)),
    ("imputer" , SimpleImputer(strategy="constant" , fill_value="missing")),
    ("one_hot" , OneHotEncoder(handle_unknown="ignore"))
])

In [None]:
def Preprocessing(col_nums, col_catg, X_train, X_test):
    """Fit the preprocessing pipelines on the training data and transform both splits.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the constant "missing" and one-hot encoded. The two
    branches are combined with a FeatureUnion, fitted on ``X_train`` only and
    then applied to ``X_test`` so no test information leaks into the fit.

    The previous version built the combined pipeline but fitted and applied
    only the numeric branch, silently discarding every categorical feature,
    and used LabelEncoder (a target encoder) as a feature step.

    Parameters
    ----------
    col_nums : list
        Names of the numeric feature columns.
    col_catg : list
        Names of the categorical feature columns.
    X_train, X_test :
        Training and test feature frames. Presumably pandas DataFrames, as
        DataFrameSelector is given column names — confirm against that class.

    Returns
    -------
    tuple
        ``(X_train_final, X_test_final)`` as dense arrays, numeric features
        first, followed by the one-hot columns.

    Raises
    ------
    ValueError
        If both ``col_nums`` and ``col_catg`` are empty.
    """
    pip_catg = Pipeline(steps=[
        ("Selector" , DataFrameSelector(col_catg)),
        ("imputer" , SimpleImputer(strategy="constant" , fill_value="missing")),
        # Ignore categories unseen during fit instead of failing on the test split.
        ("one_hot" , OneHotEncoder(handle_unknown="ignore"))
    ])

    pip_num = Pipeline(steps=[
        ("select" , DataFrameSelector(col_nums)),
        ("simple" , SimpleImputer(strategy="median")),
        ("scaler" , StandardScaler())
    ])

    # Only include a branch when it has columns to work on.
    transformer_list = []
    if col_nums:
        transformer_list.append(("num_pipeline", pip_num))
    if col_catg:
        transformer_list.append(("cate_pipeline", pip_catg))
    if not transformer_list:
        raise ValueError("Preprocessing needs at least one numeric or categorical column.")

    total_pipeline = FeatureUnion(transformer_list=transformer_list)

    # Fit on the training split only, then reuse the fitted state for the test split.
    X_train_final = total_pipeline.fit_transform(X_train)
    X_test_final = total_pipeline.transform(X_test)

    # The one-hot branch can make the combined output a sparse matrix; convert
    # to dense so downstream code can index rows and read shapes as before.
    if hasattr(X_train_final, "toarray"):
        X_train_final = X_train_final.toarray()
    if hasattr(X_test_final, "toarray"):
        X_test_final = X_test_final.toarray()

    return X_train_final, X_test_final


# Build the model-ready train and test matrices.
X_train_final, X_test_final = Preprocessing(col_nums, col_catg, X_train, X_test)


# Print both transformed matrices, separated by a row of asterisks.
print(f"X_train_final is = {X_train_final}")
print("*"*100)
print(f"X_test_final is = {X_test_final}")

In [None]:
# Histogram grid of the frame before the log transform, 25 bins per panel.
df.hist(bins=25 , figsize=(15,10))
plt.show()

# Reduce skewness with a log transform

In [None]:
# Labels of the numeric columns, displayed as the cell output.
colms = (df._get_numeric_data()).columns
colms

In [None]:
colms = [cols for cols in (df._get_numeric_data()).columns]
for col in colms:
    df[col] = np.log(df[col] + 1e-10).astype(float)

In [None]:
df.hist(bins=20 , figsize=(10,8))
plt.show()

# RandomForestClassifier

In [None]:
# Base estimator for the grid search. A fixed seed (the same one used for the
# train/test split) makes the forest, and therefore the selected
# hyperparameters and scores, reproducible across runs.
Random = RandomForestClassifier(random_state=123)

In [None]:
# Define the hyperparameter grid
hyper_params = {
    "n_estimators": list(range(100, 500, 100)),   # 100, 200, 300, 400
    "max_depth": list(range(3, 15, 3)),           # 3, 6, 9, 12
    # 'log_loss' is documented as the same split criterion as 'entropy', so
    # listing it only repeats every fit; it is left out.
    "criterion": ['entropy', 'gini'],
}

# Create GridSearchCV object
# Candidates are ranked with balanced accuracy: this is a classifier on an
# imbalanced target, so a regression scorer such as neg_mean_squared_error is
# not an appropriate selection metric. Balanced accuracy is also the metric the
# cross-validation cell below reports for the chosen model.
random_hyper = GridSearchCV(
    estimator=Random,
    param_grid=hyper_params,
    scoring="balanced_accuracy",
    cv=5,
    n_jobs=-1
)

# Fit the model
random_model = random_hyper.fit(X_train_final, y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
print("Best parameters : \n", random_model.best_params_)

In [None]:
# Estimator held by the search object as its best candidate; displayed as the cell output.
Rand_sea = random_model.best_estimator_
Rand_sea

In [None]:
# 5-fold cross-validated balanced accuracy of the tuned forest on the training data.
corss_val = cross_val_score(
    Rand_sea,
    X_train_final,
    y_train,
    scoring="balanced_accuracy",
    cv=5,
    n_jobs=-1,
)

# Per-fold scores (absolute values), followed by their mean.
print("Score is ==>>\n ", np.abs(corss_val))

print("cross val score mean is ==>> ", np.mean(corss_val))

In [None]:
# Out-of-fold class predictions for every training row (5-fold).
cross_val_pred = cross_val_predict(estimator=Rand_sea ,X = X_train_final , y=y_train ,
                                  method="predict" , cv=5 , n_jobs=-1)


# The previous version stored the square root under the name `mse` and printed
# it as "mean squared error". Keep the two quantities separate and label each
# one correctly.
mse = mean_squared_error(y_train, cross_val_pred)
rmse = np.sqrt(mse)

# Print both errors
print(f"The mean squared error is {mse:.4f}")
print(f"The root mean squared error is {rmse:.4f}")

In [None]:
# Number of columns in the preprocessed training matrix.
X_train_final.shape[1]

In [None]:
# Column labels of the frame.
df.columns


In [None]:
# Predict the test split through the fitted grid-search object and preview
# the first five predictions.
y_pred = random_model.predict(X_test_final)
y_pred[:5]

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion = confusion_matrix(y_test , y_pred)
confusion


In [None]:
# Annotated heatmap of the confusion matrix; fmt="g" prints the cell counts
# without scientific notation.
plt.figure(figsize=(14,8))
sns.heatmap(confusion , fmt="g" , annot=True , cbar=True , vmin=0 , cmap="Blues")
# Heatmap cells are centred at 0.5 and 1.5, hence the +0.5 tick offset.
plt.xticks(ticks=np.arange(2) + 0.5 , labels=["False" , "True"])
plt.yticks(ticks=np.arange(2) + 0.5 , labels=["False" , "True"])
plt.xlabel("Predicted" , fontsize=14 , color="b")
plt.ylabel("Actual" , fontsize=14 , color="b")  # label typo fixed (was "Actula")
plt.title("Confusion Matrix" , fontsize=20 , color="m")
plt.show()

In [None]:
# Text classification report for the test-split predictions.
print(f" some details \n {classification_report(y_test , y_pred)}")


# xgboost

In [None]:
# XGBoost classifier with default settings, fitted on the preprocessed training data.
xg_clas = xgp.XGBClassifier()
xg_clas.fit(X_train_final , y_train)

In [None]:
# 5-fold cross-validated score of the XGBoost classifier on the training data.
# Balanced accuracy is used instead of plain accuracy: with a heavily
# imbalanced target, always predicting the majority class already scores very
# high on accuracy. This also matches the metric reported for the random forest.
cross_val_xgb = cross_val_score(estimator=xg_clas , X=X_train_final , y=y_train ,
                            cv=5 , scoring="balanced_accuracy" , n_jobs=-1)

 
print("Score is ==>>\n ",abs(cross_val_xgb))

print("cross val score mean is ==>> ",cross_val_xgb.mean())

In [None]:
# Make cross-validation predictions
cross_val_pred = cross_val_predict(estimator=xg_clas, X=X_train_final, y=y_train,
                                   cv=5, method="predict", n_jobs=-1)



# The previous version stored the square root under the name `mse` and printed
# it as "mean squared error". Keep the two quantities separate and label each
# one correctly.
mse = mean_squared_error(y_train, cross_val_pred)
rmse = np.sqrt(mse)
# Print both errors
print(f"The mean squared error is {mse:.4f}")
print(f"The root mean squared error is {rmse:.4f}")

# Using Deep Learning

In [None]:
# Shape of a single preprocessed training row; used as the network's input shape below.
X_train_final[0].shape

In [None]:
# setting up the layers of Neural Network

# One softmax output unit per target class. The previous version hard-coded 7
# units although the stroke target has only two classes (0 and 1), which left
# five outputs that can never be the correct class.
num_classes = len(np.unique(y_train))

model = Sequential([
                          # Input shape is taken from one preprocessed training row.
                          layers.Flatten(input_shape=(X_train_final[0].shape)),
                          layers.Dense(64, activation='relu'),
                          layers.Dense(num_classes, activation='softmax')
])

model.summary()

In [None]:
# compiling the Neural Network

# Adam optimiser, sparse categorical cross-entropy loss, and accuracy tracked
# as the reported metric.
model.compile(optimizer="adam" ,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# training the Neural Network
# Trains for 50 epochs on the preprocessed training data; `history` keeps the
# object returned by fit() and is read by the plotting cell below.
history = model.fit(X_train_final , y_train , epochs=50)
history

In [None]:
# Evaluating the model

# Loss and accuracy on the held-out test split; the loss is rounded to 4 decimals when printed.
loss, accuracy = model.evaluate(X_test_final, y_test)
print("Accuracy is == " , accuracy)
print("loss is == " , np.round(loss , 4))

In [None]:
# Plot style
sns.set(style='whitegrid')

# Training curves side by side: accuracy on the left, loss on the right.
curves_fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 8))

# Accuracy per epoch
accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy', linestyle='-', marker='X', markersize=5)
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Training Accuracy')
accuracy_ax.legend()

# Loss per epoch
loss_ax.plot(history.history['loss'], label='Training Loss', linestyle='-', marker='o', markersize=5)
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.legend()

curves_fig.tight_layout()
plt.show()

In [None]:
# The Prediction
# Network output for the test split; later cells index it as
# prediction[:, class_index], i.e. one column per output unit.
prediction = model.predict(X_test_final)
prediction

In [None]:
# `prediction` holds one probability column per output unit, so it cannot be
# compared with the 0/1 labels directly. Convert each row to the index of its
# most probable class first.
predicted_labels = np.argmax(prediction, axis=1)

# discrete=True gives each class value its own bar; a KDE curve is not
# meaningful for class labels, so it is not drawn.
sns.histplot(np.asarray(y_test), color='blue', label='Actual', discrete=True, alpha=0.5)
sns.histplot(predicted_labels, color='red', label='Predicted', discrete=True, alpha=0.5)
plt.xlabel("Values")
plt.ylabel("Frequency")
plt.title("Actual vs. Predicted Distribution")
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(10,7))
# Column 1 is the predicted probability of the positive class (stroke = 1).
# The previous version used column 0, the probability of *no* stroke, so the
# "residuals" compared the label with the probability of the opposite outcome.
class_index = 1
predictions_class = prediction[:, class_index]

# Calculate residuals: actual label minus predicted probability of a stroke
residuals = y_test - predictions_class

# Plotting residuals
plt.scatter(predictions_class, residuals)
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot")
plt.axhline(y=0, color='k', linestyle='--')  # zero-residual reference line
plt.show()

# END

# BY : SAYED ALI