In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load csv into a DataFrame
prediction_df = pd.read_csv('full_data.csv')
prediction_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [3]:
# Make a copy of the dataframe and drop unnecessary columns and target column
features_df = prediction_df.copy().drop(columns=['ever_married', 'work_type',
                                                 'Residence_type', 'stroke'])

# Convert categorical data to numerical
dummies = pd.get_dummies(features_df[['gender', 'smoking_status']])

# Combine dummies and features_df and drop the original columns that have been split
features_df = pd.concat([features_df, dummies], axis=1)
features_df = features_df.drop(columns=['gender', 'smoking_status'])
features_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,0,1,0,1,0,0
1,80.0,0,1,105.92,32.5,0,1,0,0,1,0
2,49.0,0,0,171.23,34.4,1,0,0,0,0,1
3,79.0,1,0,174.12,24.0,1,0,0,0,1,0
4,81.0,0,0,186.21,29.0,0,1,0,1,0,0


In [4]:
# Target Column
y = prediction_df['stroke']

# Feature Columns
X = features_df

In [5]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [6]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a Logistic Regression model
model = LogisticRegression(solver='lbfgs', max_iter=200)

# Fit the model using training data
model.fit(X_train_scaled, y_train)

In [8]:
# Make predictions using the testing data
predictions = model.predict(X_test_scaled)

In [9]:
# Print the balanced_accuracy score of the model
balancedAccuracy = balanced_accuracy_score(y_test, predictions)
print(f"Balanced Accuracy Score: {balancedAccuracy * 100:.2f}%")

Balanced Accuracy Score: 50.00%


In [10]:
# Print the accuracy score
accuracyScore = accuracy_score(y_test, predictions)
print(f"Accuracy Score: {accuracyScore * 100:.2f}%")

Accuracy Score: 95.02%


In [11]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1184
           1       0.00      0.00      0.00        62

    accuracy                           0.95      1246
   macro avg       0.48      0.50      0.49      1246
weighted avg       0.90      0.95      0.93      1246



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Create the random forest classifier model
rfModel = RandomForestClassifier(n_estimators=500, random_state=42)

In [13]:
# Fit the model on the scaled training data
rfModel = rfModel.fit(X_train_scaled, y_train)

In [14]:
# Make a list of predictions using the scaled testing data
rfPredictions = rfModel.predict(X_test_scaled)
rfPredictions[:3]

array([0, 0, 0])

In [15]:
# Print the balanced_accuracy score of the model
rfBalancedAccuracy = balanced_accuracy_score(y_test, rfPredictions)
print(f"Random Forest Balanced Accuracy Score: {balancedAccuracy * 100:.2f}%")

Random Forest Balanced Accuracy Score: 50.00%


In [16]:
# Print the accuracy score
rfAccuracyScore = accuracy_score(y_test, rfPredictions)
print(f"Random Forest Accuracy Score: {rfAccuracyScore * 100:.2f}%")

Random Forest Accuracy Score: 94.86%


In [17]:
import tensorflow as tf

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_of_inputs = len(X_train_scaled[0])

hidden_nodes_layer_1 = 15
hidden_nodes_layer_2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer_1, activation="relu", input_dim=number_of_inputs))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer_2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 15)                180       
                                                                 
 dense_1 (Dense)             (None, 5)                 80        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 266 (1.04 KB)
Trainable params: 266 (1.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
# Train the model
nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7a1d40c6b910>

In [20]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1736 - accuracy: 0.9486 - 359ms/epoch - 9ms/step
Loss: 0.17361515760421753, Accuracy: 0.9486356377601624


In [21]:
# make the function for hypertuning

def createModel(hp):

  # set up the sequential model
  nn = tf.keras.models.Sequential()

  # set up the choices for the activation functions needed in the hidden layers
  activationList = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

  # loop through a number of attributes on the first layer

  nn.add(
      tf.keras.layers.Dense(
          input_dim=len(X_train_scaled[0]),
          activation=activationList,
          units=hp.Int('first_units', min_value=1, max_value=25)
      ))

  # test different numbers of layers as well as neurons on those hidden layers
  for num in range(hp.Int('num_layers', 1, 11)):

    # add the layer (or layers) with a neuron (or neurons) on each layer
    nn.add(
        tf.keras.layers.Dense(
            activation=activationList,
            units=hp.Int(
                'units_' + str(num),
                min_value=1,
                max_value=25,
            )
        )
    )

  # set up the output layer (1 node)
  nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  # compile the model
  nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # return the model to used in the iteration
  return nn

In [22]:
pip install -q keras-tuner

In [23]:
# import the keras-tuner modules
import keras_tuner as kt

In [24]:
# use kt Hyperband function to run the optimizer
tuner = kt.Hyperband(
    createModel,
    objective='val_accuracy',
    max_epochs=50,
    hyperband_iterations=2
)

Reloading Tuner from ./untitled_project/tuner0.json


In [25]:
# use the .search() function to find the best combo of hyperparameters
tuner.search(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test))

In [26]:
bestDLModel = tuner.get_best_models(1)[0]

DLModel02Loss, DLModel02Accuracy = bestDLModel.evaluate(X_test_scaled, y_test)

