In [29]:
#importing the data
library(tidyverse)
library(dplyr)

df <- read.csv("healthcare-dataset-stroke-data.csv", na.strings = "N/A")
head(df)


Unnamed: 0_level_0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<int>
1,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
3,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
4,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
5,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
6,56669,Male,81,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


# New Section

In [30]:
#simple preprocessing
#removing id column
df <- df[, !(names(df) %in% "id")]
colnames(df)

In [31]:
df$ever_married = factor(df$ever_married)
df$work_type = factor(df$work_type)
df$Residence_type = factor(df$Residence_type)
df$smoking_status = factor(df$smoking_status)

In [32]:

#added later
df$hypertension <- as.numeric(factor(df$hypertension))
df$heart_disease <- as.numeric(factor(df$heart_disease))


In [33]:
df <- df[df$gender != "Other", ]
df$gender = factor(df$gender)

In [34]:
median_bmi <- median(df$bmi, na.rm = TRUE)
df$bmi[is.na(df$bmi)] <- median_bmi

In [35]:
#removing row with other- only 1 row for that
str(df)

'data.frame':	5109 obs. of  11 variables:
 $ gender           : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 2 1 1 1 ...
 $ age              : num  67 61 80 49 79 81 74 69 59 78 ...
 $ hypertension     : num  1 1 1 1 2 1 2 1 1 1 ...
 $ heart_disease    : num  2 1 2 1 1 1 2 1 1 1 ...
 $ ever_married     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
 $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
 $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
 $ avg_glucose_level: num  229 202 106 171 174 ...
 $ bmi              : num  36.6 28.1 32.5 34.4 24 29 27.4 22.8 28.1 24.2 ...
 $ smoking_status   : Factor w/ 4 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 4 4 ...
 $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...


In [36]:
na_values <- sum(is.na(df))
na_values_per_column <- colSums(is.na(df))
na_values_per_column

In [None]:
install.packages('caret')


# # #min-max scaling of the numerical columns
# library(caret)
# # List of numeric columns you want to normalize
# numeric_columns <- c("age", "avg_glucose_level", "bmi")

# # Use preProcess to normalize
# preproc <- preProcess(df[, numeric_columns], method = c("range"))
# df_normalized <- predict(preproc, df[, numeric_columns])

# # Replace the original columns with the standardized data
# df[, numeric_columns] <- df_normalized


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [None]:
install.packages('ROSE')

library(ROSE)
library(caret)

#split the data into training and testing set
set.seed(25) # for reproducibility
sample_size <- floor(0.80 * nrow(df)) # 75% for training
train_indices <- sample(seq_len(nrow(df)), size = sample_size)

train_set <- df[train_indices, ]
test_set <- df[-train_indices, ]

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [None]:

train_set$stroke <- factor(train_set$stroke, levels = c(0, 1))
test_set$stroke <- factor(test_set$stroke, levels = c(0, 1))

In [None]:
balanced_train_set <- ovun.sample(stroke ~ ., data = train_set, method = "over")$data

In [None]:
install.packages('keras')
library(keras)

In [None]:
library(caret)
library(keras)

In [None]:
str(balanced_train_set)

In [16]:
library(caret)
library(dplyr)

X_train <- select(balanced_train_set, -stroke)  # Exclude the response variable
y_train <- balanced_train_set$stroke  # Directly access the column

# Convert y_train to numeric if it's a factor and not already numeric
y_train <- as.numeric(as.character(y_train))

# Apply dummyVars without fullRank to include all levels
dummyVarsOut <- dummyVars(~., data = X_train, fullRank = FALSE)
X_train <- predict(dummyVarsOut, X_train)

# Ensure X_train is a matrix, which is what Keras expects
X_train <- as.matrix(X_train)

preProcValues <- preProcess(X_train, method = c("center", "scale"))
X_train <- predict(preProcValues, X_train)


In [17]:
if(any(sapply(X_train, is.character)) | any(sapply(y_train, is.character))) {
  stop("Error: String data detected in the training set. Please ensure all data is numeric.")
}

In [18]:
model <- keras_model_sequential() %>%
  layer_dense(units = 1024, activation = "relu", input_shape = ncol(X_train)) %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 512, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 256, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 128, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 64, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 32, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(rate = 0.2) %>%

  layer_dense(units = 1, activation = "sigmoid")  # For binary classification

# Compile the model
optimizer <- optimizer_rmsprop(learning_rate = 0.001)
model %>% compile(
  optimizer = optimizer,
  loss = 'binary_crossentropy',
  metrics = c('accuracy')
)


# Summary of the model
summary(model)

early_stopping <- callback_early_stopping(
  monitor = "val_loss",     # Monitor the validation loss
  patience = 25,            # Number of epochs with no improvement after which training will be stopped
  restore_best_weights = TRUE  # Restore model weights from the epoch with the best value of the monitored quantity
)

# Fit the model to the training data, might consider adjusting epochs and batch_size
history <- model %>% fit(
  X_train,
  y_train,
  epochs = 150, # Increased number of epochs
  batch_size = 64, # Adjusted batch size
  validation_split = 0.2,
  callbacks = list(early_stopping)
)

Model: "sequential"
________________________________________________________________________________
 Layer (type)                  Output Shape               Param #    Trainable  
 dense_6 (Dense)               (None, 1024)               21504      Y          
 batch_normalization_5 (Batch  (None, 1024)               4096       Y          
 Normalization)                                                                 
 dropout_5 (Dropout)           (None, 1024)               0          Y          
 dense_5 (Dense)               (None, 512)                524800     Y          
 batch_normalization_4 (Batch  (None, 512)                2048       Y          
 Normalization)                                                                 
 dropout_4 (Dropout)           (None, 512)                0          Y          
 dense_4 (Dense)               (None, 256)                131328     Y          
 batch_normalization_3 (Batch  (None, 256)                1024       Y          
 Normali

In [19]:
library(caret)
library(dplyr)

# Assuming test_set_processed is your test dataset and has been defined similarly to balanced_train_set
X_test <- select(test_set, -stroke)  # Exclude the target variable
y_test <- test_set$stroke  # Access the target column

# Convert y_test to numeric if it's a factor and not already numeric
y_test <- as.numeric(as.character(y_test))

# Use the same dummyVars object (dummyVarsOut) created for X_train to transform X_test
# This ensures that X_test is processed with the exact same feature encoding as X_train
X_test <- predict(dummyVarsOut, newdata = X_test)

# Ensure X_test is a matrix, which is what Keras expects
X_test <- as.matrix(X_test)
X_test <- predict(preProcValues, X_test)

In [20]:
# Evaluate the model on the test data
evaluation_results <- model %>% evaluate(X_test, y_test, verbose = 0)

# Since the structure of evaluation_results might not be a named list, use indexing
# Normally, the first element is loss, and the second is accuracy, but this can depend on how you've compiled your model
cat("Test Loss:", evaluation_results[1], "\n")
cat("Test Accuracy:", evaluation_results[2], "\n")


Test Loss: 0.5783638 
Test Accuracy: 0.907045 


In [26]:
predictions = predict(model, X_test)

In [27]:
conf = confusionMatrix(table(predictions, y_test))
print(conf)

ERROR: Error in !all.equal(nrow(data), ncol(data)): invalid argument type


In [28]:
predictions

0
9.902951e-04
8.974259e-01
2.949018e-06
1.332302e-04
6.024721e-06
1.435361e-04
2.175874e-04
4.427740e-04
3.064098e-05
1.000238e-04


In [None]:
library(keras)

# Assume 'model' is your trained Keras model
save_model_hdf5(model, "neural_networks.h5")
