In [1]:
import pandas as pd
import os
import kagglehub
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
import keras_tuner as kt
from sklearn.preprocessing import StandardScaler

# Initialize Spark session
spark = SparkSession.builder.appName("AlzheimersDataset").getOrCreate()

# Download the latest version of the dataset
dataset_path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")

print("Dataset path:", dataset_path)

# List files in the dataset directory
files = os.listdir(dataset_path)
print("Files in dataset:", files)

# Choose the correct CSV file (replace 'your_file.csv' with the actual file name)
csv_file = [f for f in files if f.endswith('.csv')]
if not csv_file:
    raise FileNotFoundError("No CSV file found in the dataset directory.")
csv_file_path = os.path.join(dataset_path, csv_file[0])

# Load the CSV file into a Spark DataFrame
df_spark = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Display the first few rows
df_spark.show(5)


Dataset path: C:\Users\qjone\.cache\kagglehub\datasets\rabieelkharoua\alzheimers-disease-dataset\versions\1
Files in dataset: ['alzheimers_disease_data.csv']
+---------+---+------+---------+--------------+------------------+-------+------------------+-----------------+------------------+-----------------+-----------------------+---------------------+--------+----------+----------+------------+----------+-----------+------------------+-----------------+-----------------+------------------------+------------------+--------------------+----------------+------------------+--------------------+---------+--------------+------------------+-------------------------+-------------+---------+--------------+
|PatientID|Age|Gender|Ethnicity|EducationLevel|               BMI|Smoking|AlcoholConsumption| PhysicalActivity|       DietQuality|     SleepQuality|FamilyHistoryAlzheimers|CardiovascularDisease|Diabetes|Depression|HeadInjury|Hypertension|SystolicBP|DiastolicBP|  CholesterolTotal|   Cholestero

In [2]:
df = df_spark.toPandas()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int32  
 1   Age                        2149 non-null   int32  
 2   Gender                     2149 non-null   int32  
 3   Ethnicity                  2149 non-null   int32  
 4   EducationLevel             2149 non-null   int32  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int32  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int32  
 12  CardiovascularDisease      2149 non-null   int32  
 13  Diabetes                   2149 non-null   int32

In [4]:
all_columns = df.columns.tolist()

for column in all_columns:
    print(f'These are the value counts for the column named: {column}')
    print(df[column].value_counts())

These are the value counts for the column named: PatientID
PatientID
4751    1
6179    1
6193    1
6192    1
6191    1
       ..
5462    1
5461    1
5460    1
5459    1
6899    1
Name: count, Length: 2149, dtype: int64
These are the value counts for the column named: Age
Age
88    84
68    84
72    82
76    81
71    80
90    79
67    77
60    74
70    74
66    73
89    72
77    72
78    72
84    71
83    71
62    70
63    69
80    68
61    68
87    68
82    68
73    66
65    64
75    64
69    63
64    59
79    57
85    57
81    57
74    55
86    50
Name: count, dtype: int64
These are the value counts for the column named: Gender
Gender
1    1088
0    1061
Name: count, dtype: int64
These are the value counts for the column named: Ethnicity
Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64
These are the value counts for the column named: EducationLevel
EducationLevel
1    854
2    636
0    446
3    213
Name: count, dtype: int64
These are the value counts for the

In [5]:
df.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,5825.0,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,620.507185,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,4751.0,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,5288.0,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,5825.0,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,6362.0,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int32  
 1   Age                        2149 non-null   int32  
 2   Gender                     2149 non-null   int32  
 3   Ethnicity                  2149 non-null   int32  
 4   EducationLevel             2149 non-null   int32  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int32  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int32  
 12  CardiovascularDisease      2149 non-null   int32  
 13  Diabetes                   2149 non-null   int32

In [7]:
# Create a copy of the dataframe
df_copy = df.copy()

# Step 1: Drop uneeded columns
df_copy = df_copy.drop(columns=(['PatientID', 'DoctorInCharge']))

# Step 2: Define features and target
X = df_copy.drop(columns='Diagnosis')
y = df_copy['Diagnosis']

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize the model
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_test)
# Step 10: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Additional classification report
print(classification_report(y_test, y_pred))

# If you need to further evaluate feature importances
feature_importances = best_rf.feature_importances_
features = X.columns

# Create a DataFrame for better readability
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Accuracy: 0.9465
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.88      0.92       153

    accuracy                           0.95       430
   macro avg       0.95      0.93      0.94       430
weighted avg       0.95      0.95      0.95       430

                      Feature  Importance
23       FunctionalAssessment    0.202252
26                        ADL    0.176325
22                       MMSE    0.128243
24           MemoryComplaints    0.089677
25         BehavioralProblems    0.050370
8                 DietQuality    0.027584
21   CholesterolTriglycerides    0.027234
20             CholesterolHDL    0.026933
7            PhysicalActivity    0.026467
9                SleepQuality    0.026458
18           CholesterolTotal    0.026181
4                         BMI    0.025284
19             CholesterolLDL    0.024569
6    

In [None]:
# Create my df
my_df = df_copy.copy()

# Declare dcolumns to drop


# Grab only my columns to evaluate
my_df = my_df.drop()

# Define features and target
X = df_copy.drop(columns='Diagnosis')
y = df_copy['Diagnosis']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Check dimensions
print(f"Training Data Shape: {X_train_scaled.shape}")
print(f"Test Data Shape: {X_test_scaled.shape}")

Training Data Shape: (1719, 32)
Test Data Shape: (430, 32)


In [12]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide activation function
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # First hidden layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=25, max_value=200, step=25),
        activation=activation,
        input_dim=32
    ))

    # Hidden layers (1 to 4)
    for i in range(hp.Int('num_layers', 1, 4)):
        units = hp.Int(f'units_{i}', min_value=35, max_value=300, step=25)
        nn_model.add(tf.keras.layers.Dense(units, activation=activation))

    # Output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile with tunable optimizer
    optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    nn_model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    return nn_model


In [13]:
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define the Hyperband tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=70,  
    hyperband_iterations=8  
)

Reloading Tuner from .\untitled_project\tuner0.json


In [14]:
# Run the Keras Tuner search
tuner.search(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping])

Trial 714 Complete [00h 00m 05s]
val_accuracy: 0.8441860675811768

Best val_accuracy So Far: 0.8581395149230957
Total elapsed time: 01h 20m 18s


In [19]:
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)


top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

{'activation': 'tanh', 'first_units': 125, 'num_layers': 4, 'units_0': 185, 'optimizer': 'rmsprop', 'units_1': 160, 'units_2': 210, 'units_3': 110, 'tuner/epochs': 24, 'tuner/initial_epoch': 8, 'tuner/bracket': 3, 'tuner/round': 2, 'tuner/trial_id': '0034'}
{'activation': 'tanh', 'first_units': 100, 'num_layers': 4, 'units_0': 235, 'optimizer': 'rmsprop', 'units_1': 285, 'units_2': 260, 'units_3': 285, 'tuner/epochs': 24, 'tuner/initial_epoch': 8, 'tuner/bracket': 3, 'tuner/round': 2, 'tuner/trial_id': '0211'}
{'activation': 'tanh', 'first_units': 150, 'num_layers': 3, 'units_0': 285, 'optimizer': 'adam', 'units_1': 85, 'units_2': 110, 'units_3': 160, 'tuner/epochs': 8, 'tuner/initial_epoch': 3, 'tuner/bracket': 3, 'tuner/round': 1, 'tuner/trial_id': '0272'}


ValueError: in user code:

    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\engine\training.py", line 2066, in test_function  *
        return step_function(self, iterator)
    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\engine\training.py", line 2049, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\engine\training.py", line 2037, in run_step  **
        outputs = model.test_step(data)
    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\engine\training.py", line 1917, in test_step
        y_pred = self(x, training=False)
    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\qjone\anaconda3\envs\dev\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 32), found shape=(None, 8)


In [17]:
# Create my df
my_df = df.copy()

# Grab only my columns to evaluate
my_df = my_df.filter(items=['DietQuality', 'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'Diagnosis'])

my_df.head()

Unnamed: 0,DietQuality,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,HeadInjury,Hypertension,Diagnosis
0,1.347214,9.025679,0,0,1,1,0,0,0
1,0.518767,7.151293,0,0,0,0,0,0,0
2,1.826335,9.673574,1,0,0,0,0,0,0
3,7.435604,8.392554,0,0,0,0,0,0,0
4,0.795498,5.597238,0,0,0,0,0,0,0


In [18]:
# Define features and target
X = my_df.drop(columns='Diagnosis')
y = my_df['Diagnosis']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Check dimensions
print(f"Training Data Shape: {X_train_scaled.shape}")
print(f"Test Data Shape: {X_test_scaled.shape}")

Training Data Shape: (1719, 8)
Test Data Shape: (430, 8)


In [20]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide activation function
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # First hidden layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=25, max_value=200, step=25),
        activation=activation,
        input_dim=8
    ))

    # Hidden layers (1 to 4)
    for i in range(hp.Int('num_layers', 1, 4)):
        units = hp.Int(f'units_{i}', min_value=35, max_value=300, step=25)
        nn_model.add(tf.keras.layers.Dense(units, activation=activation))

    # Output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile with tunable optimizer
    optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    nn_model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    return nn_model

In [22]:
# Reset tuner if needed
import shutil
import os

# Define the tuner directory based on output
tuner_dir = os.path.join(".", "untitled_project")

# Delete the tuner directory
shutil.rmtree(tuner_dir, ignore_errors=True)

# Confirm deletion
print("Deleted tuner directory:", not os.path.exists(tuner_dir))

Deleted tuner directory: True


In [23]:
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define the Hyperband tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=70,  
    hyperband_iterations=8  
)

In [24]:
# Run the Keras Tuner search
tuner.search(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping])

Trial 151 Complete [00h 00m 06s]
val_accuracy: 0.6465116143226624

Best val_accuracy So Far: 0.6581395268440247
Total elapsed time: 00h 31m 15s

Search: Running Trial #152

Value             |Best Value So Far |Hyperparameter
sigmoid           |tanh              |activation
50                |150               |first_units
3                 |1                 |num_layers
135               |235               |units_0
rmsprop           |adam              |optimizer
85                |135               |units_1
35                |35                |units_2
135               |85                |units_3
8                 |8                 |tuner/epochs
0                 |3                 |tuner/initial_epoch
2                 |3                 |tuner/bracket
0                 |1                 |tuner/round

Epoch 1/8


KeyboardInterrupt: 

In [None]:
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)


top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")