In [None]:
import pandas as pd
import os
import kagglehub
from pyspark.sql import SparkSession

# Initialize (or reuse) the Spark session for this notebook
spark = SparkSession.builder.appName("AlzheimersDataset").getOrCreate()

# Download the latest version of the dataset; the returned value is the
# local directory the files were placed in
dataset_path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")

print("Dataset path:", dataset_path)

# List files in the dataset directory. os.listdir returns entries in
# arbitrary order, so sort them to keep the listing and the CSV choice
# below reproducible across runs and machines.
files = sorted(os.listdir(dataset_path))
print("Files in dataset:", files)

# Collect every CSV in the directory and use the first one (alphabetically)
csv_file = [f for f in files if f.endswith('.csv')]
if not csv_file:
    raise FileNotFoundError("No CSV file found in the dataset directory.")
csv_file_path = os.path.join(dataset_path, csv_file[0])

# Load the CSV file into a Spark DataFrame, letting Spark infer column types
df_spark = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Display the first few rows
df_spark.show(5)


Dataset path: C:\Users\qjone\.cache\kagglehub\datasets\rabieelkharoua\alzheimers-disease-dataset\versions\1
Files in dataset: ['alzheimers_disease_data.csv']
+---------+---+------+---------+--------------+------------------+-------+------------------+-----------------+------------------+-----------------+-----------------------+---------------------+--------+----------+----------+------------+----------+-----------+------------------+-----------------+-----------------+------------------------+------------------+--------------------+----------------+------------------+--------------------+---------+--------------+------------------+-------------------------+-------------+---------+--------------+
|PatientID|Age|Gender|Ethnicity|EducationLevel|               BMI|Smoking|AlcoholConsumption| PhysicalActivity|       DietQuality|     SleepQuality|FamilyHistoryAlzheimers|CardiovascularDisease|Diabetes|Depression|HeadInjury|Hypertension|SystolicBP|DiastolicBP|  CholesterolTotal|   Cholestero

In [3]:
# Collect the Spark DataFrame into a pandas DataFrame for the pandas-based checks below
df = df_spark.toPandas()

In [4]:
# Check dataframe info: row count plus per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int32  
 1   Age                        2149 non-null   int32  
 2   Gender                     2149 non-null   int32  
 3   Ethnicity                  2149 non-null   int32  
 4   EducationLevel             2149 non-null   int32  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int32  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int32  
 12  CardiovascularDisease      2149 non-null   int32  
 13  Diabetes                   2149 non-null   int32

We can see that there are no missing values and the dataset seems to be clean. Let's dig a little deeper to confirm this.

In [5]:
# Print the frequency table of every column, one column at a time
all_columns = list(df.columns)

for col_name in all_columns:
    counts = df[col_name].value_counts()
    print(f'These are the value counts for the column named: {col_name}')
    print(counts)

These are the value counts for the column named: PatientID
PatientID
4751    1
6179    1
6193    1
6192    1
6191    1
       ..
5462    1
5461    1
5460    1
5459    1
6899    1
Name: count, Length: 2149, dtype: int64
These are the value counts for the column named: Age
Age
88    84
68    84
72    82
76    81
71    80
90    79
67    77
60    74
70    74
66    73
89    72
77    72
78    72
84    71
83    71
62    70
63    69
80    68
61    68
87    68
82    68
73    66
65    64
75    64
69    63
64    59
79    57
85    57
81    57
74    55
86    50
Name: count, dtype: int64
These are the value counts for the column named: Gender
Gender
1    1088
0    1061
Name: count, dtype: int64
These are the value counts for the column named: Ethnicity
Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64
These are the value counts for the column named: EducationLevel
EducationLevel
1    854
2    636
0    446
3    213
Name: count, dtype: int64
These are the value counts for the

We can see that there are two columns that don't immediately look like they would be beneficial in our model training. These two columns are "DoctorInCharge", because there is only one value in the entire column, and "PatientID", because this column is a non-beneficial identifier column. Furthermore, we are dropping the only column that is an object. Thus our data is mostly clean and has the correct datatypes for model training.

In [7]:
# Build the cleaned frame without the two non-beneficial columns.
# `drop` returns a new DataFrame, so the original `df` is left untouched.
clean_df = df.drop(columns=['DoctorInCharge', 'PatientID'])

# Show the remaining column names to confirm both columns were dropped
list(clean_df.columns)

['Age',
 'Gender',
 'Ethnicity',
 'EducationLevel',
 'BMI',
 'Smoking',
 'AlcoholConsumption',
 'PhysicalActivity',
 'DietQuality',
 'SleepQuality',
 'FamilyHistoryAlzheimers',
 'CardiovascularDisease',
 'Diabetes',
 'Depression',
 'HeadInjury',
 'Hypertension',
 'SystolicBP',
 'DiastolicBP',
 'CholesterolTotal',
 'CholesterolLDL',
 'CholesterolHDL',
 'CholesterolTriglycerides',
 'MMSE',
 'FunctionalAssessment',
 'MemoryComplaints',
 'BehavioralProblems',
 'ADL',
 'Confusion',
 'Disorientation',
 'PersonalityChanges',
 'DifficultyCompletingTasks',
 'Forgetfulness',
 'Diagnosis']

With a cleaned dataframe, we can now do a short exploratory analysis before training our model to try and find any other pieces of information that might be helpful. 

In [20]:
# Columns whose values span a numeric range (as opposed to category/flag codes)
clean_columns = [
    'BMI',
    'AlcoholConsumption',
    'PhysicalActivity',
    'DietQuality',
    'SleepQuality',
    'CholesterolTotal',
    'CholesterolLDL',
    'CholesterolHDL',
    'CholesterolTriglycerides',
    'MMSE',
    'FunctionalAssessment',
    'ADL',
]

# Report the observed lower and upper bound of each ranged column
for feature in clean_columns:
    print(f'The min/max values for {feature} are:')
    values = clean_df[feature]
    print(f'Min: {values.min()}')
    print(f'Max: {values.max()}')
    print('---------------------------------------')

The min/max values for BMI are:
Min: 15.008851181631059
Max: 39.99276746402374
---------------------------------------
The min/max values for AlcoholConsumption are:
Min: 0.0020030991362718353
Max: 19.98929335906197
---------------------------------------
The min/max values for PhysicalActivity are:
Min: 0.003616016826019086
Max: 9.987429413422253
---------------------------------------
The min/max values for DietQuality are:
Min: 0.009384720116230039
Max: 9.99834567881401
---------------------------------------
The min/max values for SleepQuality are:
Min: 4.002628659826611
Max: 9.99984031668144
---------------------------------------
The min/max values for CholesterolTotal are:
Min: 150.09331559406317
Max: 299.99335247432657
---------------------------------------
The min/max values for CholesterolLDL are:
Min: 50.23070655980742
Max: 199.96566510142804
---------------------------------------
The min/max values for CholesterolHDL are:
Min: 20.00343401498445
Max: 99.98032407804152
----

In [18]:
# Keep only the ranged (non-binary) columns in a separate dataframe
non_binary_df = clean_df.loc[:, clean_columns]

# Preview the first five rows of the new dataframe
non_binary_df.head(5)

Unnamed: 0,BMI,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,MMSE,FunctionalAssessment,ADL
0,22.927749,13.297218,6.327112,1.347214,9.025679,242.36684,56.150897,33.682563,162.189143,21.463532,6.518877,1.725883
1,26.827681,4.542524,7.619885,0.518767,7.151293,231.162595,193.407996,79.028477,294.630909,20.613267,7.118696,2.592424
2,17.795882,19.555085,7.844988,1.826335,9.673574,284.181858,153.322762,69.772292,83.638324,7.356249,5.895077,7.119548
3,33.800817,12.209266,8.428001,7.435604,8.392554,159.58224,65.366637,68.457491,277.577358,13.991127,8.965106,6.481226
4,20.716974,18.454356,6.310461,0.795498,5.597238,237.602184,92.8697,56.874305,291.19878,13.517609,6.045039,0.014691


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for each ranged column
non_binary_df.describe()

Unnamed: 0,BMI,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,MMSE,FunctionalAssessment,ADL
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,27.655697,10.039442,4.920202,4.993138,7.051081,225.197519,124.335944,59.463533,228.281496,14.755132,5.080055,4.982958
std,7.217438,5.75791,2.857191,2.909055,1.763573,42.542233,43.366584,23.139174,101.986721,8.613151,2.892743,2.949775
min,15.008851,0.002003,0.003616,0.009385,4.002629,150.093316,50.230707,20.003434,50.407194,0.005312,0.00046,0.001288
25%,21.611408,5.13981,2.570626,2.458455,5.482997,190.252963,87.195798,39.095698,137.583222,7.167602,2.566281,2.342836
50%,27.823924,9.934412,4.766424,5.076087,7.115646,225.08643,123.342593,59.768237,230.301983,14.44166,5.094439,5.038973
75%,33.869778,15.157931,7.427899,7.558625,8.562521,262.031657,161.733733,78.93905,314.839046,22.161028,7.546981,7.58149
max,39.992767,19.989293,9.987429,9.998346,9.99984,299.993352,199.965665,99.980324,399.941862,29.991381,9.996467,9.999747


Everything looks good. We will now export our clean DataFrame into a csv file for model training. 

In [22]:
# Export the cleaned frame for the model-training step.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
os.makedirs('Data', exist_ok=True)

# NOTE(review): the target has no `.csv` extension, and the default
# index=True also writes the row index as an unnamed first column. Confirm
# what the training code expects before changing either.
clean_df.to_csv('Data/cleaned_alzheimers_dataset')