In [4]:
import os
import kagglehub
from pyspark.sql import SparkSession

# Initialize (or reuse) the Spark session used to read the dataset.
spark = SparkSession.builder.appName("AlzheimersDataset").getOrCreate()

# Download the latest version of the dataset; the returned value is used below
# as the local directory that holds the dataset files.
dataset_path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")

print("Dataset path:", dataset_path)

# List files in the dataset directory
files = os.listdir(dataset_path)
print("Files in dataset:", files)

# Auto-detect the CSV file(s) in the download directory.
# os.listdir() returns entries in arbitrary order, so the candidates are sorted
# to make the choice of csv_file[0] deterministic if more than one CSV is present.
# The extension check is case-insensitive so that e.g. 'DATA.CSV' is also found.
csv_file = sorted(f for f in files if f.lower().endswith('.csv'))
if not csv_file:
    raise FileNotFoundError("No CSV file found in the dataset directory.")
csv_file_path = os.path.join(dataset_path, csv_file[0])

# Load the CSV file into a Spark DataFrame: the first row is the header and
# column types are inferred from the data.
df_spark = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Display the first few rows
df_spark.show(5)

Dataset path: C:\Users\qjone\.cache\kagglehub\datasets\rabieelkharoua\alzheimers-disease-dataset\versions\1
Files in dataset: ['alzheimers_disease_data.csv']
+---------+---+------+---------+--------------+------------------+-------+------------------+-----------------+------------------+-----------------+-----------------------+---------------------+--------+----------+----------+------------+----------+-----------+------------------+-----------------+-----------------+------------------------+------------------+--------------------+----------------+------------------+--------------------+---------+--------------+------------------+-------------------------+-------------+---------+--------------+
|PatientID|Age|Gender|Ethnicity|EducationLevel|               BMI|Smoking|AlcoholConsumption| PhysicalActivity|       DietQuality|     SleepQuality|FamilyHistoryAlzheimers|CardiovascularDisease|Diabetes|Depression|HeadInjury|Hypertension|SystolicBP|DiastolicBP|  CholesterolTotal|   Cholesterol

In [5]:
# Convert the Spark DataFrame loaded above into a pandas DataFrame for inspection.
df = df_spark.toPandas()

In [6]:
# Print the column names, non-null counts and dtypes of the pandas frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int32  
 1   Age                        2149 non-null   int32  
 2   Gender                     2149 non-null   int32  
 3   Ethnicity                  2149 non-null   int32  
 4   EducationLevel             2149 non-null   int32  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int32  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int32  
 12  CardiovascularDisease      2149 non-null   int32  
 13  Diabetes                   2149 non-null   int32

In [7]:
# Walk over every column of the frame and print its frequency table,
# preceded by a one-line banner naming the column.
all_columns = list(df.columns)

for column in all_columns:
    banner = f'These are the value counts for the column named: {column}'
    frequencies = df[column].value_counts()
    print(banner, frequencies, sep='\n')

These are the value counts for the column named: PatientID
PatientID
4751    1
6179    1
6193    1
6192    1
6191    1
       ..
5462    1
5461    1
5460    1
5459    1
6899    1
Name: count, Length: 2149, dtype: int64
These are the value counts for the column named: Age
Age
88    84
68    84
72    82
76    81
71    80
90    79
67    77
60    74
70    74
66    73
89    72
77    72
78    72
84    71
83    71
62    70
63    69
80    68
61    68
87    68
82    68
73    66
65    64
75    64
69    63
64    59
79    57
85    57
81    57
74    55
86    50
Name: count, dtype: int64
These are the value counts for the column named: Gender
Gender
1    1088
0    1061
Name: count, dtype: int64
These are the value counts for the column named: Ethnicity
Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64
These are the value counts for the column named: EducationLevel
EducationLevel
1    854
2    636
0    446
3    213
Name: count, dtype: int64
These are the value counts for the

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df.
df.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,5825.0,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,620.507185,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,4751.0,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,5288.0,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,5825.0,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,6362.0,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Print the column names, non-null counts and dtypes of the pandas frame.
# NOTE(review): df.info() was already run in an earlier cell and none of the
# visible cells in between modify df, so this looks redundant — confirm and drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int32  
 1   Age                        2149 non-null   int32  
 2   Gender                     2149 non-null   int32  
 3   Ethnicity                  2149 non-null   int32  
 4   EducationLevel             2149 non-null   int32  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int32  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int32  
 12  CardiovascularDisease      2149 non-null   int32  
 13  Diabetes                   2149 non-null   int32