1. Import libraries


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

2. Load dataset

In [3]:
# Read the Alzheimer's disease dataset from a CSV path relative to the working directory.
df = pd.read_csv("./dataset/alzheimers_disease_data.csv")
# A bare DataFrame as the last expression of the cell renders pandas' truncated preview.
df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


3. Explore data

In [4]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [5]:
# Count the missing values in each column.
df.isnull().sum()

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [6]:
# Count the rows that exactly duplicate an earlier row.
df.duplicated().sum()

np.int64(0)

In [7]:
# (number of rows, number of columns)
df.shape

(2149, 35)

In [8]:
# List the distinct values of DoctorInCharge to see whether the column carries any information.
df['DoctorInCharge'].unique()

array(['XXXConfid'], dtype=object)

In [9]:
# Every row has the same DoctorInCharge value, so the column carries no information.
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns='DoctorInCharge', errors='ignore')

In [10]:
# Re-inspect the columns and dtypes after removing DoctorInCharge.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [11]:
# Continuous columns that are checked for outliers.
# Each name must match a DataFrame column exactly; the total-cholesterol column
# is spelled 'CholesterolTotal' (not 'CholesterolToTal').
numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity',
    'SystolicBP', 'DiastolicBP',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
    'MMSE',
]

In [12]:
# Box plot of every numerical column, used to look for outliers.
fig = px.box(
    data_frame=df,
    y=numerical_columns,
    title="Outlier Detection in Numerical Columns",
)
fig.show()

As we can see, the data does not contain any outliers.

4. Visualization using plotly

Demographic details:
- Age: The age of the patients ranges from 60 to 90 years.
- Gender: Gender of the patients, where 0 represents Male and 1 represents Female.
- Ethnicity: The ethnicity of the patients, coded as follows: 0: Caucasian, 1: African American, 2: Asian, 3: Other
- Education Level: The education level of the patients, coded as follows: 0: None, 1: High School, 2: Bachelor's, 3: Higher


In [13]:
# Bar chart of the Diagnosis column against Gender.
fig1 = px.bar(
    data_frame=df,
    x="Gender",
    y="Diagnosis",
)
fig1.show()

In [14]:
# Number of patients of each gender, with side-by-side bars for each diagnosis.
fig = px.histogram(
    data_frame=df,
    x="Gender",
    color="Diagnosis",
    barmode="group",
)
fig.show()

This figure shows that the difference between the number of male and female patients is not large, but males are still more likely to have Alzheimer's.

In [15]:
# Age plotted against the ethnicity code, with points coloured by diagnosis.
fig2 = px.scatter(
    data_frame=df,
    x="Ethnicity",
    y="Age",
    color="Diagnosis",
)
fig2.show()

This means Caucasian and Asian patients have Alzheimer's more often than the other groups.

Life style factor

- BMI: Body Mass Index of the patients, ranging from 15 to 40.
- Smoking: Smoking status, where 0 indicates No and 1 indicates Yes.
- Alcohol Consumption: Weekly alcohol consumption in units, ranging from 0 to 20.
- Physical Activity: Weekly physical activity in hours, ranging from 0 to 10.
- Diet Quality: Diet quality score, ranging from 0 to 10.
- Sleep Quality: Sleep quality score, ranging from 4 to 10.

In [16]:
# Average BMI for each smoking status, split by diagnosis.
# When `y` is given, px.histogram aggregates with histfunc='sum' by default, so bar
# height would mostly reflect how many patients are in each group. 'avg' makes the
# groups comparable, and barmode='group' avoids stacking the averages on each other.
fig3 = px.histogram(
    df,
    x='Smoking',
    y='BMI',
    color='Diagnosis',
    histfunc='avg',
    barmode='group',
)
fig3.show()

From this chart:
- People who do not smoke have a higher body mass than people who smoke.
- People who suffer from Alzheimer's have a higher body mass than others.


In [17]:
# Mean alcohol consumption for each diagnosis group.
# Passing the raw rows to px.bar stacks one segment per patient, so bar height is the
# group total and mostly reflects group size. Aggregate to the mean first so the two
# groups can be compared directly.
avg_alcohol = df.groupby('Diagnosis', as_index=False)['AlcoholConsumption'].mean()
fig4 = px.bar(
    avg_alcohol,
    x='Diagnosis',
    y='AlcoholConsumption',
    title='Average Alcohol Consumption by Diagnosis',
)
fig4.show()

Now we can confirm that people who drink alcohol most of the time are more likely to have Alzheimer's than others.



In [18]:
# Mean physical activity for each diagnosis group.
# Passing the raw rows to px.bar stacks one segment per patient, so bar height is the
# group total and mostly reflects group size. Aggregate to the mean first so the two
# groups can be compared directly.
avg_activity = df.groupby('Diagnosis', as_index=False)['PhysicalActivity'].mean()
fig5 = px.bar(
    avg_activity,
    x='Diagnosis',
    y='PhysicalActivity',
    title='Average Physical Activity by Diagnosis',
)
fig5.show()

People who regularly do physical activities are less susceptible to Alzheimer's than others.



Medical History:
- FamilyHistoryAlzheimers: Family history of Alzheimer's Disease, where 0 indicates No and 1 indicates Yes.
- CardiovascularDisease: Presence of cardiovascular disease, where 0 indicates No and 1 indicates Yes.
- Diabetes: Presence of diabetes, where 0 indicates No and 1 indicates Yes.
- Depression: Presence of depression, where 0 indicates No and 1 indicates Yes.
- HeadInjury: History of head injury, where 0 indicates No and 1 indicates Yes.
- Hypertension: Presence of hypertension, where 0 indicates No and 1 indicates Yes

In [19]:
# Patient counts by family history of Alzheimer's, with side-by-side bars for each diagnosis.
fig6 = px.histogram(
    data_frame=df,
    x="FamilyHistoryAlzheimers",
    color="Diagnosis",
    barmode="group",
    title="Patients with Family History with Alzheimers",
)
fig6.show()

In [20]:
# Share (in %) of all patients with and without a family history of Alzheimer's.
family_history_percentage = df['FamilyHistoryAlzheimers'].value_counts(normalize=True).mul(100)

# Turn the Series into a two-column frame and replace the 0/1 codes with readable labels.
family_history_df = (
    family_history_percentage
    .rename_axis('FamilyHistoryAlzheimers')
    .reset_index(name='Percentage')
    .assign(
        FamilyHistoryAlzheimers=lambda frame: frame['FamilyHistoryAlzheimers'].map({0: 'No', 1: 'Yes'})
    )
)

fig = px.pie(
    data_frame=family_history_df,
    names="FamilyHistoryAlzheimers",
    values="Percentage",
    title="Percentage of Family History of Alzheimer's Disease",
)
fig.show()

25.2% of patients have a family history of Alzheimer's, indicating a potential genetic predisposition.

In [33]:
# Share (in %) of all patients with and without cardiovascular disease.
# The percentages are taken over the whole dataset; Diagnosis is not used in this cell.
cardio_percentage = 100 * df['CardiovascularDisease'].value_counts(normalize=True)
cardio_df = cardio_percentage.reset_index()
cardio_df.columns = ['CardiovascularDisease', 'Percentage']

# Replace the 0/1 codes with readable slice labels.
cardio_df = cardio_df.assign(
    CardiovascularDisease=cardio_df['CardiovascularDisease'].map({0: 'No', 1: 'Yes'})
)

fig7 = px.pie(
    data_frame=cardio_df,
    names="CardiovascularDisease",
    values="Percentage",
    title="Percentage of Cardiovascular Disease",
)
fig7.show()

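Only 14.4% of the patients in the dataset have cardiovascular disease. Note that this is the share across all patients, not the share of cardiovascular patients who have Alzheimer's, so on its own it does not show how strongly the two diseases are related.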

In [34]:
# Share (in %) of all patients with and without diabetes.
# The percentages are taken over the whole dataset; Diagnosis is not used in this cell.
diabetes_percentage = df['Diabetes'].value_counts(normalize=True) * 100
diabetes_df = (
    diabetes_percentage
    .rename_axis('labels')
    .reset_index(name='values')
)
# Replace the 0/1 codes with readable slice labels.
diabetes_df['labels'] = diabetes_df['labels'].map({0: 'No', 1: 'Yes'})

fig8 = px.pie(
    data_frame=diabetes_df,
    names="labels",
    values="values",
    title="Percentage of Diabetes Disease",
)
fig8.show()

Again, diabetes and Alzheimer's do not appear to be closely related.

In [35]:
# Patient counts by depression status, with side-by-side bars for each diagnosis.
fig9 = px.histogram(
    data_frame=df,
    x="Depression",
    color="Diagnosis",
    barmode="group",
    title="Patient with Depression",
)
fig9.show()


Having depression and having Alzheimer's are not closely related to each other.

In [36]:
# Share (in %) of all patients with and without a history of head injury.
# The percentages are taken over the whole dataset; Diagnosis is not used in this cell.
head_injury_percentage = df['HeadInjury'].value_counts(normalize=True).mul(100)
head_injury_df = (
    head_injury_percentage
    .rename_axis('HeadInjury')
    .reset_index(name='Percentage')
)
# Replace the 0/1 codes with readable slice labels.
head_injury_df['HeadInjury'] = head_injury_df['HeadInjury'].map({0: 'No', 1: 'Yes'})

fig10 = px.pie(
    data_frame=head_injury_df,
    names="HeadInjury",
    values="Percentage",
    title="Percentage of Head Injury",
)
fig10.show()

9.26% of the patients in the dataset have a history of head injury, which is low. This is the share across all patients rather than the share of head-injury patients who have Alzheimer's, so it does not by itself show how the two are related.

In [25]:
# Share (in %) of all patients with and without hypertension.
# The percentages are taken over the whole dataset; Diagnosis is not used in this cell.
hypertension_percentage = 100 * df['Hypertension'].value_counts(normalize=True)
hypertension_df = hypertension_percentage.reset_index()
hypertension_df.columns = ['Hypertension', 'Percentage']

# Replace the 0/1 codes with readable slice labels.
hypertension_df = hypertension_df.assign(
    Hypertension=hypertension_df['Hypertension'].map({0: 'No', 1: 'Yes'})
)

fig11 = px.pie(
    data_frame=hypertension_df,
    names="Hypertension",
    values="Percentage",
    title="Percentage of Hypertension",
)
fig11.show()

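14.9% of the patients in the dataset have hypertension, the second-highest share after diabetes. This is the share across all patients, not the share of hypertensive patients who have Alzheimer's.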

Clinical Measurements

- SystolicBP: Systolic blood pressure, ranging from 90 to 180 mmHg.
- DiastolicBP: Diastolic blood pressure, ranging from 60 to 120 mmHg.
- CholesterolTotal: Total cholesterol levels, ranging from 150 to 300 mg/dL.
- CholesterolLDL: Low-density lipoprotein cholesterol levels, ranging from 50 to 200 mg/dL.
- CholesterolHDL: High-density lipoprotein cholesterol levels, ranging from 20 to 100 mg/dL.
- CholesterolTriglycerides: Triglycerides levels, ranging from 50 to 400 mg/dL

In [26]:
# Mean systolic and diastolic blood pressure per diagnosis group, one subplot per measure.
avg_bp = df.groupby('Diagnosis', as_index=False)[['SystolicBP', 'DiastolicBP']].mean()

fig12 = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Average Systolic Blood Pressure', 'Average Diastolic Blood Pressure'),
)
# Systolic goes in the left panel, diastolic in the right.
fig12.add_trace(
    go.Bar(name='SystolicBP', x=avg_bp['Diagnosis'], y=avg_bp['SystolicBP']),
    row=1, col=1,
)
fig12.add_trace(
    go.Bar(name='DiastolicBP', x=avg_bp['Diagnosis'], y=avg_bp['DiastolicBP']),
    row=1, col=2,
)
fig12.show()

There is no significant difference in average systolic and diastolic blood pressure between patients with and without Alzheimer's.

In [27]:
# Age versus cholesterol level (total, LDL, HDL) for patients diagnosed with Alzheimer's,
# drawn as one scatter subplot per cholesterol measure.
cholesterol_columns = ['CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL']

# Filter once up front instead of re-evaluating the Diagnosis mask twice per subplot.
alzheimers_patients = df[df['Diagnosis'] == 1]

fig13 = make_subplots(rows=1, cols=3, subplot_titles=cholesterol_columns)
for i, column in enumerate(cholesterol_columns, 1):
    fig13.add_trace(
        go.Scatter(
            x=alzheimers_patients['Age'],
            y=alzheimers_patients[column],
            mode='markers',
            name=column,
        ),
        row=1, col=i,
    )
    fig13.update_xaxes(title_text='Age', row=1, col=i)
    fig13.update_yaxes(title_text=column, row=1, col=i)
fig13.update_layout(title="Relationship Between Age and Cholesterol Levels Among Patients with Alzheimer's")
fig13.show()

There is no clear relationship between age and cholesterol levels (total, LDL, HDL) among patients with Alzheimer's.

Cognitive and Functional Assessments

- MMSE: Mini-Mental State Examination score, ranging from 0 to 30. Lower scores indicate cognitive impairment.
- FunctionalAssessment: Functional assessment score, ranging from 0 to 10. Lower scores indicate greater impairment.
- MemoryComplaints: Presence of memory complaints, where 0 indicates No and 1 indicates Yes.
- BehavioralProblems: Presence of behavioral problems, where 0 indicates No and 1 indicates Yes.
- ADL: Activities of Daily Living score, ranging from 0 to 10. Lower scores indicate greater impairment.

In [28]:
# Box plot of the functional assessment score for each diagnosis group.
fig14 = px.box(
    data_frame=df,
    x="Diagnosis",
    y="FunctionalAssessment",
)
fig14.show()

Patients with Alzheimer's have lower functional assessment scores compared to those without Alzheimer's.

In [29]:
# Patient counts by presence of behavioral problems, with side-by-side bars for each diagnosis.
fig14 = px.histogram(
    data_frame=df,
    x="BehavioralProblems",
    color="Diagnosis",
    barmode="group",
)
fig14.show()

People with behavioral problems are more likely to have Alzheimer's.

In [30]:
# Patient counts by presence of memory complaints, with side-by-side bars for each diagnosis.
fig15 = px.histogram(
    data_frame=df,
    x="MemoryComplaints",
    color="Diagnosis",
    barmode="group",
)
fig15.show()

In [31]:
# ADL (Activities of Daily Living) score of every patient, coloured by diagnosis.
# No x column is passed, so plotly chooses the horizontal axis itself.
fig16 = px.scatter(
    data_frame=df,
    y="ADL",
    color="Diagnosis",
)
fig16.show()

People with an ADL score between 0 and 4 are more likely to have Alzheimer's than people with an ADL score between 5 and 10.

Symptoms

In [32]:
# Patient counts by forgetfulness, with side-by-side bars for each diagnosis.
fig17 = px.histogram(
    data_frame=df,
    x="Forgetfulness",
    color="Diagnosis",
    barmode="group",
)
fig17.show()

People who are forgetful are more likely to have difficulty completing tasks, and people who have Alzheimer's suffer from forgetfulness more than people who do not.