# Demographic Analysis of Alzheimer's Disease

This notebook explores how demographic factors like age, gender, education, and geographical location relate to Alzheimer's diagnosis.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"
from plotly.subplots import make_subplots

In [2]:
df = pd.read_csv('alzheimers_prediction_dataset.csv')

df

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74278,Russia,60,Female,3,22.6,High,Former,Never,No,No,...,Average,High,Unemployed,Widowed,No,Medium,High,Medium,Rural,No
74279,UK,58,Male,18,30.6,Low,Never,Occasionally,Yes,No,...,Average,Medium,Unemployed,Single,No,Medium,High,High,Rural,No
74280,Spain,57,Female,13,28.2,Medium,Never,Regularly,No,No,...,Healthy,Low,Employed,Single,Yes,High,Low,Low,Rural,No
74281,Brazil,73,Female,7,29.0,Low,Never,Regularly,No,No,...,Healthy,Low,Employed,Widowed,No,Low,Low,High,Rural,No


## Data Overview and Cleaning

Let's first check for missing values and understand our dataset structure.

In [3]:
df.isnull().sum()

Country                                 0
Age                                     0
Gender                                  0
Education Level                         0
BMI                                     0
Physical Activity Level                 0
Smoking Status                          0
Alcohol Consumption                     0
Diabetes                                0
Hypertension                            0
Cholesterol Level                       0
Family History of Alzheimer’s           0
Cognitive Test Score                    0
Depression Level                        0
Sleep Quality                           0
Dietary Habits                          0
Air Pollution Exposure                  0
Employment Status                       0
Marital Status                          0
Genetic Risk Factor (APOE-ε4 allele)    0
Social Engagement Level                 0
Income Level                            0
Stress Levels                           0
Urban vs Rural Living             

In [4]:
df.index

RangeIndex(start=0, stop=74283, step=1)

In [5]:
df.columns

Index(['Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimer’s', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk Factor (APOE-ε4 allele)', 'Social Engagement Level',
       'Income Level', 'Stress Levels', 'Urban vs Rural Living',
       'Alzheimer’s Diagnosis'],
      dtype='object')

In [6]:

df.describe()

Unnamed: 0,Age,Education Level,BMI,Cognitive Test Score
count,74283.0,74283.0,74283.0,74283.0
mean,71.964703,9.487514,26.780639,64.654241
std,12.980748,5.75702,4.764679,20.153247
min,50.0,0.0,18.5,30.0
25%,61.0,4.0,22.7,47.0
50%,72.0,9.0,26.8,65.0
75%,83.0,14.0,30.9,82.0
max,94.0,19.0,35.0,99.0


## Age Distribution Analysis

Analyzing how Alzheimer's diagnosis rates vary across different age groups.

In [7]:
df.sort_index()
df.sort_index(ascending=False)
df.sort_index(axis=1)

Unnamed: 0,Age,Air Pollution Exposure,Alcohol Consumption,Alzheimer’s Diagnosis,BMI,Cholesterol Level,Cognitive Test Score,Country,Depression Level,Diabetes,...,Genetic Risk Factor (APOE-ε4 allele),Hypertension,Income Level,Marital Status,Physical Activity Level,Sleep Quality,Smoking Status,Social Engagement Level,Stress Levels,Urban vs Rural Living
0,90,High,Occasionally,No,33.0,Normal,90,Spain,Low,No,...,No,No,Medium,Single,Medium,Poor,Never,Low,High,Urban
1,72,Medium,Never,No,29.9,Normal,65,Argentina,Low,No,...,No,No,Low,Widowed,Medium,Good,Former,High,High,Urban
2,86,Medium,Occasionally,No,22.9,Normal,43,South Africa,High,No,...,No,Yes,Medium,Single,High,Good,Current,Low,High,Rural
3,53,Medium,Regularly,No,31.2,Normal,81,China,Medium,Yes,...,No,No,Medium,Single,Low,Average,Never,High,Low,Rural
4,58,High,Never,No,30.0,Normal,49,Sweden,High,Yes,...,No,No,Medium,Married,High,Poor,Former,Low,High,Rural
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74278,60,High,Never,No,22.6,High,42,Russia,Medium,No,...,No,No,High,Widowed,High,Poor,Former,Medium,Medium,Rural
74279,58,Medium,Occasionally,No,30.6,Normal,42,UK,Low,Yes,...,No,No,High,Single,Low,Poor,Never,Medium,High,Rural
74280,57,Low,Regularly,No,28.2,High,61,Spain,Low,No,...,Yes,No,Low,Single,Medium,Good,Never,High,Low,Rural
74281,73,Low,Regularly,No,29.0,High,97,Brazil,High,No,...,No,No,Low,Widowed,Low,Good,Never,Low,High,Rural


In [8]:
df['Age']

0        90
1        72
2        86
3        53
4        58
         ..
74278    60
74279    58
74280    57
74281    73
74282    57
Name: Age, Length: 74283, dtype: int64

In [9]:
# Age Distribution Analysis
# Create age groups
bins = [0, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90+']
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# Calculate diagnosis rate by age group
age_analysis = df.groupby('Age Group')["Alzheimer’s Diagnosis"].apply(lambda x: (x == 'Yes').mean() * 100).reset_index()
age_analysis.columns = ['Age Group', 'Diagnosis Rate (%)']

# Create Plotly bar chart
fig = px.bar(age_analysis, 
             x='Age Group', 
             y='Diagnosis Rate (%)',
             title='Alzheimer\'s Diagnosis Rate by Age Group',
             labels={'Age Group': 'Age Range', 'Diagnosis Rate (%)': 'Diagnosis Rate (%)'},
             color='Diagnosis Rate (%)',
             color_continuous_scale='Viridis')

fig.update_layout(
    title_x=0.5,
    plot_bgcolor='white',
    showlegend=False,
    width=800,
    height=500
)

fig.update_traces(
    texttemplate='%{y:.1f}%',
    textposition='outside'
)

fig.update_xaxes(title_font=dict(size=12), tickfont=dict(size=10))
fig.update_yaxes(title_font=dict(size=12), tickfont=dict(size=10))

fig.show()

# Print the numbers
print('\nDiagnosis rates by age group:')
print(age_analysis)

  age_analysis = df.groupby('Age Group')["Alzheimer’s Diagnosis"].apply(lambda x: (x == 'Yes').mean() * 100).reset_index()



Diagnosis rates by age group:
  Age Group  Diagnosis Rate (%)
0       <30                 NaN
1     30-40                 NaN
2     40-50                 NaN
3     50-60           15.626900
4     60-70           23.890355
5     70-80           47.711942
6     80-90           66.179857
7       90+           65.643275


## Gender Distribution Analysis

Comparing gender distribution between diagnosed and non-diagnosed individuals.

In [10]:
# Calculate gender distribution for diagnosed and non-diagnosed cases
gender_diagnosis = pd.crosstab(df['Gender'], df["Alzheimer’s Diagnosis"])

# Create two subplots
fig = make_subplots(rows=1, cols=2, 
                    subplot_titles=('Gender Distribution - Diagnosed', 'Gender Distribution - Not Diagnosed'),
                    specs=[[{'type':'domain'}, {'type':'domain'}]])

# Add pie charts
fig.add_trace(go.Pie(labels=gender_diagnosis.index, 
                     values=gender_diagnosis['Yes'],
                     name="Diagnosed",
                     marker_colors=['#2ecc71', '#3498db']),
              1, 1)

fig.add_trace(go.Pie(labels=gender_diagnosis.index, 
                     values=gender_diagnosis['No'],
                     name="Not Diagnosed",
                     marker_colors=['#2ecc71', '#3498db']),
              1, 2)

# Update layout
fig.update_layout(
    title_text="Gender Distribution by Diagnosis Status",
    title_x=0.5,
    width=900,
    height=400,
    showlegend=True
)

fig.show()

# Print the numbers
print("\nGender distribution by diagnosis status:")
print(gender_diagnosis)


Gender distribution by diagnosis status:
Alzheimer’s Diagnosis     No    Yes
Gender                             
Female                 21873  15376
Male                   21697  15337


## Education Level Analysis

A simple horizontal bar chart showing Alzheimer's diagnosis rates for different education levels.

In [11]:
# Calculate diagnosis rate by education level
edu_analysis = df.groupby('Education Level')["Alzheimer’s Diagnosis"].apply(lambda x: (x == 'Yes').mean() * 100).reset_index()
edu_analysis.columns = ['Education Level', 'Diagnosis Rate (%)']

# Sort by diagnosis rate
edu_analysis = edu_analysis.sort_values('Diagnosis Rate (%)', ascending=True)

# Create Plotly horizontal bar chart
fig = px.bar(edu_analysis, 
             y='Education Level', 
             x='Diagnosis Rate (%)',
             orientation='h',
             title='Diagnosis Rate by Education Level',
             color='Diagnosis Rate (%)',
             color_continuous_scale='Viridis')

fig.update_layout(
    title_x=0.5,
    plot_bgcolor='white',
    showlegend=False,
    width=900,
    height=600,
    yaxis_title="Years of Education"
)

fig.update_traces(
    texttemplate='%{x:.1f}%',
    textposition='outside'
)

fig.update_xaxes(title_font=dict(size=12), tickfont=dict(size=10))
fig.update_yaxes(title_font=dict(size=12), tickfont=dict(size=10))

fig.show()

# Print the numbers
print('\nDiagnosis rates by education level:')
print(edu_analysis)


Diagnosis rates by education level:
    Education Level  Diagnosis Rate (%)
4                 4           39.187383
13               13           39.519651
3                 3           40.403769
8                 8           40.613233
16               16           40.874730
12               12           40.911557
18               18           40.930106
2                 2           41.089242
14               14           41.207349
1                 1           41.274458
9                 9           41.480485
0                 0           41.607241
10               10           41.709634
5                 5           41.845319
7                 7           41.900187
15               15           41.918389
17               17           41.929399
6                 6           42.169312
19               19           42.609414
11               11           43.744934


## Geographical Distribution Analysis

Visualizing Alzheimer's diagnosis prevalence across different countries.

In [12]:
# Calculate diagnosis rate by country
country_stats = df.groupby('Country')['Alzheimer’s Diagnosis'].apply(lambda x: (x == 'Yes').mean() * 100).reset_index()
country_stats.columns = ['Country', 'Diagnosis Rate (%)']

# Sort by diagnosis rate
country_stats = country_stats.sort_values('Diagnosis Rate (%)', ascending=False)

# Create Plotly choropleth map
fig = px.choropleth(country_stats,
                    locations='Country',
                    locationmode='country names',
                    color='Diagnosis Rate (%)',
                    title='Alzheimer\'s Diagnosis Rate by Country',
                    color_continuous_scale='Viridis')

fig.update_layout(
    title_x=0.5,
    width=900,
    height=500,
    geo=dict(showframe=False,
             showcoastlines=True,
             projection_type='equirectangular'),
    margin=dict(l=0, r=0, t=30, b=0)
)

fig.show()

# Create a bar chart as well for better visualization
fig_bar = px.bar(country_stats,
                 x='Country',
                 y='Diagnosis Rate (%)',
                 title='Alzheimer\'s Diagnosis Rate by Country',
                 color='Diagnosis Rate (%)',
                 color_continuous_scale='Viridis')

fig_bar.update_layout(
    title_x=0.5,
    plot_bgcolor='white',
    showlegend=False,
    width=900,
    height=500,
    xaxis_tickangle=-45
)

fig_bar.update_traces(
    texttemplate='%{y:.1f}%',
    textposition='outside'
)

fig_bar.show()

# Print the numbers
print('\nDiagnosis rates by country:')
print(country_stats)


Diagnosis rates by country:
         Country  Diagnosis Rate (%)
12        Russia           50.449974
7          India           50.334135
14  South Africa           49.521277
2         Brazil           48.580359
10        Mexico           48.471373
15   South Korea           41.586281
13  Saudi Arabia           41.425451
6        Germany           41.082217
18            UK           40.372501
1      Australia           40.269342
0      Argentina           40.203699
5         France           40.053908
16         Spain           39.994592
8          Italy           39.339420
19           USA           38.855088
4          China           38.641425
11        Norway           34.970318
17        Sweden           34.209813
3         Canada           34.141741
9          Japan           33.937617
