In [2]:
# All imports for the notebook, kept together in a single block
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# Make the white-background template the default for figures created below
pio.templates.default = "plotly_white"

In [3]:
# Read the dataset from a CSV file located next to the notebook
DATA_PATH = 'alzheimers_prediction_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Bare expression so the notebook renders the frame as the cell output
df

Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,...,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
0,Spain,90,Male,1,33.0,Medium,Never,Occasionally,No,No,...,Healthy,High,Retired,Single,No,Low,Medium,High,Urban,No
1,Argentina,72,Male,7,29.9,Medium,Former,Never,No,No,...,Healthy,Medium,Unemployed,Widowed,No,High,Low,High,Urban,No
2,South Africa,86,Female,19,22.9,High,Current,Occasionally,No,Yes,...,Average,Medium,Employed,Single,No,Low,Medium,High,Rural,No
3,China,53,Male,17,31.2,Low,Never,Regularly,Yes,No,...,Healthy,Medium,Retired,Single,No,High,Medium,Low,Rural,No
4,Sweden,58,Female,3,30.0,High,Former,Never,Yes,No,...,Unhealthy,High,Employed,Married,No,Low,Medium,High,Rural,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74278,Russia,60,Female,3,22.6,High,Former,Never,No,No,...,Average,High,Unemployed,Widowed,No,Medium,High,Medium,Rural,No
74279,UK,58,Male,18,30.6,Low,Never,Occasionally,Yes,No,...,Average,Medium,Unemployed,Single,No,Medium,High,High,Rural,No
74280,Spain,57,Female,13,28.2,Medium,Never,Regularly,No,No,...,Healthy,Low,Employed,Single,Yes,High,Low,Low,Rural,No
74281,Brazil,73,Female,7,29.0,Low,Never,Regularly,No,No,...,Healthy,Low,Employed,Widowed,No,Low,Low,High,Rural,No


In [18]:
# Count individuals for every (smoking status, alcohol consumption, diagnosis)
# combination present in the data.
grouped_data = (
    df.groupby(['Smoking Status', 'Alcohol Consumption', 'Alzheimer’s Diagnosis'])
      .size()
      .reset_index(name='Count')
)

# Grouped bar chart: smoking status on the x-axis, diagnosis as colour and one
# facet per alcohol-consumption category. Without the facet, the rows for the
# different alcohol categories share the same x/colour slot, so the chart would
# not show the "Alcohol Consumption" dimension announced in the title.
fig = px.bar(
    grouped_data,
    x='Smoking Status',
    y='Count',
    color='Alzheimer’s Diagnosis',
    facet_col='Alcohol Consumption',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set2,
    title='Alzheimers Diagnosis by Smoking Status and Alcohol Consumption',
    labels={
        'Count': 'Number of Individuals',
        'Smoking Status': 'Smoking Status',
        'Alzheimer’s Diagnosis': 'Diagnosis',
    },
)

# Centre the title, fix the figure size and pin the legend to the top-right
# corner of the plotting area.
fig.update_layout(
    title_x=0.5,
    plot_bgcolor='white',
    width=900,
    height=500,
    legend=dict(
        title='Alzheimer’s Diagnosis',
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99
    )
)

# Axis styling (applied to every facet)
fig.update_xaxes(
    tickangle=45,
    gridcolor='lightgray'
)
fig.update_yaxes(
    gridcolor='lightgray'
)

# Print each bar's count just above the bar
fig.update_traces(
    texttemplate='%{y}',
    textposition='outside'
)

# Show the plot
fig.show()

In [10]:
# Box plot of BMI split by diagnosis.
# - The `labels` key must match the column name exactly (including the curly
#   apostrophe), otherwise the axis label is silently left unchanged.
# - `category_orders` fixes the No/Yes order from the data values themselves,
#   instead of relabelling tick positions 0/1 by hand (which would mislabel
#   the boxes if the category order ever differed).
# - `boxmode='overlay'` centres each box on its category; in grouped mode a
#   slot is reserved for every colour at every x position.
fig = px.box(
    df,
    x='Alzheimer’s Diagnosis',
    y='BMI',
    color='Alzheimer’s Diagnosis',
    category_orders={'Alzheimer’s Diagnosis': ['No', 'Yes']},
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    boxmode='overlay',
    title='BMI Distribution by Alzheimers Diagnosis',
    labels={'Alzheimer’s Diagnosis': 'Diagnosis'},
)

# The legend would only repeat the x-axis categories, so it is hidden
fig.update_layout(
    title_x=0.5,
    plot_bgcolor='white',
    showlegend=False,
    width=800,
    height=500
)

# Axis styling
fig.update_xaxes(gridcolor='lightgray')
fig.update_yaxes(gridcolor='lightgray')

fig.show()

## Physical Activity Levels and Alzheimer's Analysis

Let's analyze how physical activity levels relate to Alzheimer's diagnosis using a clear visualization that shows both counts and percentages.

In [14]:
# Display order for the activity levels and the colour used for each diagnosis
activity_levels = ['Low', 'Medium', 'High']
diagnosis_colors = {'No': '#2ecc71', 'Yes': '#e74c3c'}

# Contingency table: rows = physical activity level, columns = diagnosis.
# It is built here so the cell runs on a fresh kernel instead of relying on a
# variable left over from another cell; `reindex` puts the rows in
# Low/Medium/High order rather than alphabetical order.
activity_ct = (
    pd.crosstab(df['Physical Activity Level'], df['Alzheimer’s Diagnosis'])
      .reindex(activity_levels)
)

# Row-wise percentages of No/Yes within each activity level, computed in one
# vectorised step. Axis names are dropped so the printed table has plain
# headers.
diagnosis_counts = activity_ct[['No', 'Yes']]
activity_percentages = (
    diagnosis_counts
    .div(diagnosis_counts.sum(axis=1), axis=0)
    .mul(100)
    .rename(columns={'No': 'No (%)', 'Yes': 'Yes (%)'})
    .rename_axis(index=None, columns=None)
)

# Two side-by-side subplots: raw counts (left) and percentages (right)
fig = make_subplots(rows=1, cols=2, 
                    subplot_titles=('Number of Individuals by Physical Activity Level',
                                  'Percentage Distribution by Physical Activity Level'))

# One stacked bar trace per diagnosis in each subplot. Both traces of a
# diagnosis share a legend group, so a single legend entry controls both panels.
for diagnosis, color in diagnosis_colors.items():
    pct_column = f'{diagnosis} (%)'

    # Count plot (left subplot)
    fig.add_trace(
        go.Bar(name=diagnosis, x=activity_ct.index, y=activity_ct[diagnosis],
               text=activity_ct[diagnosis].map('{:,}'.format),
               textposition='inside',
               marker_color=color,
               legendgroup=diagnosis),
        row=1, col=1
    )

    # Percentage plot (right subplot); legend entry already shown on the left
    fig.add_trace(
        go.Bar(name=diagnosis, x=activity_percentages.index, y=activity_percentages[pct_column],
               text=activity_percentages[pct_column].map('{:.1f}%'.format),
               textposition='inside',
               marker_color=color,
               legendgroup=diagnosis,
               showlegend=False),
        row=1, col=2
    )

# Update layout
fig.update_layout(
    title_text="Physical Activity Levels and Alzheimer's Diagnosis Analysis",
    title_x=0.5,
    barmode='stack',
    height=600,
    width=1200,
    showlegend=True,
    legend_title="Alzheimer's Diagnosis",
    plot_bgcolor='white'
)

# Update axes
fig.update_xaxes(title_text="Physical Activity Level", row=1, col=1, gridcolor='lightgray')
fig.update_xaxes(title_text="Physical Activity Level", row=1, col=2, gridcolor='lightgray')
fig.update_yaxes(title_text="Number of Individuals", row=1, col=1, gridcolor='lightgray')
fig.update_yaxes(title_text="Percentage (%)", row=1, col=2, gridcolor='lightgray')

# Show plot
fig.show()

# Print summary statistics
print("\nSummary of Physical Activity Levels and Alzheimer's Diagnosis:")
print("\nRaw Numbers:")
print(activity_ct)
print("\nPercentages:")
print(activity_percentages.round(1))


Summary of Physical Activity Levels and Alzheimer's Diagnosis:

Raw Numbers:
Alzheimer’s Diagnosis       No    Yes
Physical Activity Level              
Low                      14536  10218
Medium                   14509  10167
High                     14525  10328

Percentages:
        No (%)  Yes (%)
Low       58.7     41.3
Medium    58.8     41.2
High      58.4     41.6


### Key Observations:

1. **Distribution Pattern**:
   - Each activity level (Low, Medium, High) shows similar patterns
   - Approximately 58-59% of people in each activity level do not have Alzheimer's
   - About 41-42% of people in each activity level have Alzheimer's

2. **Activity Level Comparison**:
   - Low Activity: 24,754 individuals in total
   - Medium Activity: 24,676 individuals in total
   - High Activity: 24,853 individuals in total

3. **Important Note**:
   - The similar distribution across all activity levels suggests that physical activity alone
     may not be a determining factor for Alzheimer's diagnosis in this dataset
   - Other factors like age, genetics, or overall health might play more significant roles

## Multi-Factor Analysis: Physical Activity, Age, and Health Indicators

Let's analyze how physical activity interacts with other factors in relation to Alzheimer's diagnosis.

In [16]:
# Display order for the activity levels and the colour used for each diagnosis
activity_levels = ['Low', 'Medium', 'High']
diagnosis_colors = {'No': '#2ecc71', 'Yes': '#e74c3c'}
age_labels = ['<50', '50-60', '60-70', '70-80', '80-90', '90+']

# Age groups use left-closed bins so every label matches its interval:
# '<50' is [0, 50), '50-60' is [50, 60), ... and '90+' is [90, inf).
# With pd.cut's default right-closed bins, age 50 would land in '<50' and
# age 90 in '80-90'; the open upper edge also keeps ages above 100 in '90+'
# instead of turning them into NaN.
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=[0, 50, 60, 70, 80, 90, float('inf')],
    labels=age_labels,
    right=False,
)

# Layout: two box-plot panels on the top row, one full-width bar chart below
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Age Distribution by Physical Activity Level and Diagnosis',
                   'BMI Distribution by Physical Activity Level and Diagnosis',
                   'Alzheimer\'s Diagnosis Rate by Age Group and Physical Activity Level'),
    specs=[[{}, {}],
           [{"colspan": 2}, None]],
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

# 1 & 2. Age and BMI distributions by physical activity and diagnosis.
# One box trace per diagnosis and panel, with the activity level on the x-axis.
# Combined with boxmode='group' in the layout below, the No/Yes boxes are drawn
# side by side within each activity level instead of on top of each other.
box_panels = [('Age', 1), ('BMI', 2)]
for value_column, panel_col in box_panels:
    for diagnosis, color in diagnosis_colors.items():
        subset = df[(df['Alzheimer’s Diagnosis'] == diagnosis) &
                    (df['Physical Activity Level'].isin(activity_levels))]

        fig.add_trace(
            go.Box(x=subset['Physical Activity Level'],
                   y=subset[value_column],
                   name=diagnosis,
                   legendgroup=diagnosis,
                   offsetgroup=diagnosis,
                   # one legend entry per diagnosis: only the first panel adds it
                   showlegend=(panel_col == 1),
                   marker_color=color),
            row=1, col=panel_col
        )

# 3. Diagnosis rate (%) by age group and physical activity.
# The mean of the boolean "is Yes" series is the diagnosis rate.
# observed=True keeps only the age-group/activity combinations that occur in
# the data, so empty age groups do not produce NaN bars.
diagnosis_by_age_activity = (
    df['Alzheimer’s Diagnosis'].eq('Yes')
      .groupby([df['Age_Group'], df['Physical Activity Level']], observed=True)
      .mean()
      .mul(100)
      .reset_index()
)

colors = px.colors.qualitative.Set2
for i, activity in enumerate(activity_levels):
    activity_data = diagnosis_by_age_activity[diagnosis_by_age_activity['Physical Activity Level'] == activity]
    
    fig.add_trace(
        go.Bar(x=activity_data['Age_Group'],
               y=activity_data['Alzheimer’s Diagnosis'],
               name=activity,
               text=activity_data['Alzheimer’s Diagnosis'].round(1).astype(str) + '%',
               textposition='outside',
               marker_color=colors[i]),
        row=2, col=1
    )

# Update layout and formatting
fig.update_layout(
    title_text='Multi-Factor Analysis of Alzheimer\'s Diagnosis',
    title_x=0.5,
    height=900,
    width=1200,
    showlegend=True,
    legend_title='Diagnosis Status',
    barmode='group',
    boxmode='group',
    plot_bgcolor='white'
)

# Update axes labels and formatting; category order is pinned so the panels
# always read Low -> Medium -> High and youngest -> oldest.
fig.update_xaxes(title_text='Physical Activity Level', row=1, col=1, gridcolor='lightgray',
                 categoryorder='array', categoryarray=activity_levels)
fig.update_xaxes(title_text='Physical Activity Level', row=1, col=2, gridcolor='lightgray',
                 categoryorder='array', categoryarray=activity_levels)
fig.update_xaxes(title_text='Age Group', row=2, col=1, gridcolor='lightgray',
                 categoryorder='array', categoryarray=age_labels)

fig.update_yaxes(title_text='Age', row=1, col=1, gridcolor='lightgray')
fig.update_yaxes(title_text='BMI', row=1, col=2, gridcolor='lightgray')
fig.update_yaxes(title_text='Diagnosis Rate (%)', row=2, col=1, gridcolor='lightgray')

# Show the plot
fig.show()

# Print statistical summary
print('\nStatistical Summary by Physical Activity Level:')
for level in activity_levels:
    print(f"\n{level} Activity Level:")
    subset = df[df['Physical Activity Level'] == level]
    print(f"Average Age: {subset['Age'].mean():.1f} years")
    print(f"Average BMI: {subset['BMI'].mean():.1f}")
    print(f"Diagnosis Rate: {(subset['Alzheimer’s Diagnosis'] == 'Yes').mean() * 100:.1f}%")






Statistical Summary by Physical Activity Level:

Low Activity Level:
Average Age: 71.9 years
Average BMI: 26.7
Diagnosis Rate: 41.3%

Medium Activity Level:
Average Age: 72.0 years
Average BMI: 26.8
Diagnosis Rate: 41.2%

High Activity Level:
Average Age: 72.0 years
Average BMI: 26.8
Diagnosis Rate: 41.6%


### Key Findings from Multi-Factor Analysis:

1. **Age Impact**:
   - Age shows a stronger correlation with Alzheimer's diagnosis than physical activity
   - The diagnosis rate increases significantly with age across all activity levels

2. **Physical Activity and Age Interaction**:
   - Within each age group, physical activity levels show minimal variation in diagnosis rates
   - The effect of age appears to be more dominant than physical activity

3. **BMI Patterns**:
   - BMI distributions show slight variations across activity levels
   - The relationship between BMI and diagnosis appears less pronounced than age

4. **Overall Pattern**:
   - Age emerges as the strongest predictor among these factors
   - Physical activity alone may not significantly influence Alzheimer's risk
   - A more holistic approach considering multiple factors may be needed for risk assessment