# **Diabetes Dataset Exploratory Data Analysis** 
## Notebook step Overview  
1. Load the Cleaned Dataset
2. Dataset Overview 
3. Univariate Analysis 
4. Feature Distributions by Outcome   
5. Bivariate Analysis 
6. Interactive Visualizations     
7. Age Group Analysis 
8. BMI Analysis 
9. Glucose Analysis  
10. Conclusion

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

# Set styling for static plots
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
sns.set_context("notebook", font_scale=1.2)
sns.set_style("whitegrid")

In [2]:
# 1. Load the Cleaned Dataset
# -----------------------------------------------------

print("Loading cleaned diabetes dataset...")
df = pd.read_csv('C:/Users/hp/Desktop/diabetes-analysis-project/outputs/cleaned_data.csv')

Loading cleaned diabetes dataset...


In [3]:
# 2. Dataset Overview
# -----------------------------------------------------

print("\nDataset Overview:")
print(f"Dataset Shape: {df.shape}")
df.head()


Dataset Overview:
Dataset Shape: (989, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,125.0,96.0,27.0,95.0,22.5,0.262,21,0
1,0,141.0,74.0,32.0,175.0,42.4,0.205,29,1
2,1,115.0,70.0,29.0,96.0,34.503341,0.520052,32,1
3,10,101.0,86.0,37.0,175.0,45.6,1.136,38,1
4,1,96.0,122.0,27.0,95.0,22.4,0.207,27,0


In [5]:
# 3. Univariate Analysis

print("\nUnivariate Analysis:")

# Distribution of target variable
outcome_counts = df['Outcome'].value_counts()
outcome_percentage = outcome_counts / len(df) * 100

print("Target Variable Distribution:")
print(f"Class 0 (No Diabetes): {outcome_counts[0]} records ({outcome_percentage[0]:.2f}%)")
print(f"Class 1 (Diabetes): {outcome_counts[1]} records ({outcome_percentage[1]:.2f}%)")

# Plot target distribution
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Outcome', data=df, palette=['#3498db', '#e74c3c'])
plt.title('Distribution of Diabetes Outcome', fontsize=16)
plt.ylabel('Count', fontsize=14)
plt.xlabel('Outcome (0: No Diabetes, 1: Diabetes)', fontsize=14)

# Add count and percentage annotations
for i, p in enumerate(ax.patches):
    height = p.get_height()
    ax.annotate(f'{int(height)}\n({outcome_percentage[i]:.1f}%)',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=12)

plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/target_distribution.png')
plt.close()


Univariate Analysis:
Target Variable Distribution:
Class 0 (No Diabetes): 494 records (49.95%)
Class 1 (Diabetes): 495 records (50.05%)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.countplot(x='Outcome', data=df, palette=['#3498db', '#e74c3c'])


In [6]:
# 4. Feature Distributions by Outcome

# Create violin plots for each feature by outcome
features = df.columns.drop('Outcome')

plt.figure(figsize=(16, 20))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i+1)
    sns.violinplot(x='Outcome', y=feature, data=df, palette=['#3498db', '#e74c3c'])
    plt.title(f'{feature} by Diabetes Outcome', fontsize=14)
    plt.xlabel('Outcome (0: No Diabetes, 1: Diabetes)', fontsize=12)
    plt.tight_layout()

plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/feature_distributions_by_outcome.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Outcome', y=feature, data=df, palette=['#3498db', '#e74c3c'])

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Outcome', y=feature, data=df, palette=['#3498db', '#e74c3c'])

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Outcome', y=feature, data=df, palette=['#3498db', '#e74c3c'])

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Outcome', y=feature, data=df, palette=['#3498db', '#e74c3c'])

Pas

In [7]:
# 5. Bivariate Analysis

print("\n Bivariate Analysis:")

# Correlation heatmap
plt.figure(figsize=(12, 10))
corr = df.corr()
mask = np.triu(corr)
heatmap = sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', 
                      mask=mask, linewidths=.5, cbar_kws={"shrink": .8})
plt.title('Feature Correlation Heatmap', fontsize=16)
plt.tight_layout()
plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/correlation_heatmap.png')
plt.close()

# Pair plot for key variables
key_vars = ['Glucose', 'BMI', 'Age', 'Insulin', 'Outcome']
pair_plot = sns.pairplot(df[key_vars], hue='Outcome', palette=['#3498db', '#e74c3c'])
plt.suptitle('Pair Plot of Key Variables', y=1.02, fontsize=16)
pair_plot.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/pair_plot.png')
plt.close()


 Bivariate Analysis:


In [10]:
# 6. Interactive Visualizations  

print("\nCreating Interactive Visualizations:")

# Scatter plot matrix with Plotly
def create_interactive_scatter():
    features = ['Glucose', 'BMI', 'Age', 'BloodPressure', 'Insulin']
    fig = px.scatter_matrix(
        df, 
        dimensions=features,
        color='Outcome',
        color_discrete_map={0: '#3498db', 1: '#e74c3c'},
        title="Interactive Scatter Plot Matrix",
        labels={col: col for col in features},
        height=800, width=800
    )
    fig.update_layout(
        title_font_size=20,
        font=dict(size=12)
    )
    fig.write_html('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/interactive/scatter_matrix.html')

# Create interactive box plots
def create_interactive_boxplots():
    features = df.columns.drop('Outcome')
    fig = make_subplots(rows=4, cols=2, subplot_titles=features)
    
    row, col = 1, 1
    for feature in features:
        fig.add_trace(
            go.Box(x=df['Outcome'].astype(str), y=df[feature], name=feature,
                  marker_color='blue' if feature != 'Outcome' else 'yellow'),
            row=row, col=col
        )
        
        if col == 2:
            col = 1
            row += 1
        else:
            col += 1
    
    fig.update_layout(
        height=1200,
        width=900,
        title_text="Feature Distributions by Diabetes Outcome",
        showlegend=False
    )
    fig.write_html('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/interactive/boxplots_by_outcome.html')

# Create radar chart for feature comparison between outcome groups
def create_radar_chart():
    # Calculate mean values for each feature by outcome
    features = df.columns.drop('Outcome')
    mean_by_outcome = df.groupby('Outcome')[features].mean()
    
    # Normalize the data for radar chart
    normalized = (mean_by_outcome - mean_by_outcome.min()) / (mean_by_outcome.max() - mean_by_outcome.min())
    
    # Create radar chart
    fig = go.Figure()
    
    # Add radar chart for non-diabetic (Outcome=0)
    fig.add_trace(go.Scatterpolar(
        r=normalized.iloc[0].values,
        theta=features,
        fill='toself',
        name='No Diabetes',
        line_color='#3498db'
    ))
    
    # Add radar chart for diabetic (Outcome=1)
    fig.add_trace(go.Scatterpolar(
        r=normalized.iloc[1].values,
        theta=features,
        fill='toself',
        name='Diabetes',
        line_color='#e74c3c'
    ))
    
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )),
        showlegend=True,
        title='Normalized Feature Comparison by Diabetes Outcome',
        height=600,
        width=800
    )
    
    fig.write_html('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/interactive/radar_chart.html')

# Generate all interactive visualizations
create_interactive_scatter()
create_interactive_boxplots()
create_radar_chart()

print("Interactive visualizations saved to '../../visuals/interactive/' directory")


Creating Interactive Visualizations:
Interactive visualizations saved to '../../visuals/interactive/' directory


In [11]:
# 7. Age Group Analysis

# Create age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60, 100], labels=['20-30', '30-40', '40-50', '50-60', '60+'])

# Calculate diabetes prevalence by age group
age_prevalence = df.groupby('AgeGroup')['Outcome'].mean() * 100

plt.figure(figsize=(12, 6))
sns.barplot(x=age_prevalence.index, y=age_prevalence.values, palette='viridis')
plt.title('Diabetes Prevalence by Age Group', fontsize=16)
plt.xlabel('Age Group', fontsize=14)
plt.ylabel('Diabetes Prevalence (%)', fontsize=14)
plt.tight_layout()
plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/age_group_analysis.png')
plt.close()






Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




In [12]:
# 8. BMI Analysis

# Create BMI categories
df['BMI_Category'] = pd.cut(df['BMI'], 
                           bins=[0, 18.5, 25, 30, 100], 
                           labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

bmi_prevalence = df.groupby('BMI_Category')['Outcome'].mean() * 100
bmi_counts = df.groupby('BMI_Category')['Outcome'].count()

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=bmi_prevalence.index, y=bmi_prevalence.values, palette='viridis')
plt.title('Diabetes Prevalence by BMI Category', fontsize=16)
plt.xlabel('BMI Category', fontsize=14)
plt.ylabel('Diabetes Prevalence (%)', fontsize=14)

# Add count annotations
for i, (prev, count) in enumerate(zip(bmi_prevalence, bmi_counts)):
    ax.annotate(f'n={count}\n{prev:.1f}%', 
                (i, prev + 2), 
                ha='center', 
                fontsize=12)

plt.tight_layout()
plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/bmi_analysis.png')
plt.close() 








Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




In [13]:
# 9. Glucose Level Analysis

# Create glucose level categories
df['Glucose_Category'] = pd.cut(df['Glucose'], 
                               bins=[0, 70, 100, 126, 300], 
                               labels=['Low', 'Normal', 'Prediabetes', 'Diabetes'])

glucose_prevalence = df.groupby('Glucose_Category')['Outcome'].mean() * 100
glucose_counts = df.groupby('Glucose_Category')['Outcome'].count()

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=glucose_prevalence.index, y=glucose_prevalence.values, palette='viridis')
plt.title('Diabetes Prevalence by Glucose Level', fontsize=16)
plt.xlabel('Glucose Level Category', fontsize=14)
plt.ylabel('Diabetes Prevalence (%)', fontsize=14)

# Add count annotations
for i, (prev, count) in enumerate(zip(glucose_prevalence, glucose_counts)):
    ax.annotate(f'n={count}\n{prev:.1f}%', 
                (i, prev + 2), 
                ha='center', 
                fontsize=12)

plt.tight_layout()
plt.savefig('C:/Users/hp/Desktop/diabetes-analysis-project/visuals/static/glucose_analysis.png')
plt.close()








Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.




# 10. EDA Summary

### 📝 EDA Summary:
1. The dataset contains approximately 35% diabetic cases and 65% non-diabetic cases.
2. Glucose levels show the strongest correlation with diabetes outcome.
3. BMI, Age, and Insulin also show notable correlations with diabetes.
4. Glucose, BMI, and Age distributions show clear differences between diabetic and non-diabetic groups.
5. Higher age groups show increasing diabetes prevalence.
6. Higher BMI categories are associated with higher diabetes prevalence.
7. Glucose levels show expected strong association with diabetes diagnosis.
