In [17]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [18]:
diabetes = pd.read_csv('https://raw.githubusercontent.com/RoobyDoobyDoo/CS5530-Assign2/refs/heads/main/Diabetes/diabetes.csv')
display(diabetes)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [19]:
diabetes.info()
diabetes.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


A) Sample observations
- Set a seed
- Take a random sample of 25 observations
- Find the mean Glucose and highest Glucose values
- Compare these stats with the population statistics (use charts)

In [20]:
# Take a random sample using a seed
np.random.seed(42)
samp = diabetes.sample(n=25)
display(samp)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
668,6,98,58,33,190,34.0,0.43,43,0
324,2,112,75,32,0,35.7,0.148,21,0
624,2,108,64,0,0,30.8,0.158,21,0
690,8,107,80,0,0,24.6,0.856,34,0
473,7,136,90,0,0,29.9,0.21,50,0
204,6,103,72,32,190,37.7,0.324,55,0
97,1,71,48,18,76,20.4,0.323,22,0
336,0,117,0,0,0,33.8,0.932,44,0
568,4,154,72,29,126,31.3,0.338,37,0
148,5,147,78,0,0,33.7,0.218,65,0


In [21]:
# Mean and highest Glucose
samp.agg({'Glucose': ['mean', 'max']})

Unnamed: 0,Glucose
mean,116.64
max,183.0


In [22]:
# Need to get the data in a nice df first
datasets = {'Population': diabetes, 'Sample': samp}

# Make list to store df data
comp_list = []

# Calculate our data and put it in the list
for type_name, df in datasets.items():
    stats = df['Glucose'].agg(['mean', 'max'])
    stats = stats.rename(type_name)              # Cols need labeling
    comp_list.append(stats)

# Organize the data we just added
comp = pd.concat(comp_list, axis=1)

# Make it plotly-friendly
comp = (comp.reset_index()
    .melt(id_vars='index', var_name='Type', value_name='Glucose')
    .rename(columns={'index': 'Statistic'}))

# Compare with charts
fig = px.bar(comp, x='Statistic', y='Glucose', color='Type', barmode='group',
             title='Glucose Statistics: Sample vs. Population',
             width=500, height=500)
fig.show()

B) Find the 98th percentile of BMI of the sample and population and compare using charts

In [23]:
# Do similar as before
bmi_list = []

for type_name, df in datasets.items():
    bmi_98 = df['BMI'].quantile(0.98)
    bmi = pd.Series({'98th Percentile': bmi_98}, name=type_name)
    bmi_list.append(bmi)

comp_bmi = pd.concat(bmi_list, axis=1)

comp_bmi = (comp_bmi.reset_index()
    .melt(id_vars='index', var_name='Type', value_name='Value')
    .rename(columns={'index': 'Statistic'}))

fig = px.bar(comp_bmi, x='Statistic', y='Value', color='Type', barmode='group',
             title='Comparison of BMI 98th Percentile: Sample vs Population',
             width=500, height=500
    )
fig.show()


C) Use bootstraping to create 500 camples of 150 each from the population and find:
- the average mean
- standard deviation
- percentile for BloodPressure

Compare this with same statistics from the population for the same variable.

In [24]:
# Make a new list to not affect original population
pop = diabetes.copy()

# A list to keep them all in
bootstrap_samples = []

# Create the 500 samples
for i in range(500):
    sample = pop.sample(n=150, replace=True)
    bootstrap_samples.append(sample)

# A list to keep our stats
bp_list = []

# Retrieve the stats
for sample in bootstrap_samples:
    bpmean = sample['BloodPressure'].mean()
    bpstd = sample['BloodPressure'].std()
    # I assume we want all the usual percentiles, not just 98
    bp25 = sample['BloodPressure'].quantile(0.25)
    bp50 = sample['BloodPressure'].quantile(0.50)
    bp75 = sample['BloodPressure'].quantile(0.75)
    bp98 = sample['BloodPressure'].quantile(0.98)
    bp_list.append([bpmean, bpstd, bp25, bp50, bp75, bp98])

bp_df = pd.DataFrame(bp_list, columns=['Mean', 'Std', '25%', '50%', '75%', '98%'])

# Prepping it for the next part
bp_df['Type'] = 'Bootstrap'

bp_df.head()

Unnamed: 0,Mean,Std,25%,50%,75%,98%,Type
0,70.293333,19.502726,60.5,72.0,80.0,108.12,Bootstrap
1,70.106667,17.969828,62.0,71.0,80.0,106.08,Bootstrap
2,70.46,20.504912,64.0,74.0,82.0,100.16,Bootstrap
3,70.686667,16.339164,64.0,72.0,80.0,94.24,Bootstrap
4,69.38,19.021948,64.0,72.0,79.5,94.08,Bootstrap


In [25]:
# Get the population's stats
pop = diabetes['BloodPressure']
pop_stats = {'Mean': pop.mean(),
             'Std': pop.std(),
             '25%': pop.quantile(0.25),
             '50%': pop.quantile(0.50),
             '75%': pop.quantile(0.75),
             '98%': pop.quantile(0.98)}

pop_df = pd.DataFrame([pop_stats])
pop_df['Type'] = 'Population'
pop_df.head()

Unnamed: 0,Mean,Std,25%,50%,75%,98%,Type
0,69.105469,19.355807,62.0,72.0,80.0,99.32,Population


In [26]:
# Combine them
comp_bp = pd.concat([bp_df, pop_df], ignore_index=True)
comp_bp.tail()

Unnamed: 0,Mean,Std,25%,50%,75%,98%,Type
496,69.006667,19.185949,62.0,72.0,80.0,94.12,Bootstrap
497,71.806667,17.941253,65.25,72.0,80.0,106.0,Bootstrap
498,69.586667,21.567184,64.0,72.0,80.0,104.08,Bootstrap
499,72.086667,16.089905,66.0,74.0,82.0,100.04,Bootstrap
500,69.105469,19.355807,62.0,72.0,80.0,99.32,Population


In [27]:
# Boxplots
for col in comp_bp.columns[:-1]:
  fig = px.box(comp_bp, x='Type', y=col,
               title=f'Comparison of {col}: Bootstrap vs Population',
               width=500, height=500)
  fig.show()

The mean and standard deviation are close but the percentiles varied quite a bit, not that much of a surprise though. Although the population data point was almost always in or by the edges of the box or (25% of the median) of the bootstrap samples.

In [28]:
# Density map
metrics = ['Mean', 'Std', '25%', '50%', '75%', '98%']

for metric in metrics:
    fig = px.histogram(
        bp_df,
        x=metric,
        nbins=30,
        title=f"Bootstrap Distribution of {metric} vs Population",
        width=800, height=500)

    # Add population line
    pop_val = pop_df.iloc[0][metric]
    fig.add_vline(
        x=pop_val,
        line_dash="dash",)

    fig.update_layout(
        xaxis_title='Blood Pressure',
        yaxis_title="Count")

    fig.show()

The bootstrap samples reflected the population data pretty well, even most of the percentile data had something that related to the population data point (like the overlapping amount of counts where the population line was).

We do see a lot more variation in the bootstrap data, but that's a given since that's what its purpose is.

In [29]:
combined_long = comp_bp.melt(
    id_vars='Type',
    value_vars=['Mean','Std','25%','50%','75%','98%'],
    var_name='Statistic',
    value_name='Value')
combined_long.head()

fig = px.box(
    combined_long[combined_long['Type'] == 'Bootstrap'],
    x='Statistic',
    y='Value',
    title='Bootstrap Distributions with Population Values',
    width=800, height=500)

# Add population points
for metric in metrics:
    pop_val = pop_df.iloc[0][metric]
    fig.add_trace(go.Scatter(
        x=[metric],
        y=[pop_val],
        mode='markers',
        marker=dict(size=10),
        name=f"Population - {metric}",
        showlegend=True))

fig.update_layout(showlegend=False)
fig.show()


Overall, looks like a pretty successful test with sampling the data.