In [1]:
import numpy as np, pandas as pd
from statsmodels.stats.proportion import proportions_ztest

rng = np.random.default_rng(7)
N = 10_000

segment = rng.choice(['free','basic','pro'], size=N, p=[0.5, 0.3, 0.2])
region  = rng.choice(['NE','SE','MW','W'], size=N, p=[0.2, 0.3, 0.3, 0.2])
age     = rng.normal(30, 8, N).clip(18, 65)

# True pass probability has real segment/region effects (creates risk of bias)
base = 0.40 + (segment=='pro')*0.12 + (segment=='basic')*0.05 + (region=='W')*0.03 + (age>40)*(-0.02)
p = np.clip(base, 0.05, 0.95)
passed = rng.binomial(1, p, N).astype(bool)

df_pop = pd.DataFrame({'segment':segment, 'region':region, 'age':age, 'passed':passed})
pop_rates = df_pop['passed'].mean()
pop_mix   = df_pop['segment'].value_counts(normalize=True).rename('pop_share').sort_index()
pop_mix_reg = df_pop['region'].value_counts(normalize=True).rename('pop_share').sort_index()

print(f"Population pass rate: {pop_rates:.3f}")
pop_mix

Population pass rate: 0.444


segment
basic    0.2964
free     0.4983
pro      0.2053
Name: pop_share, dtype: float64

### 1. Draw a proportional stratified sample of size n = 600

In [2]:
n = 600
# Perform a value_counts with normalize = TRUE on the "segment" column 
draws_valuecounts = df_pop['segment'].value_counts(normalize=True)

# Translates proportions to add up to the sample count -- feel free to print this var out 
draws = (draws_valuecounts * n).round().astype(int)

sample_strat = []
for seg, k in draws.items():
    #None should be the key identified in the for statement 
    sample_strat.append(df_pop[df_pop['segment']==seg].sample(k, random_state=7))
df_strat = pd.concat(sample_strat, ignore_index=True)

### 2.	Representation check (sample vs population shares by segment)

In [3]:
strat_mix = df_strat['segment'].value_counts(normalize=True).rename('sample_share').sort_index()
rep_check = pd.concat([pop_mix, strat_mix], axis=1)

#Should be the sample share minus the pop share within the 'rep_check' dataframe -- don't forget to use the .abs() at the end of the formula
rep_check['abs_diff'] = rep_check['sample_share'].abs() - rep_check['pop_share'].abs()
rep_check

Unnamed: 0_level_0,pop_share,sample_share,abs_diff
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
basic,0.2964,0.296667,0.000267
free,0.4983,0.498333,3.3e-05
pro,0.2053,0.205,-0.0003


### 3. Describe (K5): mean/SD of age, pass rate

In [4]:
desc = df_strat.agg(age_mean=('age','mean'),
                    age_sd=('age','std'),
                    pass_rate=('passed','mean'))
desc


Unnamed: 0,age,passed
age_mean,29.969704,
age_sd,7.429717,
pass_rate,,0.421667


### 4. Quick hypothesis test: Does your sample’s pass rate differ from the population?(One-sample proportion z-test vs pop mean; approximate but intuitive.)

In [5]:
count = df_strat['passed'].sum()
nobs  = len(df_strat)
# H0: p_sample = p_pop  vs  H1: p_sample != p_pop
#Use the count and nobs vars above to fill in Nones
stat, pval = proportions_ztest(count=count, nobs=nobs, value=pop_rates)
print(f"z={stat:.2f}, p={pval:.4f}")

if pval < 0.05:
    print("We have enough evidence to reject the null hypothesis, the sample pass rate is truly different from the population")
else: 
    print("We fail to reject the null hypothesis, the sample pass rate is the same as the population.")


z=-1.12, p=0.2637
We fail to reject the null hypothesis, the sample pass rate is the same as the population.


## Bias & Ethics write-up
- Where could sampling bias arise here and why (hint: region affects true pass probability in the population-generating process)?
    - Sampling bias could arise if the stratified sampling is done incorrectly, for example, if the samples are not properly defined and do not reflect the true population. In this scenario, for example, segment, age, and pass are different in each region. If we stratify a sample and take 50% from each, we could introduce bias. 
- One mitigation you would implement (weighting regions back to population shares or a stratified-cluster design).
    - One mitigation I'd implement is applying post-stratification weighting, adjusting the sample's data so that each region is represented in proportion to its actual population size.
- One fairness metric you would monitor across segments in your sample vs population (e.g., disparate impact ratio = min(group_rate)/max(group_rate) or demographic parity difference). Flag if ratio < 0.80.
    - One metric I'd monitor would be the rate for the true population and for the sample, and see how they behave and change across time. 
- One product decision risk if you deploy insights from a biased sample (tie to business/user impact).
   - Any insight derived from a biased sample would be cost-effective for businesses, as they will spend money on the wrong feature and lose revenues. Also, it will impact users and underrepresented groups and lead to dissatisfaction or disengagement.