In [1]:
import pandas as pd
from src.simulation import simulate_data
from src.preprocessing import preprocess_dataframe
from src.models import fit_logistic_regression, extract_gender_effect

In [2]:
# Simulator A
df_null = simulate_data(n_samples=5000, bias=False, seed=42)

# Simulator B
df_biased = simulate_data(n_samples=5000, bias=True, seed=43)

df_null.head(), df_biased.head()

(   gender education    occupation  income
 0    Male      high  professional       1
 1    Male       low        manual       0
 2  Female      high        manual       0
 3  Female       low        manual       0
 4    Male    medium        manual       0,
   gender education    occupation  income
 0   Male    medium        manual       0
 1   Male      high  professional       1
 2   Male    medium        manual       0
 3   Male       low       service       0
 4   Male       low        manual       0)

In [3]:
# Preprocessing - sanity check
X_sim, y_sim = preprocess_dataframe(df_null)

X_sim.head(), y_sim.head()

(   gender  education_low  education_medium  occupation_professional  \
 0       0            0.0               0.0                      1.0   
 1       0            1.0               0.0                      0.0   
 2       1            0.0               0.0                      0.0   
 3       1            1.0               0.0                      0.0   
 4       0            0.0               1.0                      0.0   
 
    occupation_service  
 0                 0.0  
 1                 0.0  
 2                 0.0  
 3                 0.0  
 4                 0.0  ,
 0    1
 1    0
 2    0
 3    0
 4    0
 Name: income, dtype: int64)

In [4]:
# Fit model on simulated data
model = fit_logistic_regression(X_sim, y_sim)

# Extract gender effect
gender_info = extract_gender_effect(model)
gender_info

{'coef': np.float64(-0.04302837754710032),
 'p_value': np.float64(0.5119521972702383),
 'ci_lower': -0.17162542935049024,
 'ci_upper': 0.08556867425628961,
 'odds_ratio': np.float64(0.9578842072771334)}

In [5]:
df_null = simulate_data(n_samples=5000, bias=False, seed=0)
X_null, y_null = preprocess_dataframe(df_null)

model_null = fit_logistic_regression(X_null, y_null)
extract_gender_effect(model_null)

{'coef': np.float64(0.0012870549613181181),
 'p_value': np.float64(0.9842280487473706),
 'ci_lower': -0.12631907383293414,
 'ci_upper': 0.12889318375557038,
 'odds_ratio': np.float64(1.0012878835720058)}

In [6]:
df_biased = simulate_data(n_samples=5000, bias=True, seed=0)
X_biased, y_biased = preprocess_dataframe(df_biased)

model_biased = fit_logistic_regression(X_biased, y_biased)
extract_gender_effect(model_biased)

{'coef': np.float64(-0.6874004327015435),
 'p_value': np.float64(1.4512439900071942e-23),
 'ci_lower': -0.8220632494314553,
 'ci_upper': -0.5527376159716317,
 'odds_ratio': np.float64(0.5028816460452682)}

### Simulation-Based Validation

In [7]:
n_simulations = 200 # number of repetitions
n_samples = 1000  # sample size
alpha = 0.05 # significance level

In [8]:
false_positives = 0

for i in range(n_simulations):
    df = simulate_data(n_samples=n_samples, bias=False, seed=i)

    X, y = preprocess_dataframe(df)
    model = fit_logistic_regression(X, y)
    result = extract_gender_effect(model)

    if result["p_value"] < alpha:
        false_positives += 1

type_1_error = false_positives / n_simulations
print("Type I error rate:", type_1_error)

Type I error rate: 0.05


In [9]:
true_positives = 0

for i in range(n_simulations):
    df = simulate_data(n_samples=n_samples, bias=True, seed=i)

    X, y = preprocess_dataframe(df)
    model = fit_logistic_regression(X, y)
    result = extract_gender_effect(model)

    if result["p_value"] < alpha:
        true_positives += 1

power = true_positives / n_simulations
print("Power:", power)

Power: 0.99


In [10]:
# Result table
sample_sizes = [200, 500, 1000, 5000]
n_simulations = 200
alpha = 0.05

In [11]:
results = []

for n_samples in sample_sizes:

    # Type I Error on zero hypothesis
    false_positives = 0

    for i in range(n_simulations):
        df = simulate_data(n_samples=n_samples, bias=False, seed=i)

        X, y = preprocess_dataframe(df)
        model = fit_logistic_regression(X, y)
        result = extract_gender_effect(model)

        if result["p_value"] < alpha:
            false_positives += 1

    type_1_error = false_positives / n_simulations

    # Power on alternative hypothesis
    true_positives = 0

    for i in range(n_simulations):
        df = simulate_data(n_samples=n_samples, bias=True, seed=i)

        X, y = preprocess_dataframe(df)
        model = fit_logistic_regression(X, y)
        result = extract_gender_effect(model)

        if result["p_value"] < alpha:
            true_positives += 1

    power = true_positives / n_simulations

    results.append({"Sample size": n_samples, "Type I error": type_1_error, "Power": power})

In [12]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Sample size,Type I error,Power
0,200,0.045,0.525
1,500,0.055,0.92
2,1000,0.05,0.99
3,5000,0.06,1.0


### Test on real Adult dataset

In [13]:
adult_df = pd.read_csv("data/adult.csv")
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [14]:
adult_df = adult_df[["gender", "education", "occupation", "income"]].dropna()

In [15]:
# Preprocessing
X_adult, y_adult = preprocess_dataframe(adult_df)

X_adult.head(), y_adult.head()

(   gender  education_11th  education_12th  education_1st-4th  \
 0       0             1.0             0.0                0.0   
 1       0             0.0             0.0                0.0   
 2       0             0.0             0.0                0.0   
 3       0             0.0             0.0                0.0   
 4       1             0.0             0.0                0.0   
 
    education_5th-6th  education_7th-8th  education_9th  education_Assoc-acdm  \
 0                0.0                0.0            0.0                   0.0   
 1                0.0                0.0            0.0                   0.0   
 2                0.0                0.0            0.0                   1.0   
 3                0.0                0.0            0.0                   0.0   
 4                0.0                0.0            0.0                   0.0   
 
    education_Assoc-voc  education_Bachelors  ...  occupation_Farming-fishing  \
 0                  0.0                

In [16]:
# Model + Result
adult_model = fit_logistic_regression(X_adult, y_adult)
adult_gender_effect = extract_gender_effect(adult_model)

adult_gender_effect

{'coef': np.float64(-1.3242668727975226),
 'p_value': np.float64(0.0),
 'ci_lower': -1.3862663439897942,
 'ci_upper': -1.262267401605251,
 'odds_ratio': np.float64(0.26599789791748146)}