<a href="https://colab.research.google.com/github/Vaidehi-9/ECON3916-33674-Statistical-Machine-Learning/blob/main/Lab_9/%20Lab_9_The_Architecture_of_Control.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import scipy.stats as stats

import io  # only needed here, to wrap the uploaded bytes for pandas

# files.upload() opens the Colab file picker and returns the uploaded content
# keyed by file name.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; expected lalonde.csv")

# Parse the bytes that were just uploaded instead of the fixed path
# 'lalonde.csv'. When a file with that name already exists in the runtime,
# Colab stores the new upload under another name (e.g. 'lalonde (1).csv'),
# so reading the fixed path would silently load the older copy.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

Saving lalonde.csv to lalonde (1).csv


In [13]:
# Naive comparison: difference in mean re78 between the treat == 1 and
# treat == 0 rows, with no adjustment for any covariate.
is_treated = df['treat'] == 1
is_control = df['treat'] == 0
naive_diff = df.loc[is_treated, 're78'].mean() - df.loc[is_control, 're78'].mean()
print(f"Naive Difference in Means: ${naive_diff:,.2f}")
# Expected Result: -$635.03

Naive Difference in Means: $-635.03


In [14]:
# Covariates used as predictors in the propensity model (and reused for the
# balance checks).
covariate_cols = [
    'age', 'educ', 'black', 'hispan',
    'married', 'nodegree', 're74', 're75',
]

X = df[covariate_cols]
y = df['treat']

# Logistic regression of treatment status on the covariates; fit() returns
# the fitted estimator itself.
logit = LogisticRegression(solver='liblinear', max_iter=1000).fit(X, y)

# Second column of predict_proba = probability of the second class
# (assumes treat is coded 0/1, so this is P(treat = 1 | X)).
df['pscore'] = logit.predict_proba(X)[:, 1]

# Distribution of the estimated scores within each treatment group.
print("\nPropensity Score Summary:")
pscore_summary = df.groupby('treat')['pscore'].describe()
print(pscore_summary.round(4))


Propensity Score Summary:
       count    mean     std     min     25%     50%     75%     max
treat                                                               
0      429.0  0.1967  0.2262  0.0090  0.0540  0.1072  0.1688  0.7781
1      185.0  0.5619  0.2118  0.0402  0.4753  0.6693  0.7068  0.7559


In [15]:
# Split by treatment status; copies keep later edits from touching df.
treated = df.loc[df['treat'] == 1].copy()
control = df.loc[df['treat'] == 0].copy()

# Index the control units on their propensity score.
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(control[['pscore']])

# For every treated unit, find the single closest control score.
distances, indices = nbrs.kneighbors(treated[['pscore']])

# `indices` holds row positions within `control`, one column per neighbour.
# Nothing removes a control once it is chosen, so the same control row can be
# selected for several treated units.
matched_control = control.iloc[indices.ravel()].copy()

# Stack treated units and their matched controls under a fresh 0..n-1 index.
matched_df = pd.concat([treated, matched_control], ignore_index=True)
print(f"\nMatched dataset size: {len(matched_df)} "
      f"({len(treated)} treated + {len(matched_control)} matched controls)")


Matched dataset size: 370 (185 treated + 185 matched controls)


In [10]:
def _abs_smd(frame, col):
    """Absolute standardized mean difference of `col` between treat == 1 and treat == 0 rows."""
    t = frame.loc[frame["treat"] == 1, col]
    c = frame.loc[frame["treat"] == 0, col]
    mean_gap = t.mean() - c.mean()
    # Pooled SD: square root of the average of the two sample variances.
    pooled_sd = np.sqrt((t.var() + c.var()) / 2)
    if pooled_sd == 0:
        # Both groups are constant. Identical constants are perfectly
        # balanced (0.0) rather than an undefined 0/0; different constants
        # are infinitely far apart in SD units.
        return 0.0 if mean_gap == 0 else float("inf")
    return abs(mean_gap / pooled_sd)


def smd(col, df_full, df_matched):
    """Compute |SMD| before and after matching for a single covariate.

    The standardized mean difference is the treated-minus-control difference
    in means divided by the pooled standard deviation,
    sqrt((var_treated + var_control) / 2).

    Parameters
    ----------
    col : str
        Name of the covariate column to assess.
    df_full : pandas.DataFrame
        Full sample; must contain `col` and a `treat` column coded 1/0.
    df_matched : pandas.DataFrame
        Matched sample with the same columns.

    Returns
    -------
    tuple of float
        (|SMD| in `df_full`, |SMD| in `df_matched`). A covariate that is
        constant and equal in both groups yields 0.0 instead of NaN.
    """
    return _abs_smd(df_full, col), _abs_smd(df_matched, col)

# Balance table: one row per covariate with |SMD| before and after matching.
print("\n── Balance Table (|SMD|) ──")
print(f"{'Covariate':<12} {'Before':>8} {'After':>8} {'Pass?':>6}")
print("-" * 38)
for col in covariate_cols:
    before, after = smd(col, df, matched_df)
    # A covariate passes when its post-matching |SMD| is below 0.1.
    if after < 0.1:
        flag = "✓"
    else:
        flag = "✗"
    print(f"{col:<12} {before:>8.4f} {after:>8.4f} {flag:>6}")


── Balance Table (|SMD|) ──
Covariate      Before    After  Pass?
--------------------------------------
age            0.2419   0.2613      ✗
educ           0.0448   0.0272      ✓
black          1.6677   0.0147      ✓
hispan         0.2769   0.1290      ✗
married        0.7195   0.1312      ✗
nodegree       0.2350   0.2058      ✗
re74           0.5958   0.0495      ✓
re75           0.2870   0.0804      ✓


In [16]:
# Raw (unmatched): difference in mean re78 between all treated and all
# control units, with a two-sample t-test at scipy's default settings.
raw_treated_y = treated['re78']
raw_control_y = control['re78']
diff_raw = raw_treated_y.mean() - raw_control_y.mean()
t_raw, p_raw = stats.ttest_ind(raw_treated_y, raw_control_y)

# Matched: the same comparison restricted to the matched dataset.
# NOTE(review): ttest_ind treats the two groups as independent samples —
# confirm this is the intended test for a matched design.
matched_treated_y = matched_df.loc[matched_df['treat'] == 1, 're78']
matched_control_y = matched_df.loc[matched_df['treat'] == 0, 're78']
matched_diff = matched_treated_y.mean() - matched_control_y.mean()
t_stat, p_val = stats.ttest_ind(matched_treated_y, matched_control_y)

# Side-by-side report of both estimates and the hard-coded benchmark figure.
print("\n══ Results ══════════════════════════════════════")
print(f"Raw Effect (Difference):      ${diff_raw:>10,.2f}  (p={p_raw:.4f})")
print(f"Recovered Effect (Matched):   ${matched_diff:>10,.2f}  (p={p_val:.4f})")
print(f"NSW Experimental Benchmark:   $  1,794.00  (ground truth)")
print("═" * 50)


══ Results ══════════════════════════════════════
Raw Effect (Difference):      $   -635.03  (p=0.3342)
Recovered Effect (Matched):   $    583.04  (p=0.4438)
NSW Experimental Benchmark:   $  1,794.00  (ground truth)
══════════════════════════════════════════════════
