<a href="https://colab.research.google.com/github/Thonyta17/Econ-5200/blob/main/Lab9/Causal_Inference_and_Propensity_Score_Matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

#load dataset
df = pd.read_csv("lalonde.csv")

In [2]:
# Naive Comparison
naive_diff = df[df.treat==1]['re78'].mean()-df[df.treat==0]['re78'].mean()
print(f"Naive Difference in Means: ${naive_diff:,.2f}")
# Expected Result: -$635.03

Naive Difference in Means: $-635.03


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,307.5,0.301303,27.363192,10.26873,0.395765,0.117264,0.415309,0.630293,4557.546569,2184.938207,6792.834483
std,177.390811,0.459198,9.881187,2.628325,0.489413,0.321997,0.493177,0.483119,6477.964479,3295.679043,7470.730792
min,1.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,154.25,0.0,20.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,238.283425
50%,307.5,0.0,25.0,11.0,0.0,0.0,0.0,1.0,1042.33,601.5484,4759.0185
75%,460.75,1.0,32.0,12.0,1.0,0.0,1.0,1.0,7888.49825,3248.9875,10893.5925
max,614.0,1.0,55.0,18.0,1.0,1.0,1.0,1.0,35040.07,25142.24,60307.93


In [8]:
# Define covariates
X = df[['age',"educ","black","hispan","married",'re75','re78']]
y = df['treat']

# Fit Propensity Model
logit = LogisticRegression(solver='liblinear')
logit.fit(X, y)

# Generate Scores
df['pscore'] = logit.predict_proba(X)[:, 1]

#view
df[['treat','pscore']].head()

Unnamed: 0,treat,pscore
0,1,0.432684
1,1,0.145755
2,1,0.744671
3,1,0.69514
4,1,0.694295


In [14]:
from sklearn.neighbors import NearestNeighbors

# Separate groups
treated = df[df.treat==1]
control = df[df.treat==0]

# Fit NN on Control scores
nbrs = NearestNeighbors(n_neighbors=1).fit(control[['pscore']])

# Find matches for Treated scores
distances, indices = nbrs.kneighbors(treated[['pscore']])
matched_control = control.iloc[indices.flatten()]

# Construct Matched DataFrame
matched_df = pd.concat([treated, matched_control])

print(f"Original treated size:{len(treated)}")
print(f"Original treated size:{len(matched_control)}")

Original treated size:185
Original treated size:185


In [15]:
from scipy import stats

# T-test on raw data
diff = treated['re78'].mean() - control['re78'].mean()
t_stat, p_val = stats.ttest_ind(treated['re78'], control['re78'])

print(f"Raw Effect (Difference): ${diff:,.2f}")
print(f"P-value: {p_val:.4f}")


# Isolate the matched outcomes
matched_treated = matched_df[matched_df.treat==1]['re78']
matched_control = matched_df[matched_df.treat==0]['re78']

# Estimate the causal effect (T-test on matched data)
matched_diff = matched_treated.mean() - matched_control.mean()
t_stat, p_val = stats.ttest_ind(matched_treated, matched_control)


print(f"Recovered Effect (Matched Difference): ${matched_diff:,.2f}")
print(f"P-value: {p_val:.4f}")

Raw Effect (Difference): $-635.03
P-value: 0.3342
Recovered Effect (Matched Difference): $1,020.07
P-value: 0.1607


In [16]:
!pip install wbgapi

Collecting wbgapi
  Downloading wbgapi-1.0.12-py3-none-any.whl.metadata (13 kB)
Downloading wbgapi-1.0.12-py3-none-any.whl (36 kB)
Installing collected packages: wbgapi
Successfully installed wbgapi-1.0.12
