# Evaluating the Impact of Job Training Programs on Earnings using the Lalonde Dataset.
This dataset is a common benchmark for causal analysis. Original analysis of the study was done by Robert LaLonde and published in his 1986 Evaluating the Econometric Evaluations of Training Programs with Experimental Data paper.
We seek to estimate the causal impact of a job training program on the post-treatment earnings of individuals in the treated group compared to those in the control group.

In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
# Load the dataset
data = pd.read_csv('/Users/prachijhamb/Downloads/lalonde_data.csv')
data.info()
# A data frame with 614 observations (185 treated, 429 control). There are 10 variables measured for each individual. 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        614 non-null    object 
 1   treat     614 non-null    int64  
 2   age       614 non-null    int64  
 3   educ      614 non-null    int64  
 4   black     614 non-null    int64  
 5   hispan    614 non-null    int64  
 6   married   614 non-null    int64  
 7   nodegree  614 non-null    int64  
 8   re74      614 non-null    float64
 9   re75      614 non-null    float64
 10  re78      614 non-null    float64
dtypes: float64(3), int64(7), object(1)
memory usage: 52.9+ KB


In [12]:
from sklearn.preprocessing import StandardScaler
#Define the treatment and covariate columns
treatment_col = 'treat'
covariate_cols = ['age', 'educ','black', 'hispan', 'married', 'nodegree', 're74', 're75']
# Scale the covariates
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[covariate_cols])
# Fit the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)  # also increasing iterations
logistic_model.fit(X_scaled, data[treatment_col])
# Add the propensity score to the data frame
data['propensity_score'] = logistic_model.predict_proba(X_scaled)[:, 1]

In [13]:
def match_treated_and_control(df, treatment_col, propensity_score_col):
    treated_df = df[df[treatment_col] == 1]
    control_df = df[df[treatment_col] == 0]

    # Nearest-neighbor matching based on propensity scores
    nn_matcher = NearestNeighbors(n_neighbors=1)
    nn_matcher.fit(control_df[propensity_score_col].values.reshape(-1, 1))

    distances, indices = nn_matcher.kneighbors(treated_df[propensity_score_col].values.reshape(-1, 1))
    matched_control_df = control_df.iloc[indices.flatten()]

    return treated_df, matched_control_df

treated_df, matched_control_df = match_treated_and_control(data, treatment_col, 'propensity_score')


In [14]:
def estimate_ate(treated_df, control_df, outcome_col):
    treated_outcome_mean = treated_df[outcome_col].mean()
    control_outcome_mean = control_df[outcome_col].mean()
    return treated_outcome_mean - control_outcome_mean

# Estimate the average treatment effect (ATE)
ate = estimate_ate(treated_df, matched_control_df, 're78')
print(f"Estimated Average Treatment Effect (ATE): {ate:.2f}")

Estimated Average Treatment Effect (ATE): 1110.12


# Method 2 - Inverse Probability of Treatment Weighting 
Used to estimate the Average Treatment Effect (ATE) by weighting individuals in the treatment and control groups based on the inverse of their propensity scores.

In [15]:
def calculate_weights(df, treatment_col, propensity_score_col):
    treatment = df[treatment_col]
    propensity_score = df[propensity_score_col]
    weights = np.where(treatment == 1, 1 / propensity_score, 1 / (1 - propensity_score))
    return weights

# Calculate IPTWs and add them to the dataset
data['iptw'] = calculate_weights(data, treatment_col, 'propensity_score')

In [16]:
def weighted_outcome_analysis(df, treatment_col, outcome_col, weights_col):
    weighted_outcome_sum = np.sum(df[treatment_col] * df[outcome_col] * df[weights_col])
    weighted_treatment_sum = np.sum(df[treatment_col] * df[weights_col])
    treated_outcome_mean = weighted_outcome_sum / weighted_treatment_sum

    weighted_outcome_sum = np.sum((1 - df[treatment_col]) * df[outcome_col] * df[weights_col])
    weighted_treatment_sum = np.sum((1 - df[treatment_col]) * df[weights_col])
    control_outcome_mean = weighted_outcome_sum / weighted_treatment_sum

    return treated_outcome_mean - control_outcome_mean

# Estimate the average treatment effect (ATE) using IPTW
ate_iptw = weighted_outcome_analysis(data, treatment_col, 're78', 'iptw')
print(f"Estimated Average Treatment Effect (ATE) using IPTW: {ate_iptw:.2f}")

Estimated Average Treatment Effect (ATE) using IPTW: 232.42


In [17]:
# Method 3 - Doubly Robust (DR)- estimate the Average Treatment Effect (ATE) by 
# combining the strengths of both Propensity Score Matching and Inverse Probability of Treatment Weighting.

In [18]:
from sklearn.linear_model import LinearRegression

def fit_outcome_regression(df, treatment_col, covariate_cols, outcome_col):
    treated_df = df[df[treatment_col] == 1]
    control_df = df[df[treatment_col] == 0]

    treated_model = LinearRegression().fit(treated_df[covariate_cols], treated_df[outcome_col])
    control_model = LinearRegression().fit(control_df[covariate_cols], control_df[outcome_col])

    return treated_model, control_model

treated_outcome_model, control_outcome_model = fit_outcome_regression(data, treatment_col, covariate_cols, 're78')

In [19]:
def doubly_robust_estimation(df, treatment_col, covariate_cols, outcome_col, propensity_score_col, treated_model, control_model):
    treated_df = df[df[treatment_col] == 1]
    control_df = df[df[treatment_col] == 0]

    treated_term = treated_df[outcome_col] - treated_model.predict(treated_df[covariate_cols]) + treated_df[propensity_score_col] * treated_df[outcome_col] / treated_df[propensity_score_col]
    control_term = control_model.predict(control_df[covariate_cols]) - control_df[propensity_score_col] * control_df[outcome_col] / (1 - control_df[propensity_score_col])

    treated_outcome_mean = np.mean(treated_term)
    control_outcome_mean = np.mean(control_term)

    return treated_outcome_mean - control_outcome_mean

ate_dr = doubly_robust_estimation(data, treatment_col, covariate_cols, 're78', 'propensity_score', treated_outcome_model, control_outcome_model)
print(f"Estimated Average Treatment Effect (ATE) using Doubly Robust Estimation: {ate_dr:.2f}")

Estimated Average Treatment Effect (ATE) using Doubly Robust Estimation: 1580.86
