**Objective**: Estimate the counterfactual outcome for an individual who attended the training program, i.e. predict what their income would have been if they had not attended.

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Fix NumPy's global RNG so the synthetic data generated below is reproducible.
SEED = 23
np.random.seed(SEED)


In [2]:
n = 50  # sample size

# Draws are taken from NumPy's global RNG in a fixed order
# (treatment, then education, then income noise); keep that order so results
# stay reproducible under a given seed.
training_flags = np.random.choice([0, 1], size=n)   # treatment: 1 = trained, 0 = no training
education_years = np.random.randint(8, 20, size=n)  # whole years in 8..19 (randint's upper bound is exclusive)
income_noise = np.random.normal(0, 2000, size=n)    # zero-mean noise, standard deviation 2000

data = pd.DataFrame({'training': training_flags, 'education': education_years})

# Linear data-generating process: base 30000, +2000 per year of education,
# +5000 for attending training, plus the noise term.
data['income'] = 30000 + 2000 * data['education'] + 5000 * data['training'] + income_noise

data.head()

Unnamed: 0,training,education,income
0,1,10,54848.869605
1,0,18,66813.954886
2,0,11,51696.795738
3,1,8,53866.553887
4,0,16,57852.903252


In [3]:
# Scratch check: draw two values from {0, 1} to see what np.random.choice returns.
# NOTE(review): this advances NumPy's global RNG and looks like leftover exploration — confirm it is still needed.
np.random.choice([0, 1], size=2)

array([0, 1])

#### With Linear Regression

In [4]:
# Regressors (treatment indicator + education) and the outcome to be explained.
feature_columns = ['training', 'education']
X = data.loc[:, feature_columns]
y = data.loc[:, 'income']

# Show both shapes as a quick sanity check.
(X.shape, y.shape)

((50, 2), (50,))

In [5]:
# Ordinary least squares of income on training and education.
# fit() returns the estimator itself, so it can be chained onto the constructor.
model = LinearRegression().fit(X, y)
model

In [6]:
# Pull the fitted parameters out of the regression.
# coef_ is unpacked in the column order used for X: training first, then education.
intercept = model.intercept_
coef_training, coef_education = model.coef_

fitted_parameters = (
    ("Intercept:", intercept),
    ("Coefficient for training:", coef_training),
    ("Coefficient for education:", coef_education),
)
for label, value in fitted_parameters:
    print(label, value)

Intercept: 29934.954693432042
Coefficient for training: 6107.695984666824
Coefficient for education: 1947.1544162859013


In [7]:
# Factual record for individual 0: their education and observed income.
individual_0 = data.loc[0]
education_0 = individual_0['education']
actual_income_0 = individual_0['income']

# The "had they not attended" question only makes sense for someone who did attend.
# Fail loudly if a different seed or sample makes row 0 an untreated individual.
assert individual_0['training'] == 1, (
    "Individual 0 did not attend training; choose a treated row for this counterfactual"
)

# Counterfactual scenario: set training = 0 and keep education at its observed value.
# NOTE: this is the fitted model's prediction, so the individual's own residual
# (actual income minus fitted income) is not carried over into the counterfactual.
counterfactual_income_0 = intercept + (coef_training * 0) + (coef_education * education_0)

print("Actual income for Individual 0:", actual_income_0)
print("Counterfactual income for Individual 0 (if they had not attended):", counterfactual_income_0)

Actual income for Individual 0: 54848.86960513647
Counterfactual income for Individual 0 (if they had not attended): 49406.49885629106


#### With DoWhy

In [None]:
from dowhy import CausalModel

In [None]:
# Causal model for the effect of training on income, with education declared as a common cause.
# NOTE: this rebinds `model`, which previously held the fitted LinearRegression.
training_model_spec = {
    'data': data,
    'treatment': 'training',
    'outcome': 'income',
    'common_causes': ['education'],
}
model = CausalModel(**training_model_spec)

# Render the causal graph DoWhy built from the specification.
model.view_model()

In [None]:
# Identify the causal estimand for the current causal model; the result is passed to estimate_effect in a later cell.
identified_estimand = model.identify_effect()


The mean value reported by the estimate below is similar to the coefficient for `training` (≈ 6107.70) in the linear regression model above.

In [None]:
# Estimate the effect for the identified estimand using DoWhy's
# backdoor linear-regression estimator, then print the full report.
estimation_method = "backdoor.linear_regression"
estimate = model.estimate_effect(identified_estimand, method_name=estimation_method)
print(estimate)

In [30]:
# Same pipeline with the roles swapped: education is the treatment and
# training is declared as the common cause.
# NOTE: this rebinds `model`, `identified_estimand` and `estimate`, so the
# objects from the training analysis are no longer reachable under those names.
education_model_spec = {
    'data': data,
    'treatment': 'education',
    'outcome': 'income',
    'common_causes': ['training'],
}
model = CausalModel(**education_model_spec)

identified_estimand = model.identify_effect()

estimation_method = "backdoor.linear_regression"
estimate = model.estimate_effect(identified_estimand, method_name=estimation_method)
print(estimate)

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d                          
────────────(E[income|training])
d[education]                    
Estimand assumption 1, Unconfoundedness: If U→{education} and U→income then P(income|education,training,U) = P(income|education,training)

## Realized estimand
b: income~education+training
Target units: ate

## Estimate
Mean value: 1939.8150994980388



  intercept_parameter = self.model.params[0]


The mean value of this estimate (≈ 1939.82) is similar to the coefficient for `education` (≈ 1947.15) in the linear regression model above.