# Directory, Libraries and Data

In [None]:
# Mount Google Drive to access files in Google Colab.
# The drive is attached at /content/drive; the project folder the notebook
# switches into afterwards lives underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the course folder on the mounted Drive the working directory, so that
# relative paths (e.g. the "stackoverflow.csv" file read later) resolve.
%cd /content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching

/content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching


In [None]:
# install CausalInference library
!pip install CausalInference

Collecting CausalInference
  Downloading CausalInference-0.1.3-py3-none-any.whl.metadata (2.5 kB)
Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CausalInference
Successfully installed CausalInference-0.1.3


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from causalinference import CausalModel

In [None]:
# Load Data
# Stack Overflow survey data; the relative path relies on the working
# directory having been set to the course folder beforehand.
df = pd.read_csv("stackoverflow.csv")
# Preview the first five rows
df.head()

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction,Data_scientist,Database_administrator,...,Developer_with_stats_math_background,DevOps,Embedded_developer,Graphic_designer,Graphics_programming,Machine_learning_specialist,Mobile_developer,Quality_assurance_engineer,Systems_administrator,Web_developer
0,United Kingdom,100000.0,20,0,1,5000,Remote,8,0,0,...,0,0,1,0,0,0,0,0,0,0
1,United States,130000.0,20,1,1,1000,Remote,9,0,0,...,0,1,1,0,0,0,0,1,0,1
2,United States,175000.0,16,0,1,10000,Not remote,7,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Germany,64516.12903,4,0,0,1000,Not remote,9,0,0,...,0,0,0,0,0,0,0,0,0,1
4,India,6636.323594,1,0,1,5000,Not remote,5,0,0,...,0,0,0,0,0,0,0,0,0,1


# Data Analysis

In [None]:
# Pick relevant columns: the outcome (CareerSatisfaction), the treatment
# (Remote) and the candidate confounders.
# Columns are selected by name rather than by position so the selection stays
# correct if the CSV column order changes, and so that re-running this cell on
# an already-transformed frame fails loudly (KeyError) instead of silently
# keeping the wrong columns.
relevant_columns = [
    "Country",
    "Salary",
    "YearsCodedJob",
    "OpenSource",
    "Hobby",
    "CompanySizeNumber",
    "Remote",
    "CareerSatisfaction",
]
df = df[relevant_columns]

# Show only the header row to confirm which columns were kept
df.head(0)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction


In [None]:
# One-hot encode the categorical columns. The first level of each categorical
# is left out (drop_first=True) so it acts as the reference category and the
# dummies are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

# Peek at a single row to confirm the new dummy columns
df.head(n=1)

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote
0,100000.0,20,0,1,5000,8,False,False,True,False,True


In [None]:
# Covariate balance at a glance: the average of every column, computed
# separately for each value of the 'Remote_Remote' treatment dummy
remote_groups = df.groupby(by='Remote_Remote')
remote_groups.mean()

Unnamed: 0_level_0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States
Remote_Remote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,70201.175042,7.142857,0.332736,0.76051,2187.325563,7.551106,0.142857,0.096035,0.189878,0.480175
True,87400.737001,10.12,0.443478,0.766957,1712.756522,7.855652,0.069565,0.097391,0.121739,0.662609


In [None]:
# T-tests to compare means of continuous variables between the two groups
# defined by the 'Remote_Remote' treatment dummy (0 vs. 1).

# List of continuous variables to test
continuous = ['Salary', 'YearsCodedJob']

# Dictionaries to store test statistics and p-values, keyed by variable name
stat = {}
pvalue = {}

# Row masks for the two groups, built once outside the loop
is_group1 = df['Remote_Remote'] == 0
is_group2 = df['Remote_Remote'] == 1

# Loop through each continuous variable
for x in continuous:
    # Select only the tested column for each group. Masking the whole frame
    # with DataFrame.where(...).dropna() would also discard any respondent
    # that has a missing value in an unrelated column; here only missing
    # values of the tested variable itself are removed.
    group1 = df.loc[is_group1, x].dropna()
    group2 = df.loc[is_group2, x].dropna()

    # Independent two-sample t-test (SciPy defaults) and store the results
    stat[x], pvalue[x] = ss.ttest_ind(group1, group2)

# Convert p-values into a DataFrame (one row per variable) for easy viewing
ttests = pd.DataFrame.from_dict(pvalue, orient='index')
ttests.columns = ['pvalue']

# Print the DataFrame with p-values
print(ttests)

                     pvalue
Salary         1.057708e-22
YearsCodedJob  3.637316e-30


# Matching

In [None]:
# Inspect the DataFrame before matching: column names, non-null counts and
# dtypes (df.info() prints the summary directly)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5594 entries, 0 to 5593
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Salary                  5594 non-null   float64
 1   YearsCodedJob           5594 non-null   int64  
 2   OpenSource              5594 non-null   int64  
 3   Hobby                   5594 non-null   int64  
 4   CompanySizeNumber       5594 non-null   int64  
 5   CareerSatisfaction      5594 non-null   int64  
 6   Country_Germany         5594 non-null   bool   
 7   Country_India           5594 non-null   bool   
 8   Country_United Kingdom  5594 non-null   bool   
 9   Country_United States   5594 non-null   bool   
 10  Remote_Remote           5594 non-null   bool   
dtypes: bool(5), float64(1), int64(5)
memory usage: 289.7 KB


In [None]:
# Isolate the outcome variable (y), treatment indicator (treat), and confounders

# Extract the outcome variable 'CareerSatisfaction'
y = df.CareerSatisfaction.values

# Extract the treatment indicator 'Remote_Remote'
treat = df.Remote_Remote.values

# Everything except the treatment and the outcome is used as a confounder.
# The remaining columns mix bool dummies with int/float columns, and pandas
# converts such a mix to an `object` array by default. Casting explicitly to
# float (False/True -> 0.0/1.0) gives the model a proper numeric matrix.
confounders = (
    df.drop(columns=["Remote_Remote", "CareerSatisfaction"])
      .to_numpy(dtype=float)
)

In [None]:
# Estimate the effect of the treatment on the outcome via matching

# Build the causal model from outcome (Y), treatment indicator (D) and
# confounder matrix (X)
model = CausalModel(Y=y, D=treat, X=confounders)

# Run the matching estimator with bias adjustment switched on
model.est_via_matching(bias_adj=True)

# Print the table of estimated treatment effects stored on the model
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.114      0.130      0.881      0.378     -0.140      0.368
           ATC      0.106      0.136      0.778      0.436     -0.161      0.372
           ATT      0.187      0.142      1.321      0.187     -0.090      0.464



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef


# Robustness check

In [None]:
# Robustness check: rebuild the confounder matrix without 'Hobby'
# (the treatment and outcome columns are excluded as before).
# The remaining columns mix bool dummies with int/float columns, and pandas
# converts such a mix to an `object` array by default, so the matrix is cast
# explicitly to float (False/True -> 0.0/1.0).
confounders = (
    df.drop(columns=["Remote_Remote",
                     "CareerSatisfaction",
                     "Hobby"])
      .to_numpy(dtype=float)
)

In [None]:
# Re-run the matching estimator on the updated set of confounders:
# build the model from outcome (Y), treatment indicator (D) and confounder
# matrix (X), then estimate with bias adjustment switched on
model = CausalModel(Y=y, D=treat, X=confounders)
model.est_via_matching(bias_adj=True)

# Print the table of estimated treatment effects stored on the model
print(model.estimates)


  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef



Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.148      0.131      1.128      0.259     -0.109      0.406
           ATC      0.140      0.138      1.013      0.311     -0.131      0.411
           ATT      0.220      0.137      1.602      0.109     -0.049      0.488

