# Libraries, Directory and Data

In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Switch the working directory to the project folder on the mounted Drive so
# that relative paths (such as the CSV loaded below) resolve against it.
# NOTE(review): this absolute path is specific to one Drive layout.
%cd /content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching Project

/content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching Project


In [3]:
# Install the causalInference library
!pip install CausalInference

Collecting CausalInference
  Downloading CausalInference-0.1.3-py3-none-any.whl.metadata (2.5 kB)
Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CausalInference
Successfully installed CausalInference-0.1.3


In [4]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from causalinference import CausalModel

In [10]:
# Load the data: read the CSV (path is relative to the current working
# directory) into a DataFrame and preview the first rows
data_file = "stackoverflow.csv"
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction,Data_scientist,Database_administrator,...,Developer_with_stats_math_background,DevOps,Embedded_developer,Graphic_designer,Graphics_programming,Machine_learning_specialist,Mobile_developer,Quality_assurance_engineer,Systems_administrator,Web_developer
0,United Kingdom,100000.0,20,0,1,5000,Remote,8,0,0,...,0,0,1,0,0,0,0,0,0,0
1,United States,130000.0,20,1,1,1000,Remote,9,0,0,...,0,1,1,0,0,0,0,1,0,1
2,United States,175000.0,16,0,1,10000,Not remote,7,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Germany,64516.12903,4,0,0,1000,Not remote,9,0,0,...,0,0,0,0,0,0,0,0,0,1
4,India,6636.323594,1,0,1,5000,Not remote,5,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
# Check information: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5594 entries, 0 to 5593
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               5594 non-null   object 
 1   Salary                                5594 non-null   float64
 2   YearsCodedJob                         5594 non-null   int64  
 3   OpenSource                            5594 non-null   int64  
 4   Hobby                                 5594 non-null   int64  
 5   CompanySizeNumber                     5594 non-null   int64  
 6   Remote                                5594 non-null   object 
 7   CareerSatisfaction                    5594 non-null   int64  
 8   Data_scientist                        5594 non-null   int64  
 9   Database_administrator                5594 non-null   int64  
 10  Desktop_applications_developer        5594 non-null   int64  
 11  Developer_with_st

# Data Analysis

In [12]:
# Pick the relevant columns: the outcome (CareerSatisfaction), the treatment
# (Remote) and the candidate confounders.
# Columns are selected by label rather than by position so that the selection
# stays correct even if the column order of the CSV changes; a missing column
# raises a KeyError instead of silently picking the wrong data.
relevant_columns = [
    "Country",
    "Salary",
    "YearsCodedJob",
    "OpenSource",
    "Hobby",
    "CompanySizeNumber",
    "Remote",
    "CareerSatisfaction",
]
df = df[relevant_columns]
df.head()

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction
0,United Kingdom,100000.0,20,0,1,5000,Remote,8
1,United States,130000.0,20,1,1,1000,Remote,9
2,United States,175000.0,16,0,1,10000,Not remote,7
3,Germany,64516.12903,4,0,0,1000,Not remote,9
4,India,6636.323594,1,0,1,5000,Not remote,5


In [13]:
# One-hot encode the categorical (text) columns as 0/1 integer indicators.
# drop_first=True omits the first level of every categorical variable, which
# then acts as the reference category (e.g. Remote becomes the single
# indicator Remote_Remote).
dummy_options = {"drop_first": True, "dtype": int}
df = pd.get_dummies(df, **dummy_options)
df.head()

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote
0,100000.0,20,0,1,5000,8,0,0,1,0,1
1,130000.0,20,1,1,1000,9,0,0,0,1,1
2,175000.0,16,0,1,10000,7,0,0,0,1,0
3,64516.12903,4,0,0,1000,9,1,0,0,0,0
4,6636.323594,1,0,1,5000,5,0,1,0,0,0


In [14]:
# Compare the average of every column between the two treatment groups
# (Remote_Remote == 0 vs. Remote_Remote == 1)
df.groupby(by="Remote_Remote").agg("mean")

Unnamed: 0_level_0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States
Remote_Remote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,70201.175042,7.142857,0.332736,0.76051,2187.325563,7.551106,0.142857,0.096035,0.189878,0.480175
1,87400.737001,10.12,0.443478,0.766957,1712.756522,7.855652,0.069565,0.097391,0.121739,0.662609


Covariate balance checks: t-tests for the continuous variables and chi-square tests for the binary variables.

In [16]:
# T-tests to compare the means of the continuous variables between the
# non-remote (Remote_Remote == 0) and remote (Remote_Remote == 1) groups

# Listing the continuous variables
continuous = ["Salary", "YearsCodedJob"]

# Dictionaries to store the test statistics and p-values
stat = {}
pvalue = {}

# Boolean masks for the two groups, computed once outside the loop
is_not_remote = df["Remote_Remote"] == 0
is_remote = df["Remote_Remote"] == 1

# Looping through the continuous variables
for var in continuous:
  # Separating the data into the 2 groups defined by Remote_Remote.
  # Only the column under test is selected, so a missing value in some other
  # column can no longer remove a row from the sample; missing values in the
  # tested column itself are still dropped.
  group1 = df.loc[is_not_remote, var].dropna()
  group2 = df.loc[is_remote, var].dropna()

  # Performing the t-test (scipy default: two-sided, equal variances assumed)
  stat[var], pvalue[var] = ss.ttest_ind(group1, group2)

# Convert the statistics and p-values into a dataframe for easy viewing
ttests = pd.DataFrame.from_dict({"stat": stat, "pvalue": pvalue})
ttests

Unnamed: 0,stat,pvalue
Salary,-9.849059,1.057708e-22
YearsCodedJob,-11.479414,3.637316e-30


# Matching

In [19]:
# Split the frame into the three arrays used for matching:
# the outcome (y), the treatment indicator (treat) and the confounders
# (every remaining column)
outcome_col = "CareerSatisfaction"
treatment_col = "Remote_Remote"

y = df[outcome_col].to_numpy()
treat = df[treatment_col].to_numpy()
confounders = df.drop(columns=[outcome_col, treatment_col]).to_numpy()

In [21]:
# Estimate the treatment effect by matching on the confounders, with the
# bias adjustment enabled, and print the ATE / ATC / ATT summary table
model = CausalModel(Y=y, D=treat, X=confounders)
model.est_via_matching(bias_adj=True)
print(model.estimates)

  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef



Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.114      0.130      0.881      0.378     -0.140      0.368
           ATC      0.106      0.136      0.778      0.436     -0.161      0.372
           ATT      0.187      0.142      1.321      0.187     -0.090      0.464



# Robustness check

In [22]:
# Robustness check: rebuild the confounder matrix without "Hobby"
# (the outcome and treatment columns are excluded as before)
excluded_columns = ["CareerSatisfaction", "Remote_Remote", "Hobby"]
confounders = df.drop(columns=excluded_columns).values

In [23]:
# Re-run the bias-adjusted matching estimator on the current confounder
# matrix and print the resulting ATE / ATC / ATT summary table
model = CausalModel(Y=y, D=treat, X=confounders)
model.est_via_matching(bias_adj=True)
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.148      0.131      1.128      0.259     -0.109      0.406
           ATC      0.140      0.138      1.013      0.311     -0.131      0.411
           ATT      0.220      0.137      1.602      0.109     -0.049      0.488



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef
