## Directory, Libraries and Data

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from causalinference import CausalModel

In [28]:
df = pd.read_csv('stackoverflow2.csv')
df.head()

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction,Data_scientist,Database_administrator,...,Developer_with_stats_math_background,DevOps,Embedded_developer,Graphic_designer,Graphics_programming,Machine_learning_specialist,Mobile_developer,Quality_assurance_engineer,Systems_administrator,Web_developer
0,United Kingdom,100000.0,20,0,1,5000,Remote,8,0,0,...,0,0,1,0,0,0,0,0,0,0
1,United States,130000.0,20,1,1,1000,Remote,9,0,0,...,0,1,1,0,0,0,0,1,0,1
2,United States,175000.0,16,0,1,10000,Not remote,7,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Germany,64516.12903,4,0,0,1000,Not remote,9,0,0,...,0,0,0,0,0,0,0,0,0,1
4,India,6636.323594,1,0,1,5000,Not remote,5,0,0,...,0,0,0,0,0,0,0,0,0,1


## Data Analysis

In [29]:
#picking variables
df = df.iloc[:, :8]
df.head(0)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction


In [30]:
df = pd.get_dummies(df, drop_first = True)
df = df.astype(int)
df

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote
0,100000,20,0,1,5000,8,0,0,1,0,1
1,130000,20,1,1,1000,9,0,0,0,1,1
2,175000,16,0,1,10000,7,0,0,0,1,0
3,64516,4,0,0,1000,9,1,0,0,0,0
4,6636,1,0,1,5000,5,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5589,88750,8,0,1,1000,8,0,0,1,0,0
5590,6460,2,0,1,10000,6,0,1,0,0,0
5591,4228,2,0,1,1,10,0,1,0,0,0
5592,33750,1,1,1,100,6,0,0,1,0,0


In [31]:
df.groupby('Remote_Remote').mean()

Unnamed: 0_level_0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States
Remote_Remote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,70201.00777,7.142857,0.332736,0.76051,2187.325563,7.551106,0.142857,0.096035,0.189878,0.480175
1,87400.631304,10.12,0.443478,0.766957,1712.756522,7.855652,0.069565,0.097391,0.121739,0.662609


In [33]:
#T-Test Loop

#Get the continous variables
continuous = ['Salary', 'YearsCodedJob']

#Storing results
stat = {}
p = {}

#loop
for i in continuous:
    group1 = df.where(df.Remote_Remote == 0).dropna()[i]
    group2 = df.where(df.Remote_Remote == 1).dropna()[i]
    stat[i], p[i] = ss.ttest_ind(group1, group2)
    
ttests = pd.DataFrame.from_dict(p, orient = "Index")
ttests.columns = ['p-values']
ttests

Unnamed: 0,p-values
Salary,1.0576530000000001e-22
YearsCodedJob,3.637316e-30


## Matching 

In [34]:
#Isolating the y, treat and Confounders
treat = df.Remote_Remote.values
y = df.CareerSatisfaction.values
confounders = df.drop(columns=['Remote_Remote',
                              'CareerSatisfaction']).values

In [35]:
model = CausalModel(y, treat, confounders)
model.est_via_matching(bias_adj = True)
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.179      0.142      1.258      0.208     -0.100      0.457
           ATC      0.163      0.149      1.093      0.274     -0.129      0.456
           ATT      0.312      0.159      1.970      0.049      0.002      0.623



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef


## Robustness Check

In [36]:
#Remove 1 Confounder
confounders = df.drop(columns = ['Remote_Remote',
                                'CareerSatisfaction',
                                'Hobby']).values

In [37]:
#Matching
model = CausalModel(y, treat, confounders)
model.est_via_matching(bias_adj = True)
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.196      0.153      1.280      0.200     -0.104      0.496
           ATC      0.192      0.162      1.185      0.236     -0.126      0.509
           ATT      0.231      0.168      1.373      0.170     -0.099      0.561



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef
