In [None]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from causalml.dataset import synthetic_data
from sklearn.model_selection import KFold

In [None]:
from importlib import reload
import causal_inference.propensity as causal

# Re-import the local module so edits made to it on disk are picked up
# without restarting the kernel.
reload(causal)

In [None]:
def calculate_ate(ite):
    """Summarise individual treatment effects.

    Parameters
    ----------
    ite : array-like exposing ``.mean()`` and ``.std()``
        Individual treatment effects.

    Returns
    -------
    tuple
        ``(mean, standard deviation)``, each rounded to two decimals.
    """
    average_effect = ite.mean()
    effect_spread = ite.std()
    return average_effect.round(2), effect_spread.round(2)

def calculate_propensity(propensity):
    """Summarise propensity scores.

    Parameters
    ----------
    propensity : array-like exposing ``.mean()`` and ``.std()``
        Propensity scores.

    Returns
    -------
    tuple
        ``(mean, standard deviation)``, each rounded to two decimals.
    """
    average_score = propensity.mean()
    score_spread = propensity.std()
    return average_score.round(2), score_spread.round(2)

In [None]:

# Seed NumPy's global RNG so the simulated sample, and every number derived
# from it further down, is identical on each Restart & Run All.
# NOTE(review): assumes synthetic_data draws from NumPy's global RNG — confirm.
np.random.seed(0)

# Simulate 1000 units with 10 covariates.
y, X, treatment, true_ite, expected_outcome, true_propensity = synthetic_data(
    mode=1,
    n=1000,
    p=10,
    sigma=2,
)

# TODO: the mean propensity alone says little; plot the true propensity to see
# the overlap between control and treated, as done in:
# http://ethen8181.github.io/machine-learning/ab_tests/causal_inference/matching.html

# Compute each summary once instead of once per printed value.
ate_mean, ate_std = calculate_ate(true_ite)
propensity_mean, propensity_std = calculate_propensity(true_propensity)

print("The true ATE of the generated data is",
      ate_mean,
      "with standard deviation equal to",
      ate_std,
      ".")

print("The average propensity score value is equal to",
      propensity_mean,
      "with standard deviation equal to",
      propensity_std,
      ".")


In [None]:
# Collect covariates (x0, x1, ...), treatment flag and outcome in one frame.
covariate_names = ['x{}'.format(position) for position in range(X.shape[1])]

df = pd.DataFrame(X, columns=covariate_names).assign(
    treatment=treatment,
    outcome=y,
)

df.head()


In [None]:
from __future__ import division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look: grid background and a colour-blind-friendly palette.
sns.set_style("whitegrid")
sns.set_palette("colorblind")

# Render matplotlib figures inside the notebook output.
%matplotlib inline


In [None]:

# Outcome density per treatment arm. The second curve is the treated group
# (treatment == 1), so it must not share the "untreated" label.
sns.kdeplot(df.loc[df.treatment == 0, 'outcome'], label="untreated")
sns.kdeplot(df.loc[df.treatment == 1, 'outcome'], label="treated")
# Draw the legend explicitly; without it the labels may never be shown.
plt.legend();

In [None]:
# Naive estimate: raw difference in mean outcome between treated and control
# units, with no adjustment for covariates.
treated_outcome = df.loc[df.treatment == 1, 'outcome']
control_outcome = df.loc[df.treatment == 0, 'outcome']

# Round once, after subtracting. Subtracting two already-rounded means loses
# precision and can print float noise such as 0.6799999999999999.
naive_ate = np.round(treated_outcome.mean() - control_outcome.mean(), 2)

print("The true ATE of the generated data is equal to",
      calculate_ate(true_ite)[0], ".")
print("Estimated ATE is equal to",
      naive_ate, ".")


In [None]:
from scipy.stats import wasserstein_distance
from scipy import stats

def calc_ndiff(covariate_control, covariate_treated):
    """Normalised difference in means between the treated and control samples.

    The gap between the treated mean and the control mean is divided by the
    square root of the average of the two squared standard deviations.

    Parameters
    ----------
    covariate_control, covariate_treated : array-like exposing ``.mean()`` and ``.std()``
        Values of one covariate in the control and in the treated group.

    Returns
    -------
    The normalised difference (positive when the treated mean is larger).
    """
    mean_gap = covariate_treated.mean() - covariate_control.mean()
    averaged_variance = (covariate_control.std()**2 + covariate_treated.std()**2) / 2
    return mean_gap / np.sqrt(averaged_variance)

In [None]:
# Empty summary table with one row per covariate (the first X.shape[1] columns of df).
df_summary = pd.DataFrame(index=df.columns[:X.shape[1]].to_list())

In [None]:
# Split the rows once instead of re-filtering the frame on every iteration.
control_rows = df[df.treatment == 0]
treated_rows = df[df.treatment == 1]

for i in range(X.shape[1]):
    covariate_control = control_rows.iloc[:, i]
    covariate_treated = treated_rows.iloc[:, i]
    # Write through .loc on the frame itself. The previous chained form
    # (df_summary.iloc[i, :]['norm-diff'] = ...) assigned to a temporary
    # Series, so the value never reached df_summary. This creates the
    # 'norm-diff' column on first use.
    df_summary.loc[df_summary.index[i], 'norm-diff'] = calc_ndiff(covariate_control, covariate_treated)
    print(wasserstein_distance(covariate_control, covariate_treated))
    # If p-value is low then we can reject the null hypothesis that
    # the distributions of the two samples are the same
    print(stats.ks_2samp(covariate_control, covariate_treated)[1])

In [None]:
# A similar, and probably better, way to do what the cell below does. Check this. Goal Gio: prepare Table 1 on COVID data.

# http://ethen8181.github.io/machine-learning/ab_tests/causal_inference/matching.html

In [None]:
norm_diff, w_dist, ks_test = [], [], []

# TODO: iterate over the balance metrics instead of spelling each one out;
# a more compact form may be more pythonic but less clear.

# Split the rows once instead of re-filtering the frame on every iteration.
control_rows = df[df.treatment == 0]
treated_rows = df[df.treatment == 1]

# df_summary is indexed by covariate name, so select each column by that
# label. Parsing the position with int(index[1]) read only one character and
# picked the wrong column once there were 10 or more covariates ('x10' -> 1).
for covariate in df_summary.index:
    covariate_control = control_rows[covariate]
    covariate_treated = treated_rows[covariate]
    norm_diff.append(calc_ndiff(covariate_control, covariate_treated).round(2))
    w_dist.append(wasserstein_distance(covariate_control, covariate_treated).round(2))
    # Element [1] of the KS result is the p-value.
    ks_test.append(stats.ks_2samp(covariate_control, covariate_treated)[1].round(3))

df_summary['norm_dist'] = norm_diff
df_summary['w_dist'] = w_dist
df_summary['ks_test'] = ks_test
df_summary

In [None]:
# Can we run the same tests on a weighted sample?

In [None]:
# First analyze the propensity: attach the true score to each unit and split
# it by treatment arm.
df['p_score'] = true_propensity
p_score_control = df.loc[df.treatment == 0, 'p_score']
p_score_treated = df.loc[df.treatment == 1, 'p_score']

# Each label now reports its own group; the two were swapped before.
print('treatment count:', p_score_treated.shape)
print('control count:', p_score_control.shape)

In [None]:
# Kind of implements matching

# http://ethen8181.github.io/machine-learning/ab_tests/causal_inference/matching.html

In [None]:
# statistical tests http://benalexkeen.com/comparative-statistics-in-python-using-scipy/

In [None]:
# Figure defaults for this and later plots.
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 12})

# Overlay the propensity-score distributions of both arms on the same axes.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
for arm_scores, arm_label in ((p_score_control, 'control'),
                              (p_score_treated, 'treated')):
    sns.distplot(arm_scores, label=arm_label)

ax = plt.gca()
ax.set(xlim=(0, 1),
       title='Propensity Score Distribution of Control vs Treatment',
       ylabel='Density',
       xlabel='Scores')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Append the treatment indicator to the covariates as one extra column.
treatment_column = treatment.reshape(-1, 1)
X_new = np.hstack((X, treatment_column))

# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_new, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
from sklearn.linear_model import LinearRegression


# Ordinary least-squares regressor with default settings; used as the final
# pipeline step and passed to cross_validate in later cells.
model = LinearRegression()



In [None]:
# Steps run in order: standardise the features, apply the project's
# PropensityEstimator, then fit the regression model.
# NOTE(review): what PropensityEstimator adds to or changes in the feature
# matrix is defined in causal_inference.propensity — confirm there.
pipeline_steps = [
    ('scale', StandardScaler()),
    ('estimate_propensity', causal.PropensityEstimator()),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

In [None]:
from sklearn.metrics import mean_squared_error

# Fit every pipeline step on the training split, then push the validation
# split through the fitted steps to get predictions (fit returns the pipeline).
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Hold-out mean squared error.
score = mean_squared_error(y_valid, preds)
print('MSE:', score)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Cross-validation configuration: 10 folds, default KFold settings otherwise.
cv = KFold(n_splits=10)

# Evaluate the pipeline on the training split. scikit-learn scorers follow a
# "greater is better" convention, so 'neg_mean_squared_error' yields the
# NEGATED MSE of each fold.
scores = cross_val_score(my_pipeline,
                         X_train, y_train,
                         scoring='neg_mean_squared_error',
                         cv=cv, n_jobs=-1)

# Flip the sign so the reported figure is a true, non-negative MSE.
# Previously the raw negative values were printed under the "MSE" label.
mse_scores = -scores

# Summarize the model performance: mean and standard deviation across folds.
print('MSE: %.3f (%.3f)' % (np.mean(mse_scores), np.std(mse_scores)))

In [None]:
from sklearn.model_selection import cross_val_score


# 5-fold cross-validation of the whole pipeline on the full data set.
# The scorer returns the negated MSE of each fold.
scores = cross_val_score(
    my_pipeline,
    X_new,
    y,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores

In [None]:
# `scores` holds negated MSE values from cross_val_score with
# scoring='neg_mean_squared_error', not accuracies. Flip the sign of the mean
# and label it as what it is; the spread shown is two standard deviations.
print("MSE: %0.2f (+/- %0.2f)" % (-scores.mean(), scores.std() * 2))



In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

# Score the bare regression model (no pipeline) on two metrics at once.
scoring = ['neg_mean_squared_error', 'r2']
scores = cross_validate(model, X_new, y, scoring=scoring)

# cross_validate returns a dict; list its keys alphabetically.
sorted(scores)

In [None]:
# Per-fold fit-time entry of the cross_validate result.
scores['fit_time']

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score

# `model` is a LinearRegression and `y` a continuous outcome, so the
# classification scorers used here before (macro precision / recall) cannot
# score it. Use regression metrics instead: one named by string, one wrapped
# with make_scorer to show both ways of declaring a scorer.
scoring = {'neg_mse': 'neg_mean_squared_error',
           'r2': make_scorer(r2_score)}

scores = cross_validate(model, X, y, scoring=scoring,
                        cv=5, return_train_score=True)

# Result keys follow the pattern '<train|test>_<scorer name>'.
print(sorted(scores.keys()))
scores['train_r2']

In [None]:
# Open question: what is the goodness-of-fit metric here?
from sklearn.neighbors import NearestNeighbors

# Index the covariates with a ball tree, then query the same points for
# their two nearest neighbours.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)

In [None]:
# Neighbour distances for the sample in row 1.
# NOTE(review): the query points are the fitted points, so the first entry is
# presumably the point itself at distance 0 — confirm.
distances[1]

In [None]:
# Neighbour indices for the sample in row 731.
# NOTE(review): 731 looks like an arbitrary spot check — confirm.
indices[731]

In [None]:
# Shape of the neighbour-index array once flattened into a single column.
indices.reshape(-1,1).shape