#### Stratification 

Input: list of Propensity scores 
Output: ATE 

Governing Equation:

$$\hat{\Delta}_s = \sum_{n=1}^{K} \frac{N_j}{N}\ \{ N_1j^-1\sum_{i=1}^{N} T_i Y_i I(\hat{e}\in \hat{Q_j}) - N_0j^-1\sum_{i=1}^{N} T_i Y_i I(\hat{e}\in \hat{Q_j}) 	\} $$

In [150]:
# packages as needed 
import numpy as np
import math
import pandas as pd

In [151]:
lowDim_dataset = pd.read_csv('../data/lowDim_dataset.csv')
highDim_dataset = pd.read_csv('../data/highDim_dataset.csv')
Y=lowDim_dataset['Y'] #response
D=lowDim_dataset['A'] #treatment/control column

In [152]:
# get real Propesnity scores here
# scores = PropensityScoreFunction( something )

In [153]:
# Function for Creating Strata from Propensity Scores

def stratify(scores):
    
    # Make data frame holding Y, A, Propesnity score, 
    df = pd.DataFrame({
        'Y': Y,
        'D': D,
        'Scores': scores
    })
    K = 5
    N = len(df)          # Number of individuals

    # Create stratum and stratum limits
    Q1 = np.quantile(scores, .20)
    Q2 = np.quantile(scores, .40)
    Q3 = np.quantile(scores, .60)
    Q4 = np.quantile(scores, .80)
    Q5 = np.quantile(scores, 1.0)
    quin1 = df[df['Scores']<= Q1]
    quin2 = df[(df['Scores']> Q1) & (df['Scores']<= Q2)]
    quin3 = df[(df['Scores']> Q2) & (df['Scores']<= Q3)]
    quin4 = df[(df['Scores']> Q3) & (df['Scores']<= Q4)]
    quin5 = df[df['Scores']> Q4]

    quintiles = [quin1, quin2, quin3, quin4, quin5]
    Q_ranges = [None, Q1, Q2, Q3, Q4, Q5]
    """print("\nquin1: \n", Q1)
    print("\nquin2: \n ", quin2)
    print("\nquin3: \n", quin3)
    print("\nquin4: \n", quin4)
    print("\nquin5: \n", quin5)"""
    return [quintiles, Q_ranges]
    

In [154]:
# Function for Calculating ATE from strata

def strat_ATE(quintiles, Q_ranges):
    results = []
    for i, stratum in enumerate(quintiles): 
        i+=1
        sum1 = 0
        sum2 = 0

        # Calc e for each sample within each strata (Should all be 1)
        e_list = [1 if (score<=Q_ranges[i]) and (score > 0 if Q_ranges[i-1]== None else score > Q_ranges[i-1]) else 0 for score in stratum['Scores']]
        # Summation of treated samples within strata
        sum1 = sum([T*Y*e for T,Y,e in zip(stratum['Y'],stratum['D'], e_list)])
        # Summation of untreated samples within strata
        sum2 = sum([(1-T)*Y*e for T,Y,e in zip(stratum['Y'],stratum['D'], e_list)]) 

        """
        For Troubleshooting:     Erase if needed
        print("\n\nStratum: ", i)
        print(sum1)
        print(sum2)
        print("With  N * terms ")
        print(sum1/stratum.D.value_counts()[1])
        print(sum2/stratum.D.value_counts()[0])
        print("Difference ")
        print((sum1/stratum.D.value_counts()[1])-(sum2/stratum.D.value_counts()[0]))
        print("Nj / N * Final Sum ")
        print(len(stratum)/len(df) * ((sum1/stratum.D.value_counts()[1])-(sum2/stratum.D.value_counts()[0])))
        """
        results.append(len(stratum)/N * ((sum1/stratum.D.value_counts()[1])-(sum2/stratum.D.value_counts()[0])))

    return sum(results)

Below is "test run" with randomly generated scores. Remove for submission

In [155]:
import random        # only need for troubleshooting code. 

In [156]:
# Make dummy list of PS
scores = []  # Dummy list for now
for i in range(475):  
    scores.append(round(random.uniform(4.02,43.05), 4))

quintiles, Q_ranges = stratify(scores)

print("Estimated ATE: ", strat_ATE(quintiles, Q_ranges))

Estimated ATE:  27.785849967476842
