# PSET 2 - Econometric Theory

In [29]:
# I import the Python modules (or libraries) that I'll need to solve the problems
import pandas as pd
import numpy as np
import scipy.stats as stats

In [6]:
# I read the csv of the dataset and I assign it to a variable
data = pd.read_csv("tracking.csv")

## Exercise 1

### 1.

In [21]:
grouped_data = data[["tracking", "scoreendfirstgrade", "schoolid"]].groupby(["schoolid"]).mean()

In [30]:
grouped_data.head()

Unnamed: 0_level_0,tracking,scoreendfirstgrade
schoolid,Unnamed: 1_level_1,Unnamed: 2_level_1
430,1.0,-0.184369
432,1.0,-0.178371
436,1.0,-0.068224
443,0.0,-0.024504
451,0.0,-0.757769


In [27]:
tracking_mean = grouped_data.groupby(["tracking"]).mean()
ATE_estimation = tracking_mean.iloc[1]-tracking_mean.iloc[0]

In [28]:
ATE_estimation

scoreendfirstgrade    0.133913
dtype: float64

From this simple estimate it seems that tracking has a positive effect on end of first grade scores

### 2.

Documentation for the `stats-ttest_ind` function: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

In [40]:
# I calculate the T-test for the means of two independent samples of scores.
# This test assumes that the populations have identical variances by default, but I set equal_var to False in order to use the Welch's t-test and drop this assumption.
observed_t = stats.ttest_ind(grouped_data[grouped_data['tracking'] == 1]['scoreendfirstgrade'],
                            grouped_data[grouped_data['tracking'] == 0]['scoreendfirstgrade'], equal_var = False)
# I print the output of the test
print(f"This is the outcome of our Welch's test: \n 1) T-statistic: {observed_t.statistic} \n 2) P-Value {observed_t.pvalue}")

# Generate permutations (I chose 10000 as an arbitrary number, no specific reason for it)
num_permutations = 10000
# Here I store the outcome of each simulation
permutation_t_stats = []

# This is a for-loop that repeats for 10000 times
for i in range(num_permutations):
    # Creates a permuted version of the DataFrame by randomly shuffling the rows without replacement
    permuted_df = grouped_data.sample(frac=1/4, replace=False)
    # Calculates the t-statistic for the permuted data
    permutation_t = stats.ttest_ind(permuted_df[permuted_df['tracking'] == 1]['scoreendfirstgrade'],
                                   permuted_df[permuted_df['tracking'] == 0]['scoreendfirstgrade']).statistic
    
    permutation_t_stats.append(permutation_t)

# Determine critical value
critical_value = np.percentile(permutation_t_stats, 10)

# Compare observed statistic to critical value
if abs(observed_t.statistic) > critical_value:
    print("Finding is robust at 10% level randomization inference")
else:
    print("Finding is not robust at 10% level randomization inference")

This is the outcome of our Welch's test: 
 1) T-statistic: 1.678443563442577 
 2) P-Value 0.09622272798001359
Finding is robust at 10% level randomization inference


In [44]:
critical_value

-0.30192626054885086