In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

from functions import *

In [18]:
# Load the covariate table; the first CSV column becomes the row index.
X = pd.read_csv("indep_x.csv", index_col=0)

In [19]:
# Run the project's `check_balance` helper with "treated" as the second
# argument and every other column of X as the list of covariates to check.
# The resulting per-covariate table is the cell output below.
check_balance(
    X,
    "treated",
    X.drop(columns=["treated"]).columns,
)

  t_stat, p_value = stats.ttest_ind(treatment_group, control_group, equal_var=False)  # Welch's t-test


Unnamed: 0,Covariate,Test,p-value,Significant,Difference,Standardized Difference
0,age,Welch's t-test,0.410959,False,-0.159262,-0.133256
1,gender,Welch's t-test,0.570504,False,-0.046138,-0.09193
2,scholarship,Welch's t-test,0.340511,False,0.040157,0.154775
3,1st_year,Welch's t-test,0.052406,False,0.07758,0.316336
4,gpa,Welch's t-test,0.014175,True,1.20745,0.401159
5,1st_time,Welch's t-test,0.083237,False,0.038961,0.282892
6,taste,Welch's t-test,0.019981,True,0.239747,0.380138
7,importance,Welch's t-test,0.090261,False,0.18216,0.275559
8,expected_grade,Welch's t-test,0.200697,False,0.534689,0.207866
9,knowledge,Welch's t-test,0.269851,False,0.082365,0.179123


In [20]:
# Independent (deep) copy of the loaded dataset; the balance regressions
# further down read from this copy rather than from `X`.
balance_data = X.copy(deep=True)

In [21]:
# Covariates only: a copy of the dataset without the "treated" column.
# Its column list drives the per-covariate regression loops.
X_notreated = X.copy().drop(columns=["treated"])

In [22]:
# Empty list in which the result for each variable will be stored,
# one (variable, value) tuple per covariate.
results = list()

In [23]:
# Balance check: one OLS regression per covariate, covariate ~ const + treated,
# fitted with the HC3 covariance option. For each covariate we keep the
# p-value of the model's overall F-test (`f_pvalue`) -- note this is a
# p-value, not the F statistic itself.
#
# The design matrix is identical for every covariate, so it is built once
# outside the loop. It is deliberately NOT called `X`: reusing that name
# would overwrite the dataset loaded at the top of the notebook, so that
# re-running any earlier cell that reads `X` would silently use the wrong frame.
design = sm.add_constant(balance_data["treated"])  # adds intercept

# Rebuild the list from scratch so that re-running this cell does not append
# a second set of rows to whatever `results` already contained.
results = []
for col in X_notreated.columns:
    y = balance_data[col]
    model = sm.OLS(y, design).fit(cov_type="HC3")
    fval = model.f_pvalue  # p-value of the F-test
    results.append((col, fval))

In [24]:
# Tabulate the (variable, value) tuples and order them from the largest to
# the smallest value; the original row labels are kept.
# NOTE: the column is labelled "F-value", but the loop above stores
# `model.f_pvalue` in it.
balance_test_df = pd.DataFrame(
    results,
    columns=["Variable", "F-value"],
).sort_values("F-value", ascending=False)

In [25]:
# Display the sorted balance-test table as the cell output.
balance_test_df

Unnamed: 0,Variable,F-value
1,gender,0.573031
0,age,0.413999
2,scholarship,0.34357
9,knowledge,0.272979
8,expected_grade,0.203635
7,importance,0.09229
5,1st_time,0.083218
3,1st_year,0.053363
6,taste,0.020761
4,gpa,0.014771


In [14]:
# Print the table as LaTeX source: row labels included, cell text not escaped.
print(balance_test_df.to_latex(index=True, escape=False))

\begin{tabular}{llr}
\toprule
{} &        Variable &   F-value \\
\midrule
1 &          gender &  0.573031 \\
0 &             age &  0.413999 \\
2 &     scholarship &  0.343570 \\
9 &       knowledge &  0.272979 \\
8 &  expected_grade &  0.203635 \\
7 &      importance &  0.092290 \\
5 &        1st_time &  0.083218 \\
3 &        1st_year &  0.053363 \\
6 &           taste &  0.020761 \\
4 &             gpa &  0.014771 \\
\bottomrule
\end{tabular}



  print(balance_test_df.to_latex(


In [32]:
# For gpa, taste and 1st_year, print the pair
# (mean where treated == 1, mean where treated == 0), one pair per line.
print(
    *[
        (
            balance_data.loc[balance_data["treated"] == 1, name].mean(),
            balance_data.loc[balance_data["treated"] == 0, name].mean(),
        )
        for name in ("gpa", "taste", "1st_year")
    ],
    sep="\n",
)

(24.973684210526315, 23.766233766233768)
(3.460526315789474, 3.220779220779221)
(0.9736842105263158, 0.8961038961038961)


In [33]:
# Empty list for the results of the regressions run on the filtered sample,
# one (variable, value) tuple per covariate.
results_nout = list()

In [35]:
# Same balance regressions (covariate ~ const + treated, HC3 covariance
# option) on a sample that excludes every row whose gpa equals 17.
# NOTE(review): 17 is presumably the outlying GPA value being removed --
# confirm, and consider naming it as a constant with a short justification.
balance_data_nout = balance_data[balance_data["gpa"] != 17].copy()

# Built once (it does not depend on the covariate) and deliberately NOT
# called `X`, so the dataset loaded at the top of the notebook is not
# overwritten by this cell.
design_nout = sm.add_constant(balance_data_nout["treated"])  # adds intercept

# Rebuild the list from scratch so that re-running this cell does not append
# a second set of rows to whatever `results_nout` already contained.
results_nout = []
for col in X_notreated.columns:
    y = balance_data_nout[col]
    model = sm.OLS(y, design_nout).fit(cov_type="HC3")
    fval = model.f_pvalue  # p-value of the F-test
    results_nout.append((col, fval))

In [36]:
# Tabulate the filtered-sample results and order them from the largest to
# the smallest value; the original row labels are kept.
# NOTE: the column is labelled "F-value", but the loop above stores
# `model.f_pvalue` in it.
balance_test_df_nout = pd.DataFrame(
    results_nout,
    columns=["Variable", "F-value"],
).sort_values("F-value", ascending=False)

In [38]:
# (rows, columns) of the sample that remains after dropping the gpa == 17 rows.
balance_data_nout.shape

(145, 11)

In [37]:
# Print the filtered-sample table as LaTeX source: row labels included,
# cell text not escaped.
print(balance_test_df_nout.to_latex(index=True, escape=False))

\begin{tabular}{llr}
\toprule
{} &        Variable &   F-value \\
\midrule
1 &          gender &  0.700669 \\
0 &             age &  0.669296 \\
2 &     scholarship &  0.413972 \\
9 &       knowledge &  0.333413 \\
8 &  expected_grade &  0.167890 \\
4 &             gpa &  0.142701 \\
7 &      importance &  0.108964 \\
5 &        1st_time &  0.083114 \\
3 &        1st_year &  0.075599 \\
6 &           taste &  0.019316 \\
\bottomrule
\end{tabular}



  print(balance_test_df_nout.to_latex(
