In [13]:
import pandas as pd
import numpy as np
from scipy import stats

In [14]:
# Load the dataset from disk; the first CSV column becomes the row index.
X = pd.read_csv("indep_x.csv", index_col=0)

In [15]:
# Note: this code was generated by Gemini from a prompt; it is not our original work.

def check_balance(data, treatment_col, covariates, significance_level=0.05):
    """
    Checks balance between treatment and control groups for given covariates.

    Numeric covariates are compared with Welch's t-test and also get a raw
    and a standardized mean difference (treatment minus control, scaled by
    the square root of the average of the two group variances). All other
    covariates are compared with a chi-squared test of independence against
    the treatment column, and their difference columns are left empty.

    Missing values in a numeric covariate are dropped within each group
    before testing, so the p-value, the difference and the standardized
    difference are all computed on the same observations. (pandas' mean/var
    already skip NaN, whereas scipy's t-test would otherwise return NaN.)

    Args:
        data (pd.DataFrame): The dataset.
        treatment_col (str): The name of the treatment column (binary: 0 or 1).
        covariates (list): A list of covariate names.
        significance_level (float): The significance level for balance tests.

    Returns:
        pd.DataFrame: One row per covariate with the columns 'Covariate',
            'Test', 'p-value', 'Significant', 'Difference' and
            'Standardized Difference'.
    """
    # The group masks do not depend on the covariate, so build them once.
    is_treated = data[treatment_col] == 1
    is_control = data[treatment_col] == 0

    results = []
    for covariate in covariates:
        if pd.api.types.is_numeric_dtype(data[covariate]):
            # Numeric covariate: Welch's t-test on the non-missing values.
            treatment_group = data.loc[is_treated, covariate].dropna()
            control_group = data.loc[is_control, covariate].dropna()

            _, p_value = stats.ttest_ind(treatment_group, control_group, equal_var=False)
            test_type = "Welch's t-test"
            difference = treatment_group.mean() - control_group.mean()
            # Standardize by the root of the mean of the two sample variances.
            pooled_sd = np.sqrt(0.5 * (treatment_group.var() + control_group.var()))
            std_diff = difference / pooled_sd
        else:
            # Categorical covariate: chi-squared test on the level-by-treatment table.
            contingency_table = pd.crosstab(data[covariate], data[treatment_col])
            _, p_value, _, _ = stats.chi2_contingency(contingency_table)
            test_type = "Chi-squared test"
            # Mean differences are not defined for categorical covariates.
            difference = None
            std_diff = None

        results.append({
            'Covariate': covariate,
            'Test': test_type,
            'p-value': p_value,
            'Significant': p_value < significance_level,
            'Difference': difference,
            'Standardized Difference': std_diff,
        })

    return pd.DataFrame(results)

In [11]:
# Preview the first five rows of the loaded data.
X.head(n=5)

Unnamed: 0,age,gender,scholarship,1st_year,gpa,1st_time,taste,importance,expected_grade,knowledge,treated
1,19,0,1,1,23,1,4,3,22,0,0
2,19,1,0,1,25,1,3,4,21,0,1
3,19,0,0,0,23,1,3,3,19,0,0
4,19,1,0,1,25,1,3,3,19,1,1
5,23,1,0,1,20,1,3,3,22,0,0


In [12]:
# Run the balance check on every column except the treatment indicator.
check_balance(X, "treated", covariates=X.columns.drop("treated"))

Unnamed: 0,Covariate,Test,p-value,Significant,Difference,Standardized Difference
0,age,Welch's t-test,0.410959,False,-0.159262,-0.133256
1,gender,Welch's t-test,0.570504,False,-0.046138,-0.09193
2,scholarship,Welch's t-test,0.340511,False,0.040157,0.154775
3,1st_year,Welch's t-test,0.052406,False,0.07758,0.316336
4,gpa,Welch's t-test,0.799217,False,0.118934,0.04118
5,1st_time,Welch's t-test,0.083237,False,0.038961,0.282892
6,taste,Welch's t-test,0.019981,True,0.239747,0.380138
7,importance,Welch's t-test,0.090261,False,0.18216,0.275559
8,expected_grade,Welch's t-test,0.200697,False,0.534689,0.207866
9,knowledge,Welch's t-test,0.269851,False,0.082365,0.179123
