In [1]:
# import packages
import pandas as pd
import os
import glob
import random
import numpy as np
import statistics
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import matplotlib.patches as mpatches

import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import norm

## Data Prep

In [2]:
# Folder that holds the exported growth CSV. The default is the original local
# directory; set the ARCH_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get('ARCH_DATA_DIR') or r'/Users/spencersu/Desktop/Efficacy Studies/Archdiocese'
df = pd.read_csv(os.path.join(path, 'Arch 2020-22 growth.csv'))


## clean the columns

# Discard the first column of the file (presumably a saved row index -- TODO confirm).
df = df.iloc[:,1:]
# Swap the export's display headers for the short snake_case names used by the
# aggregation and regression cells.
df = df.rename(columns = {'Student Id [Sensitive]':'student_id', 'Full Name [Sensitive]':'student_name',
                         'School Year':'school_year','School Name':'school_name','Class Grade Level':'grade',
                         'Teacher Name [Sensitive]':'teacher_name','Lesson Type':'lesson_type',
                         'Multiple Choice Score':'reading_score','Word Count In Final Writing':'word_count',
                         'Number of Graded Writing Lessons':'num_graded_writing','Number of Lessons':'num_lesson'})
# errors='coerce' turns any non-numeric word count into NaN instead of raising.
df['word_count'] = pd.to_numeric(df['word_count'],errors='coerce')

In [3]:
# Index levels shared by both sides of each merge below.
group_keys = ['student_id','student_name','school_year','school_name','grade','teacher_name']
# Lesson counters that are totalled per group.
count_cols = ['num_lesson','num_graded_writing']

## word count
# Only rows that actually carry a word count feed the word-count frame.
has_word_count = df[df['word_count'].notna()]
# Columns are selected with a list: tuple-style selection on a groupby
# (['a','b'] without the inner brackets) raises on pandas >= 2.0.
wc_left = has_word_count.groupby(group_keys)[count_cols].sum()
# Mean word count per group and lesson type.
wc_right = has_word_count.groupby(group_keys + ['lesson_type'])['word_count'].mean()

# Inner merge on the six shared index levels, so each group's lesson totals are
# repeated on every lesson_type row coming from the right side.
# NOTE(review): lesson_type is not a join key and does not appear to survive the
# merge as a column -- confirm that one anonymous row per lesson type is intended.
wc_reg = pd.merge(wc_left, wc_right, on=group_keys, how='inner')
wc_reg = wc_reg.reset_index()


## Reading score
# Same construction as above, restricted to rows that carry a reading score.
has_reading_score = df[df['reading_score'].notna()]
rd_left = has_reading_score.groupby(group_keys)[count_cols].sum()

rd_right = has_reading_score.groupby(group_keys + ['lesson_type'])['reading_score'].mean()

rd_reg = pd.merge(rd_left, rd_right, on=group_keys, how='inner')
rd_reg = rd_reg.reset_index()

In [7]:
# Keep only the 2021-2022 school year in both regression frames.
wc_reg = wc_reg.loc[wc_reg['school_year'].eq('2021-2022')]
rd_reg = rd_reg.loc[rd_reg['school_year'].eq('2021-2022')]

In [8]:
# OLS of word_count on num_lesson, with grade and school_name entered as
# categorical terms via C(...).
results = smf.ols(
    formula='word_count ~ num_lesson + C(grade) + C(school_name)',
    data=wc_reg,
).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             word_count   R-squared:                       0.372
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     45.18
Date:                Sun, 03 Jul 2022   Prob (F-statistic):           1.35e-97
Time:                        18:33:08   Log-Likelihood:                -6672.2
No. Observations:                1081   AIC:                         1.337e+04
Df Residuals:                    1066   BIC:                         1.345e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [9]:
# OLS of word_count on num_graded_writing, with grade and school_name entered
# as categorical terms via C(...). Overwrites the previous `results`.
results = smf.ols(
    formula='word_count ~ num_graded_writing + C(grade) + C(school_name)',
    data=wc_reg,
).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             word_count   R-squared:                       0.372
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     45.18
Date:                Sun, 03 Jul 2022   Prob (F-statistic):           1.35e-97
Time:                        18:33:14   Log-Likelihood:                -6672.2
No. Observations:                1081   AIC:                         1.337e+04
Df Residuals:                    1066   BIC:                         1.345e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [10]:
# OLS of reading_score on num_lesson, with grade and school_name entered as
# categorical terms via C(...). Overwrites the previous `results`.
results = smf.ols(
    formula='reading_score ~ num_lesson + C(grade) + C(school_name)',
    data=rd_reg,
).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          reading_score   R-squared:                       0.166
Model:                            OLS   Adj. R-squared:                  0.158
Method:                 Least Squares   F-statistic:                     19.64
Date:                Sun, 03 Jul 2022   Prob (F-statistic):           9.78e-49
Time:                        18:33:17   Log-Likelihood:                -6715.5
No. Observations:                1496   AIC:                         1.346e+04
Df Residuals:                    1480   BIC:                         1.355e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------