## NJSLA Spring 25 ELA Testing Time Linear Regression

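Analysis of the ELA results of the Spring 2025 NJSLA: a linear regression of test scale score on total testing time (in minutes) for the ELA06–ELA09 tests.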

In [1]:
# importing packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import datetime as dt

In [2]:
# importing dataset
# The per-unit online start/end timestamps are parsed as datetimes so that
# testing durations can be computed from them later in the notebook.
unit_timestamp_columns = [
    'Unit1OnlineTestEndDateTime', 'Unit1OnlineTestStartDateTime',
    'Unit2OnlineTestStartDateTime', 'Unit2OnlineTestEndDateTime',
    'Unit3OnlineTestStartDateTime', 'Unit3OnlineTestEndDateTime',
]
df_ela = pd.read_excel(
    r"S:\ASSESSMENTS\NJSLA 2025\NJSLA 2025 ELA Math Summative Record File.xlsx",
    parse_dates = unit_timestamp_columns,
)

# Keep only records whose TestStatus is 'Attempt'.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the columns added to df_ela in later cells do not raise SettingWithCopyWarning.
df_ela = df_ela[df_ela['TestStatus']=='Attempt'].copy()

In [3]:
# function for proficiency
def proficient(row, cut_score = 750):
    """Return a 1/0 proficiency flag for one record.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record exposing a 'TestScaleScore' entry.
    cut_score : int or float, default 750
        Minimum scale score counted as proficient. The default keeps the
        threshold this notebook has always used.

    Returns
    -------
    int or None
        1 when the score is at or above ``cut_score``, 0 when it is below,
        and None when the score compares as neither (e.g. a missing/NaN score).
    """
    score = row['TestScaleScore']
    if score >= cut_score:
        return 1
    if score < cut_score:
        return 0
    # NaN fails both comparisons; make the "no flag" outcome explicit
    return None

# Evaluate the proficiency helper once per record (row-wise) and store the flag.
df_ela['proficient'] = df_ela.apply(func = proficient, axis = 'columns')

In [4]:
# Testing time
# Elapsed minutes per unit = (unit end timestamp - unit start timestamp),
# converted from a timedelta to seconds and then to minutes.
for unit in (1, 2):
    unit_start = df_ela[f'Unit{unit}OnlineTestStartDateTime']
    unit_end = df_ela[f'Unit{unit}OnlineTestEndDateTime']
    df_ela[f'Unit {unit} Testing Time'] = (unit_end - unit_start).dt.total_seconds()/60

# Total testing time: Unit 1 minutes plus Unit 2 minutes
# (a record missing either unit time gets a missing total).
df_ela['Total_Test_Time'] = df_ela['Unit 1 Testing Time'].add(df_ela['Unit 2 Testing Time'])

In [5]:
# Subject label interpolated into the printed summaries and plot titles below.
subject = 'ELA'

In [6]:
# Restrict the analysis to the ELA06-ELA09 test codes.
# To widen the analysis to ELA04-ELA09, add 'ELA04' and 'ELA05' to the list.
# NOTE(review): the original comment called these "middle school grades", yet ELA09 is included - confirm intent.
selected_test_codes = ['ELA06', 'ELA07', 'ELA08', 'ELA09']
df_ela_6_9 = df_ela.loc[df_ela['TestCode'].isin(selected_test_codes)]

In [None]:
# Outlier fences for total testing time, using the 1.5 x IQR rule.
q3 = df_ela_6_9['Total_Test_Time'].quantile(.75) #--> 75th percentile
q1 = df_ela_6_9['Total_Test_Time'].quantile(.25) #--> 25th percentile
iqr = q3-q1 # interquartile range

# fences: values at or beyond these bounds are treated as outliers
outlier_upper = q3 + (1.5 * iqr)
outlier_lower = q1 - (1.5 * iqr)

# build the outlier mask once instead of repeating the condition inside the f-string
is_time_outlier = (df_ela_6_9['Total_Test_Time']<=outlier_lower)|(df_ela_6_9['Total_Test_Time']>=outlier_upper)

# The bounds describe Total Test Time in minutes, not a Test Scale Score.
print(f"An outlier for the NJSLA {subject} is a Total Test Time greater than or equal to {round(outlier_upper,2)} minutes,",
      f"or a Total Test Time less than or equal to {round(outlier_lower,2)} minutes.",
      f"There are {int(is_time_outlier.sum())} outliers in the dataset.")

An outlier in the for the NJSLA ELA is a Test Scale Score greater than or equal to 235.52 minutes ,or a Test Scale Score less than or equal to 3.52 minutes. There are 6 outliers in the dataset.


In [None]:
# Show the records whose total testing time sits at or beyond either fence.
# NOTE(review): this output includes student first/last names - clear or redact it before sharing the notebook.
outlier_display_columns = ['FirstName','LastOrSurname','TestingSchoolCode','GradeLevelWhenAssessed','StudentWithDisabilities','Total_Test_Time']
time_at_or_beyond_fence = (df_ela_6_9['Total_Test_Time']<=outlier_lower)|(df_ela_6_9['Total_Test_Time']>=outlier_upper)
df_ela_6_9.loc[time_at_or_beyond_fence, outlier_display_columns]

|Testing School Code|School Name|
|---|---|
|060|Benjamin Franklin Middle School|
|070|George Washington Middle School|

In [None]:
# Drop the outliers: keep only rows whose total testing time lies strictly between the two fences.
time_within_fences = df_ela_6_9['Total_Test_Time'].gt(outlier_lower) & df_ela_6_9['Total_Test_Time'].lt(outlier_upper)
df_ela_6_9 = df_ela_6_9.loc[time_within_fences]

In [None]:
# Summary statistics for total testing time (minutes) on the outlier-filtered frame.
df_ela_6_9['Total_Test_Time'].describe()

In [None]:
# statistical metrics (rounded to 2 decimals) used as reference lines on the histogram
mean = round(df_ela_6_9['Total_Test_Time'].mean(),2)
median = round(df_ela_6_9['Total_Test_Time'].median(),2)

# plotting the Total Test Time distribution: box plot (left) and histogram (right)
fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (20,6))

df_ela_6_9['Total_Test_Time'].plot(kind = 'box', vert = False, title = f'{subject} Total Test Time Box Plot', ax = ax[0])
df_ela_6_9['Total_Test_Time'].plot(kind = 'hist', title = f'{subject} Total Test Time Histogram', ax = ax[1], alpha = .5)
ax[1].set_xlabel('Total Test Time (minutes)')

# dashed red = mean, solid green = median
ax[1].axvline(x = mean, linestyle = '--', color = 'red')
ax[1].axvline(x = median, linestyle= '-', color = 'green')

# attach the legend to the histogram axes explicitly rather than to pyplot's "current" axes
ax[1].legend(['Frequency','Mean','Median'])

plt.show()

In [None]:
# Median total testing time overall and split by the proficiency flag.
# Each median is computed once here instead of being recomputed inside the f-strings.
median_time_all = df_ela_6_9['Total_Test_Time'].median()
median_time_proficient = df_ela_6_9[df_ela_6_9['proficient']==1]['Total_Test_Time'].median()
median_time_not_proficient = df_ela_6_9[df_ela_6_9['proficient']==0]['Total_Test_Time'].median()
# the gap is taken between the unrounded medians and rounded only for display
median_time_gap = median_time_proficient - median_time_not_proficient

print(f"The typical test time for the NJSLA {subject} is {round(median_time_all,2)} minutes.",
     f"\nThe typical testing time for students who were proficient in {subject} on the NJSLA is {round(median_time_proficient,2)} minutes.",
     f"\nThe typical testing time for students who were not proficient in {subject} on the NJSLA is {round(median_time_not_proficient,2)} minutes.",
     f"\nThat is a difference of {round(median_time_gap,2)} additional minutes spent testing for students who were proficient in {subject} on the NJSLA.")

In [None]:
# Simple linear regression specification:
# TestScaleScore is the response, Total_Test_Time is the single predictor.
ols_formula = "TestScaleScore ~ Total_Test_Time"

# Build the (not yet fitted) OLS model from the formula and the outlier-filtered frame.
OLS = ols(data = df_ela_6_9, formula = ols_formula)

# Estimate the coefficients.
model = OLS.fit()

# Keep the summary table and show it as the cell's output.
model_results = model.summary()
model_results

In [None]:
# Every figure and qualitative statement below is read from the data or the fitted
# model, so the narrative cannot drift from the numbers when the dataset changes.
time_score_corr = df_ela_6_9['Total_Test_Time'].corr(df_ela_6_9['TestScaleScore'])
slope = model.params['Total_Test_Time']          # fitted coefficient on testing time
slope_pvalue = model.pvalues['Total_Test_Time']  # p-value of that coefficient
significance_level = 0.05

# Rule-of-thumb wording for correlation strength (|r| < 0.3 weak, < 0.7 moderate, else strong).
# NOTE(review): these cut-offs are a common convention, not a standard - adjust if the team uses others.
if abs(time_score_corr) < 0.3:
    corr_strength = 'weak'
elif abs(time_score_corr) < 0.7:
    corr_strength = 'moderate'
else:
    corr_strength = 'strong'
corr_direction = 'positive' if time_score_corr >= 0 else 'negative'

# State significance only when the model's p-value actually supports it.
if slope_pvalue < significance_level:
    significance_note = (f"The p-value ({slope_pvalue:.3g}) is less than {significance_level}, so the null hypothesis is rejected: "
                         "there is a statistically significant relationship between the independent and dependent variables.")
else:
    significance_note = (f"The p-value ({slope_pvalue:.3g}) is not less than {significance_level}, so the null hypothesis cannot be rejected: "
                         "the relationship between the independent and dependent variables is not statistically significant.")

print(f"The correlation coefficient between testing time and scale score is {time_score_corr:.2}, indicative of a {corr_strength} {corr_direction} correlation.",
     f"\n\n{significance_note}",
     f"\n\nThe r-squared value is {model.rsquared:.3}, meaning that the independent variable accounts for {model.rsquared:.1%} of the variance in the dependent variable.",
     f"\n\nLastly, based on the model, for every additional minute spent testing there is an approximate change of {slope:+.2f} in the Test Scale Score.")

### Linear Regression Assumptions:
1. **Linearity** - Each predictor variable is linearly related to the outcome variable.
2. **Normality** - The errors are normally distributed.
3. **Independent Observations** - Each observation in the dataset is independent.
4. **Homoscedasticity** - The variance of the errors is constant or similar across the model.

In [None]:
# Normality check: Q-Q plot of the fitted model's residuals.
residuals = model.resid
# line = 's' draws the reference line the points are compared against
fig = sm.qqplot(data = residuals, line = 's')
plt.title(label = 'Model Residual Distributions - QQ Plot')
plt.show()

In [None]:
# Homoscedasticity check: residuals plotted against the model's predicted scores.
fitted_values = model.predict(df_ela_6_9['Total_Test_Time'])

# sns.scatterplot returns the Axes it drew on; the notebook keeps it under the name `fig`
fig = sns.scatterplot(x = fitted_values, y = residuals)
fig.axhline(0, color = 'red')  # zero-residual reference line
fig.set(xlabel = 'fitted_values', ylabel = 'residuals', title = 'Homoscedasticity Scatterplot')
plt.show()

In [None]:
# Prediction frame: testing time (sorted ascending), the model's predicted
# scale score for that time, and the fitted model's residual for the same row.
df_ela_pred = (
    df_ela_6_9[['Total_Test_Time']]
    .sort_values(by = 'Total_Test_Time')
    .assign(**{
        # predictions are computed on the already-sorted frame
        'Predicted Scale Score': lambda frame: model.predict(frame['Total_Test_Time']),
        # residuals are matched to rows by index label, not by position
        'Residuals': model.resid,
    })
)

# residual statistics
df_ela_pred['Residuals'].describe()

In [None]:
# Observed scale scores with the seaborn regression fit, overlaid with the
# model's own predictions (red) from df_ela_pred.
# An explicit Axes is created and passed to both seaborn calls so the plot does
# not depend on pyplot's implicit "current axes" state.
fig, reg_ax = plt.subplots(figsize = (15,6))

sns.regplot(x = 'Total_Test_Time', y = 'TestScaleScore', data = df_ela_6_9, ax = reg_ax)
reg_ax.set_title('Testing Time and Test Scale Score Scatter Linear Regression')
reg_ax.set_xlabel('Testing Time')
reg_ax.set_ylabel('Test Scale Score')

sns.scatterplot(data = df_ela_pred, x = 'Total_Test_Time', y = 'Predicted Scale Score', color = 'red', ax = reg_ax)

# labels follow the order in which the artists were drawn on the axes
reg_ax.legend(['Scale Score','Line of Best Fit','Confidence Band','Scale Score Predictions'])

# a single show() is enough; the second call in the original had nothing left to display
plt.show()