In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split

In [37]:
### Interpreting Coefficients, Stats Model OLS ###
from statsmodels.api import OLS
grad = pd.read_csv('Feature_Target_Data.csv')
years = [2011, 2012, 2013, 2014]
grad = grad.loc[grad['Year'].isin(years)]
grad = grad[grad.columns[3:]]
X = grad[grad.columns[:-1]]
y = grad[grad.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
olsreg = OLS(y_train, X_train)
olsreg = olsreg.fit()
print(olsreg.summary())

                                  OLS Regression Results                                 
Dep. Variable:     Graduated 4-Year (%)   R-squared (uncentered):                   0.960
Model:                              OLS   Adj. R-squared (uncentered):              0.959
Method:                   Least Squares   F-statistic:                              744.0
Date:                  Fri, 13 Mar 2020   Prob (F-statistic):                   4.51e-250
Time:                          16:28:37   Log-Likelihood:                         -1349.1
No. Observations:                   381   AIC:                                      2722.
Df Residuals:                       369   BIC:                                      2770.
Df Model:                            12                                                  
Covariance Type:              nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]


In [38]:
### Studying Effects of Dropping Certain Features Due to Multicollinearity/Domain Insignificance ###
grad = grad.drop(['AP-Total Exams', 'Enrolled 4-Year', 'Total Graduated', 'AP-11&12 Participating Students', 'SAT-Total', 'Wealth/ADA'], axis=1)
X = grad[grad.columns[:-1]]
y = grad[grad.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
olsreg = OLS(y_train, X_train)
olsreg = olsreg.fit()
print(olsreg.summary())

                                  OLS Regression Results                                 
Dep. Variable:     Graduated 4-Year (%)   R-squared (uncentered):                   0.959
Model:                              OLS   Adj. R-squared (uncentered):              0.958
Method:                   Least Squares   F-statistic:                              1448.
Date:                  Fri, 13 Mar 2020   Prob (F-statistic):                   7.61e-256
Time:                          16:28:41   Log-Likelihood:                         -1357.1
No. Observations:                   381   AIC:                                      2726.
Df Residuals:                       375   BIC:                                      2750.
Df Model:                             6                                                  
Covariance Type:              nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-----

In [9]:
########## Linear Regression Model for Predicting Graduated 4-Year (%) ############
### ALL Features ###
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

for test_size in [.325, .3, .275, .25, .225]:
    print('\ntest_size: ', test_size)
    grad = pd.read_csv('Feature_Target_Data.csv')
    years = [2011, 2012, 2013, 2014]
    grad = grad.loc[grad['Year'].isin(years)]
    grad = grad[grad.columns[3:]]
    X = grad.drop('Graduated 4-Year (%)', axis=1).values
    y = grad['Graduated 4-Year (%)'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)
    y_pred_test = linear_reg.predict(X_test)
    y_pred_train = linear_reg.predict(X_train)
    print('---All Features---', '\n')
    print('Training RMSE:', np.sqrt(mean_squared_error(y_train, y_pred_train)), 'Testing RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_test)), 'Difference:', abs(np.sqrt(mean_squared_error(y_train, y_pred_train)) - np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('\nTraining R2: ', linear_reg.score(X_train, y_train), 'Testing R2: ', linear_reg.score(X_test, y_test), 'Difference:', abs(linear_reg.score(X_train, y_train) - linear_reg.score(X_test, y_test)))
    print('\nTraining MAPE: ', mean_absolute_percentage_error(y_train, y_pred_train), 'Testing MAPE: ', mean_absolute_percentage_error(y_test, y_pred_test), 'Difference:', abs(mean_absolute_percentage_error(y_train, y_pred_train) - mean_absolute_percentage_error(y_test, y_pred_test)))

### After Dropping Features Causing Mulicollinearity/Insignificance ###
    X = grad.drop(['Graduated 4-Year (%)', 'AP-Total Exams', 'Enrolled 4-Year', 'Total Graduated', 'AP-11&12 Participating Students', 'SAT-Total', 'Wealth/ADA'], axis=1).values
    y = grad['Graduated 4-Year (%)'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)
    y_pred_test = linear_reg.predict(X_test)
    y_pred_train = linear_reg.predict(X_train)
    print('\n', '---After Dropping Features---', '\n')
    print('Training RMSE:', np.sqrt(mean_squared_error(y_train, y_pred_train)), 'Testing RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_test)), 'Difference:', abs(np.sqrt(mean_squared_error(y_train, y_pred_train)) - np.sqrt(mean_squared_error(y_test, y_pred_test))))
    print('\nTraining R2: ', linear_reg.score(X_train, y_train), 'Testing R2: ', linear_reg.score(X_test, y_test), 'Difference:', abs(linear_reg.score(X_train, y_train) - linear_reg.score(X_test, y_test)))
    print('\nTraining MAPE: ', mean_absolute_percentage_error(y_train, y_pred_train), 'Testing MAPE: ', mean_absolute_percentage_error(y_test, y_pred_test), 'Difference:', abs(mean_absolute_percentage_error(y_train, y_pred_train) - mean_absolute_percentage_error(y_test, y_pred_test)))




test_size:  0.325
---All Features--- 

Training RMSE: 8.22427289729042 Testing RMSE: 7.076394216935626 Difference: 1.1478786803547942

Training R2:  0.5613787027319719 Testing R2:  0.6865052072040615 Difference: 0.12512650447208962

Training MAPE:  18.191129263242825 Testing MAPE:  15.905683997719485 Difference: 2.2854452655233395

 ---After Dropping Features--- 

Training RMSE: 8.510824902215692 Testing RMSE: 6.964779072266455 Difference: 1.5460458299492368

Training R2:  0.5302811368951164 Testing R2:  0.6963166487196456 Difference: 0.16603551182452925

Training MAPE:  18.619150250025406 Testing MAPE:  15.643188970542155 Difference: 2.9759612794832506

test_size:  0.3
---All Features--- 

Training RMSE: 8.125371609673968 Testing RMSE: 7.211472957627714 Difference: 0.913898652046254

Training R2:  0.5759426378754136 Testing R2:  0.6694028154431887 Difference: 0.09346017756777514

Training MAPE:  17.902627557682482 Testing MAPE:  16.289719551681827 Difference: 1.612908006000655

 ---A