In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split

In [37]:
### Interpreting Coefficients, Stats Model OLS ###
from statsmodels.api import OLS
grad = pd.read_csv('Feature_Target_Data.csv')
years = [2011, 2012, 2013, 2014]
grad = grad.loc[grad['Year'].isin(years)]
grad = grad[grad.columns[3:]]
X = grad[grad.columns[:-1]]
y = grad[grad.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
olsreg = OLS(y_train, X_train)
olsreg = olsreg.fit()
print(olsreg.summary())

                                  OLS Regression Results                                 
Dep. Variable:     Graduated 4-Year (%)   R-squared (uncentered):                   0.960
Model:                              OLS   Adj. R-squared (uncentered):              0.959
Method:                   Least Squares   F-statistic:                              744.0
Date:                  Fri, 13 Mar 2020   Prob (F-statistic):                   4.51e-250
Time:                          16:28:37   Log-Likelihood:                         -1349.1
No. Observations:                   381   AIC:                                      2722.
Df Residuals:                       369   BIC:                                      2770.
Df Model:                            12                                                  
Covariance Type:              nonrobust                                                  
                                      coef    std err          t      P>|t|      [0.025      0.975]


In [38]:
### Studying Effects of Dropping Certain Features Due to Multicollinearity/Domain Insignificance ###
grad = grad.drop(['AP-Total Exams', 'Enrolled 4-Year', 'Total Graduated', 'AP-11&12 Participating Students', 'SAT-Total', 'Wealth/ADA'], axis=1)
X = grad[grad.columns[:-1]]
y = grad[grad.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
olsreg = OLS(y_train, X_train)
olsreg = olsreg.fit()
print(olsreg.summary())

                                  OLS Regression Results                                 
Dep. Variable:     Graduated 4-Year (%)   R-squared (uncentered):                   0.959
Model:                              OLS   Adj. R-squared (uncentered):              0.958
Method:                   Least Squares   F-statistic:                              1448.
Date:                  Fri, 13 Mar 2020   Prob (F-statistic):                   7.61e-256
Time:                          16:28:41   Log-Likelihood:                         -1357.1
No. Observations:                   381   AIC:                                      2726.
Df Residuals:                       375   BIC:                                      2750.
Df Model:                             6                                                  
Covariance Type:              nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-----

In [41]:
########## Linear Regression Model for Predicting Graduated 4-Year (%) ############
### ALL Features ###
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

grad = pd.read_csv('Feature_Target_Data.csv')
years = [2011, 2012, 2013, 2014]
grad = grad.loc[grad['Year'].isin(years)]
grad = grad[grad.columns[3:]]
X = grad.drop('Graduated 4-Year (%)', axis=1).values
y = grad['Graduated 4-Year (%)'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_test = linear_reg.predict(X_test)
y_pred_train = linear_reg.predict(X_train)
print('---All Features---', '\n')
print('Training RMSE:', np.sqrt(mean_squared_error(y_train, y_pred_train)), '\nTesting RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('\nTraining R2: ', linear_reg.score(X_train, y_train), '\nTesting R2: ', linear_reg.score(X_test, y_test))
print('\nTraining MAPE: ', mean_absolute_percentage_error(y_train, y_pred_train), '\nTesting MAPE: ', mean_absolute_percentage_error(y_test, y_pred_test))

### After Dropping Features Causing Mulicollinearity/Insignificance ###
X = grad.drop(['Graduated 4-Year (%)', 'AP-Total Exams', 'Enrolled 4-Year', 'Total Graduated', 'AP-11&12 Participating Students', 'SAT-Total', 'Wealth/ADA'], axis=1).values
y = grad['Graduated 4-Year (%)'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_test = linear_reg.predict(X_test)
y_pred_train = linear_reg.predict(X_train)
print('\n', '---After Dropping Features---', '\n')
print('Training RMSE:', np.sqrt(mean_squared_error(y_train, y_pred_train)), '\nTesting RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_test)))
print('\nTraining R2: ', linear_reg.score(X_train, y_train), '\nTesting R2: ', linear_reg.score(X_test, y_test))
print('\nTraining MAPE: ', mean_absolute_percentage_error(y_train, y_pred_train), '\nTesting MAPE: ', mean_absolute_percentage_error(y_test, y_pred_test))



---All Features--- 

Training RMSE: 8.039580002590322 
Testing RMSE: 7.270472881050807

Training R2:  0.5845709858118027 
Testing R2:  0.6667216573402157

Training MAPE:  17.648364646323063 
Testing MAPE:  16.839248356026104

 ---After Dropping Features--- 

Training RMSE: 8.340827253346166 
Testing RMSE: 7.012582803482295

Training R2:  0.552855023741984 
Testing R2:  0.6899456848554524

Training MAPE:  18.27036806499571 
Testing MAPE:  15.734016811248416
