In [54]:
#Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("fivethirtyeight")
import seaborn as sns
import statsmodels.formula.api as sm
import statsmodels.api as sm_api #array-based estimators (OLS) live here; the formula namespace above does not export them in current statsmodels releases
pd.set_option('display.float_format', lambda x: '%.3f' % x) #suppresses scientific notation

In [40]:
#Preparing dataset
all_data = pd.read_csv("AttyFeeStudy2007.csv")
#Explicit copy: `data` is edited below, and a bare column selection leaves pandas unsure
#whether those writes are meant for `all_data` (SettingWithCopyWarning).
data = all_data[["AssetsScheduled", "LiabScheduled", "AssetLiab", "XnumEmplBefore", 
                           "Sales", "PlanClasses", "XShop", "XDaysIn", "DENYOther", "XPrepackaged",
                           "FeeCommi", "Role", "FeeExpOrd"]].copy()
#NOTE(review): built before the Sales override and the row removals below, so it does not
#reflect them; it is not referenced again in the cells visible here.
data_without_plan_classes = data.drop("PlanClasses", axis = 1)
#Manual override of the Sales value for row label 23
#NOTE(review): the source of this figure is not recorded in the notebook -- confirm.
data.at[23, "Sales"] =  100789000000
#Remove the rows labelled 63 and 83
#NOTE(review): the reason for excluding these two rows is not recorded here -- confirm.
data = data.drop([63, 83])

In [41]:
#Creating dataframes of independent variables (X) and dependent variable (y)
#X is every column except the target, selected by name rather than by position.
#The explicit copy makes X an independent frame, so the encoded columns added to it
#later are not writes into a slice of `data` (avoids SettingWithCopyWarning).
X = data.drop(columns = ["FeeExpOrd"]).copy()
y = data["FeeExpOrd"]

In [42]:
#Encoding categorical variables

#Encoding categorical data: XShop
def encode_XShop(array):
    """Turn raw XShop entries into a 0/1 indicator array.

    An entry equal to the exact string "Yes" maps to 1; anything else
    (other spellings, missing values, ...) maps to 0.

    array: iterable of raw XShop values.
    Returns a numpy array with one flag per entry, in input order.
    """
    return np.array([1 if answer == "Yes" else 0 for answer in array])

#The XShop indicator is kept as a standalone array and also stored as a new column of X
X["Encoded_XShop"] = encoded_XShop = encode_XShop(X["XShop"])

#Encoding categorical data: DENYOther, Other will be excluded (default)
def encode_DENYOther(array):
    """Build two 0/1 indicator arrays from DENYOther entries.

    "DE" sets the first indicator, "NY" sets the second; any other entry
    leaves both at 0, which makes it the implicit baseline category.

    array: iterable of raw DENYOther values.
    Returns a pair (DE indicator, NY indicator) of numpy arrays, each with
    one flag per entry, in input order.
    """
    #Single pass: each entry becomes a (DE flag, NY flag) pair
    flag_pairs = [
        (1, 0) if venue == "DE" else (0, 1) if venue == "NY" else (0, 0)
        for venue in array
    ]
    de_flags = [pair[0] for pair in flag_pairs]
    ny_flags = [pair[1] for pair in flag_pairs]
    return np.array(de_flags), np.array(ny_flags)

#The encoder returns (DE indicator, NY indicator); unpack both from one call instead of
#encoding the whole column twice and discarding half of each result
encoded_DE, encoded_NY = encode_DENYOther(X["DENYOther"])
X["Encoded_DE"] = encoded_DE
X["Encoded_NY"] = encoded_NY

#Encoding categorical data: XPrepackaged, no will be excluded (default)
def encode_XPrepackaged(array):
    """Build two 0/1 indicator arrays from XPrepackaged entries.

    "yes" sets the first indicator, "prenegotiated" sets the second; any
    other entry leaves both at 0, which makes it the implicit baseline.

    array: iterable of raw XPrepackaged values.
    Returns a pair (yes indicator, prenegotiated indicator) of numpy arrays,
    each with one flag per entry, in input order.
    """
    #Single pass: each entry becomes a (yes flag, prenegotiated flag) pair
    flag_pairs = [
        (1, 0) if status == "yes" else (0, 1) if status == "prenegotiated" else (0, 0)
        for status in array
    ]
    yes_flags = [pair[0] for pair in flag_pairs]
    preneg_flags = [pair[1] for pair in flag_pairs]
    return np.array(yes_flags), np.array(preneg_flags)

#The encoder returns (yes indicator, prenegotiated indicator); unpack both from one call
#instead of encoding the whole column twice and discarding half of each result
encoded_yes, encoded_preneg = encode_XPrepackaged(X["XPrepackaged"])
X["Encoded_Prepackaged_yes"] = encoded_yes
X["Encoded_Prepackaged_prenegotiated"] = encoded_preneg

#Encoding categorical data: FeeCommi, no will be excluded (default)
def encode_FeeCommi(array):
    """Turn raw FeeCommi entries into a 0/1 indicator array.

    An entry equal to the exact string "yes" maps to 1; anything else maps
    to 0, which makes it the implicit baseline category.

    array: iterable of raw FeeCommi values.
    Returns a numpy array with one flag per entry, in input order.
    """
    return np.array([1 if answer == "yes" else 0 for answer in array])
#The FeeCommi indicator is kept as a standalone array and also stored as a new column of X
X["Encoded_FeeCommi"] = encoded_FeeCommi = encode_FeeCommi(X["FeeCommi"])

#The raw categorical columns are now represented by their indicators: remove them,
#then cast every remaining column to float
X = X.drop(columns = ["XShop", "DENYOther", "XPrepackaged", "FeeCommi"]).astype(float)

In [55]:
#Analyzing collinearity among independent variables: Something we want to avoid.
from statsmodels.stats.outliers_influence import variance_inflation_factor
#One VIF per column of X, listed next to the column name
#NOTE(review): X holds no intercept column at this point and is handed to the VIF helper
#as-is -- confirm that VIFs computed without a constant term are what is intended.
VIFs = pd.DataFrame({
    "VIF": [variance_inflation_factor(X.values, col_idx) for col_idx in range(len(X.columns))],
    "Variables": X.columns,
})
VIFs

Unnamed: 0,VIF,Variables
0,37237219258.335,AssetsScheduled
1,18069256773.045,LiabScheduled
2,102482640286.051,AssetLiab
3,1.863,XnumEmplBefore
4,6.742,Sales
5,5.027,PlanClasses
6,4.96,XDaysIn
7,7.623,Role
8,4.971,Encoded_XShop
9,3.085,Encoded_DE


In [63]:
#Retrying VIF analysis by removing LiabScheduled and AssetLiab

X2 = X.drop(columns = ["LiabScheduled", "AssetLiab"])
#One VIF per remaining column, listed next to the column name
VIFs2 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X2.values, col_idx) for col_idx in range(len(X2.columns))],
    "Variables": X2.columns,
})
VIFs2

Unnamed: 0,VIF,Variables
0,2.592,AssetsScheduled
1,1.796,XnumEmplBefore
2,5.581,Sales
3,4.266,PlanClasses
4,4.553,XDaysIn
5,7.473,Role
6,4.846,Encoded_XShop
7,3.061,Encoded_DE
8,2.088,Encoded_NY
9,1.116,Encoded_Prepackaged_yes


In [64]:
#Retrying VIF analysis by removing PlanClasses

X3 = X2.drop(columns = ["PlanClasses"])
#One VIF per remaining column, listed next to the column name
VIFs3 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X3.values, col_idx) for col_idx in range(len(X3.columns))],
    "Variables": X3.columns,
})
VIFs3

Unnamed: 0,VIF,Variables
0,2.5,AssetsScheduled
1,1.643,XnumEmplBefore
2,2.264,Sales
3,4.195,XDaysIn
4,7.404,Role
5,4.735,Encoded_XShop
6,3.054,Encoded_DE
7,2.087,Encoded_NY
8,1.103,Encoded_Prepackaged_yes
9,1.593,Encoded_Prepackaged_prenegotiated


In [65]:
#Retrying VIF analysis by adding PlanClasses back and removing Role
#(X4 starts again from X2, which still contains PlanClasses)

X4 = X2.drop(columns = ["Role"])
#One VIF per remaining column, listed next to the column name
VIFs4 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X4.values, col_idx) for col_idx in range(len(X4.columns))],
    "Variables": X4.columns,
})
VIFs4

Unnamed: 0,VIF,Variables
0,2.175,AssetsScheduled
1,1.796,XnumEmplBefore
2,5.494,Sales
3,4.227,PlanClasses
4,2.876,XDaysIn
5,4.454,Encoded_XShop
6,3.029,Encoded_DE
7,2.087,Encoded_NY
8,1.116,Encoded_Prepackaged_yes
9,1.661,Encoded_Prepackaged_prenegotiated


In [66]:
#Retrying VIF analysis by removing PlanClasses again (Role is already gone in X4)

X5 = X4.drop(columns = ["PlanClasses"])
#One VIF per remaining column, listed next to the column name
VIFs5 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X5.values, col_idx) for col_idx in range(len(X5.columns))],
    "Variables": X5.columns,
})
VIFs5

Unnamed: 0,VIF,Variables
0,2.116,AssetsScheduled
1,1.642,XnumEmplBefore
2,2.041,Sales
3,2.35,XDaysIn
4,4.379,Encoded_XShop
5,3.025,Encoded_DE
6,2.086,Encoded_NY
7,1.103,Encoded_Prepackaged_yes
8,1.564,Encoded_Prepackaged_prenegotiated
9,2.051,Encoded_FeeCommi


In [77]:
#Adding column of ones for constant
#(a scalar float is broadcast to every row, giving the same float column of 1.0 values)

X5["Ones"] = 1.0

In [79]:
#Testing with multivariate regression
#Fit on every column of X5, including the "Ones" constant column.
#OLS is taken from statsmodels.api (sm_api): the formula namespace bound to `sm`
#does not export the array-based OLS class in current statsmodels releases.

regressor_OLS = sm_api.OLS(endog = y, exog = X5).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.847
Model:,OLS,Adj. R-squared:,0.83
Method:,Least Squares,F-statistic:,49.29
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,5.46e-32
Time:,11:22:05,Log-Likelihood:,-1909.6
No. Observations:,100,AIC:,3841.0
Df Residuals:,89,BIC:,3870.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4002.2864,479.949,8.339,0.000,3048.638,4955.935
XnumEmplBefore,-400.9096,197.771,-2.027,0.046,-793.877,-7.942
Sales,0.0049,0.001,8.363,0.000,0.004,0.006
XDaysIn,9.96e+04,1.79e+04,5.577,0.000,6.41e+04,1.35e+05
Encoded_XShop,6.901e+06,1.35e+07,0.512,0.610,-1.99e+07,3.37e+07
Encoded_DE,-7.676e+06,1.55e+07,-0.496,0.621,-3.85e+07,2.31e+07
Encoded_NY,-4.506e+06,1.36e+07,-0.332,0.741,-3.15e+07,2.25e+07
Encoded_Prepackaged_yes,2.56e+07,2.78e+07,0.920,0.360,-2.97e+07,8.09e+07
Encoded_Prepackaged_prenegotiated,3.416e+06,1.3e+07,0.263,0.793,-2.23e+07,2.92e+07

0,1,2,3
Omnibus:,46.746,Durbin-Watson:,2.108
Prob(Omnibus):,0.0,Jarque-Bera (JB):,379.388
Skew:,1.181,Prob(JB):,4.14e-83
Kurtosis:,12.245,Cond. No.,70400000000.0


In [80]:
#Backward elimination: Encoded_Prepackaged_prenegotiated is removed first
#(p = 0.793 in the preceding summary).
#OLS comes from statsmodels.api (sm_api); the formula namespace bound to `sm`
#does not export the array-based OLS class in current statsmodels releases.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.847
Model:,OLS,Adj. R-squared:,0.832
Method:,Least Squares,F-statistic:,55.33
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,7.36e-33
Time:,11:24:45,Log-Likelihood:,-1909.7
No. Observations:,100,AIC:,3839.0
Df Residuals:,90,BIC:,3865.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4002.5176,477.460,8.383,0.000,3053.959,4951.076
XnumEmplBefore,-405.2991,196.047,-2.067,0.042,-794.780,-15.818
Sales,0.0049,0.001,8.408,0.000,0.004,0.006
XDaysIn,9.762e+04,1.61e+04,6.057,0.000,6.56e+04,1.3e+05
Encoded_XShop,7.102e+06,1.34e+07,0.530,0.597,-1.95e+07,3.37e+07
Encoded_DE,-7.298e+06,1.53e+07,-0.476,0.635,-3.78e+07,2.32e+07
Encoded_NY,-3.751e+06,1.32e+07,-0.284,0.777,-3e+07,2.25e+07
Encoded_Prepackaged_yes,2.347e+07,2.65e+07,0.886,0.378,-2.92e+07,7.61e+07
Encoded_FeeCommi,1.272e+07,1.5e+07,0.851,0.397,-1.7e+07,4.24e+07

0,1,2,3
Omnibus:,47.551,Durbin-Watson:,2.114
Prob(Omnibus):,0.0,Jarque-Bera (JB):,381.469
Skew:,1.216,Prob(JB):,1.4599999999999999e-83
Kurtosis:,12.254,Cond. No.,65800000000.0


In [81]:
#Backward elimination: Encoded_NY goes next (p = 0.777 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.847
Model:,OLS,Adj. R-squared:,0.833
Method:,Least Squares,F-statistic:,62.88
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,9.31e-34
Time:,11:25:21,Log-Likelihood:,-1909.7
No. Observations:,100,AIC:,3837.0
Df Residuals:,91,BIC:,3861.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,3989.3062,472.786,8.438,0.000,3050.174,4928.438
XnumEmplBefore,-397.9635,193.356,-2.058,0.042,-782.042,-13.885
Sales,0.0048,0.001,8.451,0.000,0.004,0.006
XDaysIn,9.772e+04,1.6e+04,6.095,0.000,6.59e+04,1.3e+05
Encoded_XShop,6.022e+06,1.28e+07,0.471,0.638,-1.94e+07,3.14e+07
Encoded_DE,-5.123e+06,1.32e+07,-0.387,0.700,-3.14e+07,2.12e+07
Encoded_Prepackaged_yes,2.34e+07,2.64e+07,0.888,0.377,-2.9e+07,7.58e+07
Encoded_FeeCommi,1.272e+07,1.49e+07,0.855,0.395,-1.68e+07,4.23e+07
Ones,-3.125e+07,1.07e+07,-2.911,0.005,-5.26e+07,-9.92e+06

0,1,2,3
Omnibus:,47.733,Durbin-Watson:,2.118
Prob(Omnibus):,0.0,Jarque-Bera (JB):,377.823
Skew:,1.228,Prob(JB):,9.05e-83
Kurtosis:,12.2,Cond. No.,65800000000.0


In [83]:
#Backward elimination: Encoded_DE goes next (p = 0.700 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY", "Encoded_DE"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.847
Model:,OLS,Adj. R-squared:,0.835
Method:,Least Squares,F-statistic:,72.51
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,1.13e-34
Time:,11:25:56,Log-Likelihood:,-1909.8
No. Observations:,100,AIC:,3836.0
Df Residuals:,92,BIC:,3856.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4021.3347,463.337,8.679,0.000,3101.107,4941.562
XnumEmplBefore,-394.4713,192.251,-2.052,0.043,-776.299,-12.644
Sales,0.0049,0.001,8.563,0.000,0.004,0.006
XDaysIn,9.816e+04,1.59e+04,6.166,0.000,6.65e+04,1.3e+05
Encoded_XShop,3.232e+06,1.05e+07,0.308,0.759,-1.76e+07,2.41e+07
Encoded_Prepackaged_yes,2.323e+07,2.62e+07,0.885,0.378,-2.89e+07,7.53e+07
Encoded_FeeCommi,1.254e+07,1.48e+07,0.847,0.399,-1.69e+07,4.19e+07
Ones,-3.162e+07,1.06e+07,-2.970,0.004,-5.28e+07,-1.05e+07

0,1,2,3
Omnibus:,49.497,Durbin-Watson:,2.138
Prob(Omnibus):,0.0,Jarque-Bera (JB):,397.194
Skew:,1.288,Prob(JB):,5.63e-87
Kurtosis:,12.417,Cond. No.,65800000000.0


In [84]:
#Backward elimination: Encoded_XShop goes next (p = 0.759 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY", "Encoded_DE",
                           "Encoded_XShop"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.836
Method:,Least Squares,F-statistic:,85.41
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,1.22e-35
Time:,11:26:13,Log-Likelihood:,-1909.8
No. Observations:,100,AIC:,3834.0
Df Residuals:,93,BIC:,3852.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4016.0506,460.760,8.716,0.000,3101.073,4931.029
XnumEmplBefore,-392.3881,191.195,-2.052,0.043,-772.063,-12.713
Sales,0.0049,0.001,8.661,0.000,0.004,0.006
XDaysIn,9.833e+04,1.58e+04,6.211,0.000,6.69e+04,1.3e+05
Encoded_Prepackaged_yes,2.377e+07,2.61e+07,0.912,0.364,-2.8e+07,7.55e+07
Encoded_FeeCommi,1.278e+07,1.47e+07,0.869,0.387,-1.64e+07,4.2e+07
Ones,-2.976e+07,8.73e+06,-3.407,0.001,-4.71e+07,-1.24e+07

0,1,2,3
Omnibus:,50.102,Durbin-Watson:,2.137
Prob(Omnibus):,0.0,Jarque-Bera (JB):,405.616
Skew:,1.307,Prob(JB):,8.349999999999999e-89
Kurtosis:,12.514,Cond. No.,65800000000.0


In [85]:
#Backward elimination: Encoded_FeeCommi goes next (p = 0.387 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY", "Encoded_DE",
                           "Encoded_XShop", "Encoded_FeeCommi"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.845
Model:,OLS,Adj. R-squared:,0.837
Method:,Least Squares,F-statistic:,102.6
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,1.66e-36
Time:,11:26:34,Log-Likelihood:,-1910.3
No. Observations:,100,AIC:,3833.0
Df Residuals:,94,BIC:,3848.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4139.2487,437.843,9.454,0.000,3269.902,5008.595
XnumEmplBefore,-352.3661,185.325,-1.901,0.060,-720.333,15.601
Sales,0.0049,0.001,8.804,0.000,0.004,0.006
XDaysIn,1.014e+05,1.54e+04,6.587,0.000,7.09e+04,1.32e+05
Encoded_Prepackaged_yes,2.323e+07,2.6e+07,0.893,0.374,-2.84e+07,7.49e+07
Ones,-2.965e+07,8.72e+06,-3.400,0.001,-4.7e+07,-1.23e+07

0,1,2,3
Omnibus:,54.93,Durbin-Watson:,2.114
Prob(Omnibus):,0.0,Jarque-Bera (JB):,456.587
Skew:,1.479,Prob(JB):,7.13e-100
Kurtosis:,13.042,Cond. No.,65700000000.0


In [86]:
#Backward elimination: Encoded_Prepackaged_yes goes next (p = 0.374 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY", "Encoded_DE",
                           "Encoded_XShop", "Encoded_FeeCommi", "Encoded_Prepackaged_yes"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.844
Model:,OLS,Adj. R-squared:,0.837
Method:,Least Squares,F-statistic:,128.3
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,2.04e-37
Time:,11:27:09,Log-Likelihood:,-1910.7
No. Observations:,100,AIC:,3831.0
Df Residuals:,95,BIC:,3844.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4122.7574,436.987,9.435,0.000,3255.228,4990.287
XnumEmplBefore,-352.4762,185.127,-1.904,0.060,-720.001,15.048
Sales,0.0049,0.001,8.832,0.000,0.004,0.006
XDaysIn,9.82e+04,1.49e+04,6.568,0.000,6.85e+04,1.28e+05
Ones,-2.72e+07,8.27e+06,-3.289,0.001,-4.36e+07,-1.08e+07

0,1,2,3
Omnibus:,56.373,Durbin-Watson:,2.124
Prob(Omnibus):,0.0,Jarque-Bera (JB):,476.133
Skew:,1.527,Prob(JB):,4.0599999999999997e-104
Kurtosis:,13.244,Cond. No.,20800000000.0


In [87]:
#Backward elimination: XnumEmplBefore goes next (p = 0.060 in the preceding summary).
#X_opt is rebuilt from X5 with the full list of removed columns, so re-running this cell
#cannot fail with a KeyError from dropping an already-dropped column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5.drop(columns = ["Encoded_Prepackaged_prenegotiated", "Encoded_NY", "Encoded_DE",
                           "Encoded_XShop", "Encoded_FeeCommi", "Encoded_Prepackaged_yes",
                           "XnumEmplBefore"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.838
Model:,OLS,Adj. R-squared:,0.833
Method:,Least Squares,F-statistic:,165.4
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,8.549999999999999e-38
Time:,11:27:37,Log-Likelihood:,-1912.5
No. Observations:,100,AIC:,3833.0
Df Residuals:,96,BIC:,3844.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
AssetsScheduled,4089.2362,442.562,9.240,0.000,3210.758,4967.715
Sales,0.0046,0.001,8.559,0.000,0.004,0.006
XDaysIn,9.401e+04,1.5e+04,6.272,0.000,6.43e+04,1.24e+05
Ones,-2.869e+07,8.35e+06,-3.438,0.001,-4.53e+07,-1.21e+07

0,1,2,3
Omnibus:,56.93,Durbin-Watson:,2.122
Prob(Omnibus):,0.0,Jarque-Bera (JB):,475.3
Skew:,1.554,Prob(JB):,6.169999999999999e-104
Kurtosis:,13.218,Cond. No.,20700000000.0


In [90]:
#Bring XnumEmplBefore back and try LiabScheduled in place of AssetsScheduled.
#X_opt is assembled from X5 and X in one expression instead of mutating the previous
#X_opt in place, so the cell gives the same frame however many times it is run.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5[["Sales", "XDaysIn", "Ones", "XnumEmplBefore"]].assign(LiabScheduled = X["LiabScheduled"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.782
Model:,OLS,Adj. R-squared:,0.773
Method:,Least Squares,F-statistic:,85.14
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,1.47e-30
Time:,11:30:01,Log-Likelihood:,-1927.4
No. Observations:,100,AIC:,3865.0
Df Residuals:,95,BIC:,3878.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sales,0.0062,0.001,10.090,0.000,0.005,0.007
XDaysIn,9.927e+04,1.77e+04,5.617,0.000,6.42e+04,1.34e+05
Ones,-2.881e+07,9.85e+06,-2.926,0.004,-4.84e+07,-9.26e+06
XnumEmplBefore,-379.9217,219.196,-1.733,0.086,-815.082,55.238
LiabScheduled,4296.3234,708.655,6.063,0.000,2889.466,5703.181

0,1,2,3
Omnibus:,63.276,Durbin-Watson:,2.068
Prob(Omnibus):,0.0,Jarque-Bera (JB):,427.741
Skew:,1.905,Prob(JB):,1.31e-93
Kurtosis:,12.389,Cond. No.,20900000000.0


In [92]:
#Try AssetLiab in place of LiabScheduled.
#X_opt is assembled from X5 and X in one expression, so re-running the cell cannot fail
#with a KeyError from dropping the already-dropped LiabScheduled column.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5[["Sales", "XDaysIn", "Ones", "XnumEmplBefore"]].assign(AssetLiab = X["AssetLiab"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.825
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,111.6
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,4.97e-35
Time:,11:31:01,Log-Likelihood:,-1916.5
No. Observations:,100,AIC:,3843.0
Df Residuals:,95,BIC:,3856.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sales,0.0053,0.001,9.213,0.000,0.004,0.006
XDaysIn,9.804e+04,1.58e+04,6.186,0.000,6.66e+04,1.29e+05
Ones,-2.88e+07,8.79e+06,-3.275,0.001,-4.63e+07,-1.13e+07
XnumEmplBefore,-373.9276,196.354,-1.904,0.060,-763.740,15.885
AssetLiab,4609.9638,555.682,8.296,0.000,3506.795,5713.132

0,1,2,3
Omnibus:,49.051,Durbin-Watson:,2.103
Prob(Omnibus):,0.0,Jarque-Bera (JB):,393.526
Skew:,1.272,Prob(JB):,3.52e-86
Kurtosis:,12.38,Cond. No.,20900000000.0


In [96]:
#Back to AssetsScheduled as the size measure.
#The summary shown for this cell has Df Model = 4 and no AssetLiab row, so AssetLiab was
#not part of the fitted frame; simply adding AssetsScheduled to the previous X_opt would
#keep AssetLiab in. X_opt is therefore rebuilt from X5 with exactly the reported columns
#(copied so later column additions do not write into a slice of X5).
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5[["Sales", "XDaysIn", "Ones", "XnumEmplBefore", "AssetsScheduled"]].copy()
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.844
Model:,OLS,Adj. R-squared:,0.837
Method:,Least Squares,F-statistic:,128.3
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,2.04e-37
Time:,11:32:51,Log-Likelihood:,-1910.7
No. Observations:,100,AIC:,3831.0
Df Residuals:,95,BIC:,3844.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sales,0.0049,0.001,8.832,0.000,0.004,0.006
XDaysIn,9.82e+04,1.49e+04,6.568,0.000,6.85e+04,1.28e+05
Ones,-2.72e+07,8.27e+06,-3.289,0.001,-4.36e+07,-1.08e+07
XnumEmplBefore,-352.4762,185.127,-1.904,0.060,-720.001,15.048
AssetsScheduled,4122.7574,436.987,9.435,0.000,3255.228,4990.287

0,1,2,3
Omnibus:,56.373,Durbin-Watson:,2.124
Prob(Omnibus):,0.0,Jarque-Bera (JB):,476.133
Skew:,1.527,Prob(JB):,4.0599999999999997e-104
Kurtosis:,13.244,Cond. No.,20800000000.0


In [97]:
#VIF check on the current regressors, with the constant column left out
X_opt_for_vif = X_opt.drop(columns = ["Ones"])
VIFs6 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X_opt_for_vif.values, col_idx)
            for col_idx in range(len(X_opt_for_vif.columns))],
    "Variables": X_opt_for_vif.columns,
})
VIFs6

Unnamed: 0,VIF,Variables
0,1.971,Sales
1,1.347,XDaysIn
2,1.514,XnumEmplBefore
3,1.823,AssetsScheduled


In [98]:
#Test whether adding Role to the selected regressors improves the fit.
#X_opt is assembled explicitly from X5 and X so the fitted columns are exactly the ones in
#the summary shown for this cell, independent of what earlier cells left in X_opt.
#OLS comes from statsmodels.api (sm_api) rather than the formula namespace bound to `sm`.
X_opt = X5[["Sales", "XDaysIn", "Ones", "XnumEmplBefore", "AssetsScheduled"]].assign(Role = X["Role"])
regressor_OLS = sm_api.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,FeeExpOrd,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.878
Method:,Least Squares,F-statistic:,143.0
Date:,"Tue, 17 Jul 2018",Prob (F-statistic):,2.3999999999999997e-42
Time:,11:35:00,Log-Likelihood:,-1895.9
No. Observations:,100,AIC:,3804.0
Df Residuals:,94,BIC:,3819.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Sales,0.0038,0.001,7.324,0.000,0.003,0.005
XDaysIn,5.09e+04,1.54e+04,3.306,0.001,2.03e+04,8.15e+04
Ones,-4.955e+07,8.18e+06,-6.061,0.000,-6.58e+07,-3.33e+07
XnumEmplBefore,-355.1174,160.508,-2.212,0.029,-673.810,-36.424
AssetsScheduled,3013.1367,426.113,7.071,0.000,2167.080,3859.193
Role,3.591e+06,6.31e+05,5.690,0.000,2.34e+06,4.84e+06

0,1,2,3
Omnibus:,27.299,Durbin-Watson:,2.122
Prob(Omnibus):,0.0,Jarque-Bera (JB):,304.299
Skew:,0.002,Prob(JB):,8.36e-67
Kurtosis:,11.546,Cond. No.,23700000000.0


In [100]:
#VIF check on the current regressors, with the constant column left out
X_opt_for_vif2 = X_opt.drop(columns = ["Ones"])
VIFs7 = pd.DataFrame({
    "VIF": [variance_inflation_factor(X_opt_for_vif2.values, col_idx)
            for col_idx in range(len(X_opt_for_vif2.columns))],
    "Variables": X_opt_for_vif2.columns,
})
VIFs7

Unnamed: 0,VIF,Variables
0,2.185,Sales
1,3.805,XDaysIn
2,1.517,XnumEmplBefore
3,2.28,AssetsScheduled
4,6.038,Role


## Final variable choices: Sales, XDaysIn, XnumEmplBefore, AssetsScheduled, and Ones (the column of ones that serves as the regression constant)