In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_predict, GridSearchCV
import statsmodels.api as sm
from scipy import stats
import pandas as pd
import numpy as np
import warnings
from statsmodels.stats.outliers_influence import variance_inflation_factor
warnings.filterwarnings("ignore")

In [195]:
#Reading in the Data
df_med = pd.read_csv('./Interim Data/logged_outlier_removed.csv').astype('float')

# X and y creation
y = df_med['SalePrice']
X = df_med.drop(['SalePrice'],axis=1)

In [3]:
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.948
Model:                            OLS   Adj. R-squared:                  0.937
Method:                 Least Squares   F-statistic:                     87.67
Date:                Thu, 29 Jun 2023   Prob (F-statistic):               0.00
Time:                        16:41:58   Log-Likelihood:                 1417.9
No. Observations:                1458   AIC:                            -2336.
Df Residuals:                    1208   BIC:                            -1015.
Df Model:                         249                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Norm                

In [8]:
lm = LinearRegression()
y_pred = cross_val_predict(lm,X2,y,cv=5)
mean_absolute_error(np.exp(y_pred),np.exp(y))

14919.444464886405

In [10]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X2, y, test_size = 0.3)

#Definind the Find Alpha Function

def alpha_find(model, is_std, min_value = 0.001,
        max_value = 0.01,
        max_round = 2,
        step = 0.001):
    #Keep expanding the range until told to stop.
    param_candidates = []
    while 1 == 1:
        alpha = np.arange(min_value, max_value, step)
        param_grid = {'alpha': alpha}

        model_cv = GridSearchCV(model,param_grid,cv=5)
        
        #If is_std = True, use the standardized dataset. Else use the original
        if is_std == True:
            X_train, y_train = X_train_std, y_train_std
        else:
            X_train, y_train = X_train_orig, y_train_orig
        model_cv.fit(X_train,y_train)

        best_alpha = model_cv.best_params_['alpha']
        
        #In case of borderline values, if best_alpha's been seen before then use that.
        if best_alpha in param_candidates:
            break
        else:
            param_candidates.append(best_alpha)
        
        #If best_param = max value, then expand range. If min value, reduce range. Else break.
        if (round(best_alpha, max_round) == max_value):
            min_value, max_value, step, max_round = min_value*10, max_value*10, step*10, max_round-1
        elif ((best_alpha) == min_value):
            min_value, max_value, step, max_round = min_value/10, max_value/10, step/10, max_round+1
        else:
            break
    print('Best Alpha for Model (Standarized = {}): '.format(is_std),model_cv.best_params_)
    
alpha_find(Lasso(),False)

Best Alpha for Model (Standarized = False):  {'alpha': 0.001}


In [14]:
#Standardized
las = Lasso(alpha=0.001)
y_pred = cross_val_predict(las, X2, y, cv=5)
print('Standardized Dataset: ', mean_absolute_error(np.exp(y_pred), np.exp(y)))

Standardized Dataset:  13930.193739440849


In [130]:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size = 0.3)

In [204]:
newX = pd.DataFrame({"Constant":np.ones(len(X))}).join(pd.DataFrame(X.reset_index(drop=True)))
feature_selections = []
for i in range(len(newX.columns),1,-1):
    
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(newX, y, test_size = 0.3)
    
    lm = LinearRegression()
    lm.fit(X_train_orig,y_train_orig)
    params = lm.coef_
    y_pred = cross_val_predict(lm,newX,y,cv=5)

    MSE = (sum((y_test_orig-predictions)**2))/(len(newX)-len(newX.columns))
    
    var_b = newX.var()/((newX - newX.mean())**2).sum()
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b

    p_values =2*(1-stats.t.cdf(np.abs(ts_b),newX.shape[0]-1))

    myDF3 = pd.DataFrame({'Coefficients':params, 'Standard Errors': sd_b, 't values':ts_b, 'Probabilities':p_values})
    try:
        feature_selections.append((i, newX.columns, mean_absolute_error(np.exp(y_pred),np.exp(y))))
    except ValueError:
        feature_selections.append((i, newX.columns, np.inf))
    try:
        newX = newX.drop(myDF3['Probabilities'].idxmax(),axis=1)
    except KeyError:
        break
        
print('The lowest MAE accomplished is {}, with {} parameters'.format(sorted(feature_selections, key=lambda x: x[2])[0][2], sorted(feature_selections, key=lambda x: x[2])[0][0]))

The lowest MAE accomplished is 14885.108823650367, with 247 parameters


In [191]:
cols = sorted(feature_selections, key=lambda x: x[2])[0][1]

In [203]:
lm = LinearRegression()
y_pred = cross_val_predict(lm,cols,y,cv=5)
mean_absolute_error(np.exp(y_pred),np.exp(y))

ValueError: Found input variables with inconsistent numbers of samples: [224, 1458]

In [200]:
max(y_pred),min(y_pred)

(285291.9811963433, -1016334.3592030071)