In [1]:
#### Import Section
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import env
import wrangle_zillow
from os.path import exists

from itertools import product
from scipy.stats import levene , pearsonr, spearmanr, mannwhitneyu, f_oneway, ttest_ind
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, f_regression, SelectKBest

import warnings
warnings.filterwarnings("ignore")

# Model
Using scaled dataset

In [2]:
df, train, validate, test, train_scaled, validate_scaled, test_scaled   = wrangle_zillow.wrangle_zillow()

(28985, 21) (12423, 21) (10352, 21)
-----
DataFrame info:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51760 entries, 1727539 to 1187175
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   basementsqft                51760 non-null  float64 
 1   bathrooms                   51760 non-null  float64 
 2   bedrooms                    51760 non-null  float64 
 3   area                        51760 non-null  float64 
 4   county                      51760 non-null  object  
 5   garagecarcnt                51760 non-null  float64 
 6   garagetotalsqft             51760 non-null  float64 
 7   latitude                    51760 non-null  float64 
 8   longitude                   51760 non-null  float64 
 9   lotsizesquarefeet           51760 non-null  float64 
 10  poolcnt                     51760 non-null  float64 
 11  structuretaxvaluedollarcnt  51760 non-null  float64 
 12  home_va

## Modeling Prep

In [101]:
# create X,y for train, validate and test subsets
X_train = train_scaled.drop(columns='logerror')
y_train = train.logerror
X_val = validate_scaled.drop(columns='logerror')
y_val = validate.logerror
X_test = test_scaled.drop(columns='logerror')
y_test = test_scaled.logerror

In [102]:
#shift y subsets into a data frame
y_train = pd.DataFrame(y_train)
y_val = pd.DataFrame(y_val)
y_test = pd.DataFrame(y_test)

In [110]:
#baseline --> choose median because data is scaled

#add baseline columns
y_train['pred_median'] = y_train.logerror.median()
y_val['pred_median'] = y_val.logerror.median()
y_test['pred_median'] = y_test.logerror.median()

#add/calc RMSEs for median baseline predictions
rmse_val = mean_squared_error(y_val.logerror, y_val.pred_median, squared=False)
rmse_train = mean_squared_error(y_train.logerror, y_train.pred_median, squared=False)

print(f'RMSEs: Median\n Train/In Sample: {round(rmse_train, 2)}\n Validate/Out of Sample: {round(rmse_val, 2)}')



RMSEs: Median
 Train/In Sample: 0.18
 Validate/Out of Sample: 0.18


In [103]:
dummy_columns = ['county',
                 'home_size',
                 'aircon',
                 'heating',
                 'logerror_bin',
                 ]

In [104]:
#get dummies for X subsets
X_train = pd.get_dummies(X_train, columns=dummy_columns, drop_first=True)
X_val = pd.get_dummies(X_val, columns=dummy_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=dummy_columns, drop_first=True)

#add column after dummy creation to insure feature count match
X_train.insert(25, 'heating_Gravity', 0)

## Select K Best

In [105]:
def select_kbest(X, y, k): 
    # initilize selector object
    f_selector = SelectKBest(f_regression, k=k)

    #fit object --> will find top 2 as requested
    f_selector.fit(X, y)

    # create mask
    feature_mask = f_selector.get_support()

    # use mask to show list of feature support
    f_top_features = X.iloc[:,feature_mask].columns.tolist()

    return f_top_features

## RFE

In [106]:
def rfe (X, y, n):
    """
    Purpose
        To return the top features selected by the RFE function

    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       n: the number of features for the function to select 
    Returns
       rfe_tip_features: list of the top features selected by SelectKBest function
    """
    #initialize  regression object
    lm = LinearRegression()

    # initilize RFE object with n features
    rfe = RFE(lm, n_features_to_select=n)

    #fit object onto data
    rfe.fit(X, y)

    #create boolean mask for columns model selects 
    feature_mask = rfe.support_

    # use mask to show list of selected features
    rfe_top_features = X.iloc[:, feature_mask].columns.tolist()

    return rfe_top_features

## Get Features

In [32]:
#set list of up for features
all_features = list(X_train.columns)

req_features = ['area', 'bedrooms', 'bathrooms']

feat_set1  = ['est_tax_rate', 'area']

feat_set2 = ['est_tax_rate', 'area', 'age']

feat_set3 = ['est_tax_rate', 'area', 'age', 'county_Orange County', 'county_Ventura County']

feat_set4 = ['est_tax_rate', 'area', 'bathrooms', 'age', 'county_Orange County', 'county_Ventura County']

feat_rfe = rfe(X_train, y_train.logerror, 4)

feat_sk_best = select_kbest(X_train, y_train.logerror, 4)

feat_combos = [all_features, req_features, feat_set1, feat_set2, feat_set3, feat_set4, feat_sk_best, feat_rfe]

In [111]:
model_descriptions = pd.DataFrame([['pred_median', rmse_train, 0, 'N/A', 'N/A']], columns=['Name','RMSE', 'r^2 score','Features', 'Parameters'])
model_descriptions

Unnamed: 0,Name,RMSE,r^2 score,Features,Parameters
0,pred_median,0.175289,0,,


## PF Mod

In [112]:
def pf_mod(X, y, selectors, fit_train=None, fit_y_train=None):
    """
    Purpose
       to create, train, and score linear regression models using polynomial features
    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       selectors: list of different feature and degree combinations for use with models
       fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
       fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
    Returns
       pf_description: dataFrame containing the scores, features, and parameters of the created models
    """
    #create empty data frame to hold model descriptions    
    pf_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

    for idx, combo in enumerate(selectors):
        pf = PolynomialFeatures(degree=combo[1])

        lm = LinearRegression(normalize=True)

        if fit_train is not None:
            fit_pf = pf.fit_transform(fit_train[combo[0]])
            X_pf = pf.transform(X[combo[0]])  
            lm.fit(fit_pf, fit_y_train.logerror)
        else:
            X_pf = pf.fit_transform(X[combo[0]])
            lm.fit(X_pf, y.logerror)

        model_label = f'Polynomial_{idx+1}'

        #predict train
        y[model_label] = lm.predict(X_pf) 

        #calculate train rmse
        rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

        # print(f'{model_label} with degree: {combo[1]} \n\
        #     Features: {combo[0]} \n\
        #     RMSE: {rmse}\n')
        
        description = pd.DataFrame([[model_label, rmse, combo[0], f'Degree: {combo[1]}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
        pf_descriptions = pd.concat([pf_descriptions, description])

    return pf_descriptions

In [113]:
#create a list of parameters
pf_parameters = [2,3]

#use list with product to create tuples of feature/parameter combination to feed into model
selectors = list(product(feat_combos, pf_parameters))

#call pf function to get predictions for the Polynomial models and add to the dataframe
pf_descriptions = pf_mod(X_train, y_train, selectors)

In [114]:
pf_descriptions

Unnamed: 0,Name,RMSE,Features,Parameters
0,Polynomial_1,0.122787,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 2
0,Polynomial_2,0.110753,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 3
0,Polynomial_3,0.174689,"[area, bedrooms, bathrooms]",Degree: 2
0,Polynomial_4,0.174609,"[area, bedrooms, bathrooms]",Degree: 3
0,Polynomial_5,0.174722,"[est_tax_rate, area]",Degree: 2
0,Polynomial_6,0.174654,"[est_tax_rate, area]",Degree: 3
0,Polynomial_7,0.174601,"[est_tax_rate, area, age]",Degree: 2
0,Polynomial_8,0.174475,"[est_tax_rate, area, age]",Degree: 3
0,Polynomial_9,0.174518,"[est_tax_rate, area, age, county_Orange County...",Degree: 2
0,Polynomial_10,0.174338,"[est_tax_rate, area, age, county_Orange County...",Degree: 3


In [115]:
def ols_mod(X, y, selectors, fit_x_train=None, fit_y_train=None):
    """
    Purpose
       to create, train, and score ordinary least squares linear regression modelss
    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       selectors: list of different feature and degree combinations for use with models
       fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
       fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
    Returns
       pf_description: dataFrame containing the scores, features, and parameters of the created models
    """
    #create empty data frame to hold model descriptions    
    ols_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

    #loop through selector combinations to pull out different features and degree levels
    for idx, features in enumerate(selectors):  
        #create model object
        lm = LinearRegression()
        #create mdoel label
        model_label = f'OLS_{idx+1}'
        
        #fit object on X_train subset depeneding on its position as parameter or the optional variant
        if fit_x_train is not None:
            lm.fit(fit_x_train[features], fit_y_train.logerror)
        else:   
            lm.fit(X[features], y.logerror)

        #predict train
        y[model_label] = lm.predict(X[features]) 

        #calc trian rmse
        rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

        description = pd.DataFrame([[model_label, rmse, features, 'N/A']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
        ols_descriptions = pd.concat([ols_descriptions, description])

    return ols_descriptions

In [116]:
#run ols model with feature combinations
olf_descriptions = ols_mod(X_train, y_train, feat_combos)

In [36]:
def lars_mod(X, y, selectors, fit_x_train=None, fit_y_train=None):
   """
   Purpose
      to create, train, and score linear regression models using polynomial features
   Parameters
      X: dataframe containing X subset of features for the data subset
      y: dataframe with series containing the target variable
      selectors: list of different feature and degree combinations for use with models
      fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
      fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
   Returns
      pf_description: dataFrame containing the scores, features, and parameters of the created models
   """

   #create empty data frame to hold model descriptions    
   lars_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

   #loop through selector combinations to pull out different features and degree levels
   for idx, selector in enumerate(selectors):  
      #create model object
      lars = LassoLars(alpha=selector)
      #create mdoel label
      model_label = f'LARS_{idx+1}'

   #   #fit mode 
   #   lars.fit(X, y.logerror)
      #fit object on X_train subset depeneding on its position as parameter or the optional variant
      if fit_x_train is not None:
         lars.fit(fit_x_train, fit_y_train.logerror)
      else:   
         lars.fit(X, y.logerror)

      #predict train
      y[model_label] = lars.predict(X) 

      #calc trian rmse
      rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

      description = pd.DataFrame([[model_label, rmse, 'all', f'Alpha: {selector}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
      lars_descriptions = pd.concat([lars_descriptions, description])

   return lars_descriptions


In [118]:
#create a list of parameters
lars_parameters = [.25, .5, .75, 1]

lars_descriptions = lars_mod(X_train, y_train, lars_parameters)

In [22]:
def GLM_mod(X, y, selectors):
   """
   Purpose
      to create, train, and score linear regression models using polynomial features
   Parameters
      X: dataframe containing X subset of features for the data subset
      y: dataframe with series containing the target variable
      selectors: list of different feature and degree combinations for use with models
      fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
      fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
   Returns
      pf_description: dataFrame containing the scores, features, and parameters of the created models
   """
   
   #create empty data frame to hold model descriptions    
   glm_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

   #create empty data frame to hold model descriptions    
   for idx, combo in enumerate(selectors):  
      #create model object
      glm = TweedieRegressor(power=combo[0], alpha=combo[1])

      #create model label
      model_label = f'GLM_{idx+1}'

      #fit mode 
      glm.fit(X, y.logerror)

      #predict train
      y[model_label] = glm.predict(X) 

      #calc rmse
      rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

      description = pd.DataFrame([[model_label, rmse, '-', f'Power,Alpha: {combo}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
      glm_descriptions = pd.concat([glm_descriptions, description])

   return glm_descriptions 


In [28]:
#create a list of parameters
glm_parameters = [(0,0), (0,.25), (0,.5), (0,.75), (0,1)]

glm_descriptions = GLM_mod(X_train, y_train, glm_parameters)

In [29]:
glm_descriptions

Unnamed: 0,Name,RMSE,Features,Parameters
0,GLM_1,0.126061,-,"Power,Alpha: (0, 0)"
0,GLM_2,0.1652,-,"Power,Alpha: (0, 0.25)"
0,GLM_3,0.169159,-,"Power,Alpha: (0, 0.5)"
0,GLM_4,0.170806,-,"Power,Alpha: (0, 0.75)"
0,GLM_5,0.171713,-,"Power,Alpha: (0, 1)"


In [30]:
round(y_train, 4)

Unnamed: 0_level_0,logerror,pred_median,Polynomial_1,Polynomial_2,Polynomial_3,Polynomial_4,Polynomial_5,Polynomial_6,Polynomial_7,Polynomial_8,...,OLS_8,LARS_1,LARS_2,LARS_3,LARS_4,GLM_1,GLM_2,GLM_3,GLM_4,GLM_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
874932,0.0232,0.0067,0.0094,0.0256,0.0179,0.0178,0.0149,0.0142,0.0170,0.0204,...,0.0046,0.0188,0.0188,0.0188,0.0188,0.0056,0.0103,0.0129,0.0142,0.0149
912277,-0.0706,0.0067,0.0086,0.0083,0.0146,0.0160,0.0134,0.0137,0.0159,0.0163,...,0.0033,0.0188,0.0188,0.0188,0.0188,0.0024,0.0105,0.0138,0.0153,0.0160
1731609,-0.0091,0.0067,0.0205,-0.0200,0.0406,0.0460,0.0349,0.0378,0.0379,0.0353,...,0.0261,0.0188,0.0188,0.0188,0.0188,0.0086,0.0115,0.0142,0.0155,0.0161
2684943,0.0536,0.0067,0.0223,0.0051,0.0231,0.0222,0.0218,0.0203,0.0246,0.0247,...,0.0112,0.0188,0.0188,0.0188,0.0188,0.0152,0.0203,0.0204,0.0204,0.0203
134804,0.0167,0.0067,0.0229,0.0126,0.0170,0.0145,0.0185,0.0170,0.0187,0.0190,...,0.0081,0.0188,0.0188,0.0188,0.0188,0.0024,0.0140,0.0160,0.0169,0.0173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124366,-0.0418,0.0067,0.0080,0.0505,0.0112,0.0125,0.0138,0.0154,0.0070,0.0060,...,0.0037,0.0188,0.0188,0.0188,0.0188,0.0243,0.0154,0.0174,0.0181,0.0184
2225558,-0.0170,0.0067,0.0066,0.0095,0.0164,0.0131,0.0231,0.0194,0.0197,0.0171,...,0.0120,0.0188,0.0188,0.0188,0.0188,0.0063,0.0097,0.0125,0.0139,0.0147
982193,0.0075,0.0067,-0.0044,-0.0096,0.0249,0.0261,0.0253,0.0268,0.0276,0.0301,...,0.0160,0.0188,0.0188,0.0188,0.0188,0.0098,0.0204,0.0209,0.0210,0.0208
2007739,0.0153,0.0067,0.0089,0.0235,0.0084,0.0057,0.0187,0.0172,0.0179,0.0153,...,0.0090,0.0188,0.0188,0.0188,0.0188,0.0087,0.0102,0.0130,0.0143,0.0152


## Score

In [123]:
model_descriptions = pd.DataFrame([['pred_median', rmse_train, 0, 'N/A', 'N/A']], columns=['Name','RMSE', 'r^2 score','Features', 'Parameters'])

#create df for model scores on the train scores
model_scores = pd.DataFrame({}, columns=['Model', 'r^2 score'])
model_scores = model_scores.set_index('Model')
model_descriptions = pd.concat([model_descriptions, pf_descriptions, lars_descriptions, olf_descriptions, glm_descriptions])
model_descriptions = model_descriptions.set_index('Name')
for idx, model in enumerate(y_train.drop(columns='logerror').columns):
    model_descriptions.loc[model, 'r^2 score'] = explained_variance_score(y_train['logerror'], y_train[model])

round(model_descriptions,2)

Unnamed: 0_level_0,RMSE,r^2 score,Features,Parameters
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pred_median,0.175289,0.0,,
Polynomial_1,0.122787,0.51,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 2
Polynomial_2,0.110753,0.6,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 3
Polynomial_3,0.174689,0.0,"[area, bedrooms, bathrooms]",Degree: 2
Polynomial_4,0.174609,0.0,"[area, bedrooms, bathrooms]",Degree: 3
Polynomial_5,0.174722,0.0,"[est_tax_rate, area]",Degree: 2
Polynomial_6,0.174654,0.0,"[est_tax_rate, area]",Degree: 3
Polynomial_7,0.174601,0.0,"[est_tax_rate, area, age]",Degree: 2
Polynomial_8,0.174475,0.0,"[est_tax_rate, area, age]",Degree: 3
Polynomial_9,0.174518,0.0,"[est_tax_rate, area, age, county_Orange County...",Degree: 2


# Validate
* Best 10 models:


In [124]:
round(model_descriptions.sort_values(by='RMSE').head(10),2)

Unnamed: 0_level_0,RMSE,r^2 score,Features,Parameters
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Polynomial_2,0.110753,0.6,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 3
Polynomial_1,0.122787,0.51,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 2
Polynomial_14,0.125847,0.48,"[bedrooms, area, logerror_bin_-1sig~1sig, loge...",Degree: 3
Polynomial_16,0.125916,0.48,"[area, structuretaxvaluedollarcnt, logerror_bi...",Degree: 3
OLS_1,0.125991,0.48,"[basementsqft, bathrooms, bedrooms, area, gara...",
Polynomial_15,0.126013,0.48,"[area, structuretaxvaluedollarcnt, logerror_bi...",Degree: 2
Polynomial_13,0.126097,0.48,"[bedrooms, area, logerror_bin_-1sig~1sig, loge...",Degree: 2
OLS_8,0.126147,0.48,"[area, structuretaxvaluedollarcnt, logerror_bi...",
OLS_7,0.126194,0.48,"[bedrooms, area, logerror_bin_-1sig~1sig, loge...",
GLM_2,0.1652,0.11,-,"Power,Alpha: (0, 0.25)"


In [37]:
# create list of selectors for validate
val_selectors = [(all_features, 3),(all_features, 2), (feat_set2, 3), (feat_set3, 3), (feat_set4, 3)]

#call pf function to get predictions for the Polynomial models and add to the dataframe
pf_mod(X_val, y_val, val_selectors, X_train, y_train)

#create a list of parameters
val_parameters = [.25, .5]

lars_descriptions = lars_mod(X_train, y_train, val_parameters, X_val, y_val)

validate_scores = pd.DataFrame({}, columns=['Model', 'r^2 Score'])

for idx, model in enumerate(y_val.drop(columns='logerror').columns):
    score = explained_variance_score(y_val['logerror'], y_val[model])
    validate_scores.loc[idx] = [model, score]

round(validate_scores,4)


Unnamed: 0,Model,r^2 Score
0,pred_median,0.0
1,Polynomial_1,-1.647291e+24
2,Polynomial_2,-2.656524e+22
3,Polynomial_3,0.0027
4,Polynomial_4,0.0019
5,Polynomial_5,-0.0029


In [50]:
for item in y_val.drop(columns='logerror').columns:
    print(f'RMSE {mean_squared_error(y_val.logerror, y_val[item], squared=False)}')

RMSE 0.18477967963789746
RMSE 236801873114.9983
RMSE 30073399457.109917
RMSE 0.18424549842717014
RMSE 0.18431841473114904
RMSE 0.18476710557468787


In [47]:
validate_scores

Unnamed: 0,Model,r^2 Score,RMSE
0,pred_median,0.0,RMSE 0.18476710557468787
1,Polynomial_1,-1.647291e+24,RMSE 0.18476710557468787
2,Polynomial_2,-2.656524e+22,RMSE 0.18476710557468787
3,Polynomial_3,0.002701429,RMSE 0.18476710557468787
4,Polynomial_4,0.001919165,RMSE 0.18476710557468787
5,Polynomial_5,-0.002937567,RMSE 0.18476710557468787


# Test
* Polynomial 5

In [129]:
test_selectors = [(feat_set4, 3)]

pf_mod(X_test, y_test, test_selectors, X_train, y_train)

Unnamed: 0,Name,RMSE,Features,Parameters
0,Polynomial_1,0.567746,"[est_tax_rate, area, bathrooms, age, county_Or...",Degree: 3


In [130]:
test_scores = pd.DataFrame({}, columns=['Model', 'r^2 Score'])

for idx, model in enumerate(y_test.drop(columns='logerror').columns):
    score = explained_variance_score(y_test['logerror'], y_test[model])
    test_scores.loc[idx] = [model, score]

test_score = explained_variance_score(y_test['logerror'], y_test[model])


In [131]:
test_score

-0.5330382037939192

# Functions For Export

## Modeling Prep

In [4]:
def modeling_prep (train, train_scaled, validate, validate_scaled, test, test_scaled):
   """
   Purpose
      To return X, y subsets for training, validation, and testing of models

   Parameters
      train/validate/test: dataframes containing appropriate subsets of data
      train_scaled, validate_scaled, test_scaled: dataframes contianing scaled versions of approrpriate subsets of data

   Returns
      X_train, y_train, X_val, y_val, X_test, y_test: dataframes containing appropriate subsets of data
   """
   # create X,y for train, validate and test subsets
   X_train = train_scaled.drop(columns='logerror')
   y_train = train.logerror
   X_val = validate_scaled.drop(columns='logerror')
   y_val = validate.logerror
   X_test = test_scaled.drop(columns='logerror')
   y_test = test_scaled.logerror

   #shift y subsets into a data frame
   y_train = pd.DataFrame(y_train)
   y_val = pd.DataFrame(y_val)
   y_test = pd.DataFrame(y_test)

   #add baseline predictions
   y_train['pred_median'] = y_train.logerror.median()
   y_val['pred_median'] = y_val.logerror.median()
   y_test['pred_median'] = y_test.logerror.median()

   #get dummies for X subsets
   dummy_columns = ['county',
               'home_size',
               'aircon',
               'heating',
               'logerror_bin',
               ]
   X_train = pd.get_dummies(X_train, columns=dummy_columns, drop_first=True)
   X_val = pd.get_dummies(X_val, columns=dummy_columns, drop_first=True)
   X_test = pd.get_dummies(X_test, columns=dummy_columns, drop_first=True)

   #add column after dummy creation to insure feature count match
   X_train.insert(25, 'heating_Gravity', 0)

   return X_train, y_train, X_val, y_val, X_test, y_test

## Select K Best

In [5]:
def select_kbest(X, y, k): 
    """
    Purpose
        To return the top features selecting by the SelectKBest function

    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       k: the number of features for the function to suggest 
    Returns
       f_top_features: list of the top features selected by SelectKBest function
    """
    # initilize selector object
    f_selector = SelectKBest(f_regression, k=k)

    #fit object --> will find top 2 as requested
    f_selector.fit(X, y)

    # create mask
    feature_mask = f_selector.get_support()

    # use mask to show list of feature support
    f_top_features = X.iloc[:,feature_mask].columns.tolist()

    return f_top_features

## RFE

In [6]:
def rfe (X, y, n):
    """
    Purpose
        To return the top features selected by the RFE function

    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       n: the number of features for the function to select 
    Returns
       rfe_tip_features: list of the top features selected by SelectKBest function
    """
    #initialize  regression object
    lm = LinearRegression()

    # initilize RFE object with n features
    rfe = RFE(lm, n_features_to_select=n)

    #fit object onto data
    rfe.fit(X, y)

    #create boolean mask for columns model selects 
    feature_mask = rfe.support_

    # use mask to show list of selected features
    rfe_top_features = X.iloc[:, feature_mask].columns.tolist()

    return rfe_top_features


## Get Features

In [7]:
def get_features(X_train, y_train):
    """
    Purpose
        create a list of feature combinations to feed into the various models

    Parameters
       X_train: dataframe containing X subset of features for the data subset
       y_train: dataframe with series containing the target variable
    Returns
       feat_combos: list feature combinations
    """
    #create lists of features
    all_features = list(X_train.columns)
    req_features = ['area', 'bedrooms', 'bathrooms']
    feat_set1  = ['est_tax_rate', 'area']
    feat_set2 = ['est_tax_rate', 'area', 'age']
    feat_set3 = ['est_tax_rate', 'area', 'age', 'county_Orange County', 'county_Ventura County']
    feat_set4 = ['est_tax_rate', 'area', 'bathrooms', 'age', 'county_Orange County', 'county_Ventura County']
    feat_rfe = rfe(X_train, y_train.logerror, 4)
    feat_sk_best = select_kbest(X_train, y_train.logerror, 4)

    #combine lists of features into large list feature all selected combinations
    feat_combos = [all_features, req_features, feat_set1, feat_set2, feat_set3, feat_set4, feat_sk_best, feat_rfe]

    return feat_combos

## PF Mod

In [8]:
def pf_mod(X, y, selectors, fit_train=None, fit_y_train=None):
    """
    Purpose
       to create, train, and score linear regression models using polynomial features
    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       selectors: list of different feature and degree combinations for use with models
       fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
       fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
    Returns
       pf_description: dataFrame containing the scores, features, and parameters of the created models
    """

    #create empty data frame to hold model descriptions    
    pf_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

    #loop through selector combinations to pull out different features and degree levels
    for idx, combo in enumerate(selectors):
        #create features object
        pf = PolynomialFeatures(degree=combo[1])
        #initialize model object
        lm = LinearRegression(normalize=True)
        #fit object on X_train subset depeneding on its position as parameter or the optional variant
        if fit_train is not None:
            fit_pf = pf.fit_transform(fit_train[combo[0]])
            X_pf = pf.transform(X[combo[0]])  
            lm.fit(fit_pf, fit_y_train.logerror)
        else:
            X_pf = pf.fit_transform(X[combo[0]])
            lm.fit(X_pf, y.logerror)

        model_label = f'Polynomial_{idx+1}'

        #predict train
        y[model_label] = lm.predict(X_pf) 

        #calculate train rmse
        rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

     
        description = pd.DataFrame([[model_label, rmse, combo[0], f'Degree: {combo[1]}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
        pf_descriptions = pd.concat([pf_descriptions, description])

    return pf_descriptions

## OLS Mod

In [9]:
def ols_mod(X, y, selectors, fit_x_train=None, fit_y_train=None):
    """
    Purpose
       to create, train, and score ordinary least squares linear regression modelss
    Parameters
       X: dataframe containing X subset of features for the data subset
       y: dataframe with series containing the target variable
       selectors: list of different feature and degree combinations for use with models
       fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
       fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
    Returns
       pf_description: dataFrame containing the scores, features, and parameters of the created models
    """
    #create empty data frame to hold model descriptions    
    ols_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

    #loop through selector combinations to pull out different features and degree levels
    for idx, features in enumerate(selectors):  
        #create model object
        lm = LinearRegression()
        #create mdoel label
        model_label = f'OLS_{idx+1}'
        
        #fit object on X_train subset depeneding on its position as parameter or the optional variant
        if fit_x_train is not None:
            lm.fit(fit_x_train[features], fit_y_train.logerror)
        else:   
            lm.fit(X[features], y.logerror)

        #predict train
        y[model_label] = lm.predict(X[features]) 

        #calc trian rmse
        rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

        description = pd.DataFrame([[model_label, rmse, features, 'N/A']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
        ols_descriptions = pd.concat([ols_descriptions, description])

    return ols_descriptions

## Lars Mod

In [35]:
def lars_mod(X, y, selectors, fit_x_train=None, fit_y_train=None):
   """
   Purpose
      to create, train, and score linear regression models using polynomial features
   Parameters
      X: dataframe containing X subset of features for the data subset
      y: dataframe with series containing the target variable
      selectors: list of different feature and degree combinations for use with models
      fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
      fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
   Returns
      pf_description: dataFrame containing the scores, features, and parameters of the created models
   """

   #create empty data frame to hold model descriptions    
   lars_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

   #loop through selector combinations to pull out different features and degree levels
   for idx, selector in enumerate(selectors):  
      #create model object
      lars = LassoLars(alpha=selector)
      #create mdoel label
      model_label = f'LARS_{idx+1}'

      #fit mode 
      lars.fit(X, y.logerror)
      #fit object on X_train subset depeneding on its position as parameter or the optional variant
      if fit_x_train is not None:
         lars.fit(fit_x_train, fit_y_train.logerror)
      else:   
         lars.fit(X, y.logerror)

      #predict train
      y[model_label] = lars.predict(X) 

      #calc trian rmse
      rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

      description = pd.DataFrame([[model_label, rmse, 'all', f'Alpha: {selector}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
      lars_descriptions = pd.concat([lars_descriptions, description])

   return lars_descriptions



In [16]:
def GLM_mod(X, y, selectors):
   """
   Purpose
      to create, train, and score linear regression models using polynomial features
   Parameters
      X: dataframe containing X subset of features for the data subset
      y: dataframe with series containing the target variable
      selectors: list of different feature and degree combinations for use with models
      fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
      fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
   Returns
      pf_description: dataFrame containing the scores, features, and parameters of the created models
   """
   
   #create empty data frame to hold model descriptions    
   glm_descriptions = pd.DataFrame({}, columns=['Name','RMSE', 'Features', 'Parameters'])

   #create empty data frame to hold model descriptions    
   for idx, combo in enumerate(selectors):  
      #create model object
      glm = TweedieRegressor(power=combo[0], alpha=combo[1])

      #create model label
      model_label = f'GLM_{idx+1}'

      #fit mode 
      glm.fit(X, y.logerror)

      #predict train
      y[model_label] = glm.predict(X) 

      #calc rmse
      rmse = mean_squared_error(y.logerror, y[model_label], squared=False)

      description = pd.DataFrame([[model_label, rmse, '-', f'Power,Alpha: {combo}']], columns=['Name', 'RMSE', 'Features', 'Parameters'])
      glm_descriptions = pd.concat([glm_descriptions, description])

   return glm_descriptions 

In [10]:
X_train, y_train, X_val, y_val, X_test, y_test = modeling_prep(train, train_scaled, validate, validate_scaled, test, test_scaled)

## Train and Score

In [18]:
def train_score(X_train, y_train): 
   """
   Purpose
      to create, train, and score linear regression models using polynomial features
   Parameters
      X: dataframe containing X subset of features for the data subset
      y: dataframe with series containing the target variable
      selectors: list of different feature and degree combinations for use with models
      fit_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
      fit_y_train: optional, X_train subset of data to fit the model when needing to score validation or test subsets
   Returns
      pf_description: dataFrame containing the scores, features, and parameters of the created models
   """

   #create lists of features
   feat_combos = get_features(X_train, y_train)

   #create a lists of parameters
   pf_parameters = [2,3]
   lars_parameters = [.25, .5, .75, 1]
   glm_parameters = [(0,0), (0,.25), (0,.5), (0,.75), (0,1)]

   #use list with product to create tuples of feature/parameter combination to feed into model
   pf_selectors = list(product(feat_combos, pf_parameters))

   #run ols model with feature combinations
   pf_descriptions = pf_mod(X_train, y_train, pf_selectors)
   olf_descriptions = ols_mod(X_train, y_train, feat_combos)
   lars_descriptions = lars_mod(X_train, y_train, lars_parameters)
   glm_descriptions = GLM_mod(X_train, y_train, glm_parameters)

   #calc rmse
   rmse = mean_squared_error(y_train.logerror, y_train.pred_median, squared=False)

   #create empty dataframe to hold model descriptions
   model_descriptions = pd.DataFrame([['pred_median', rmse, 0, 'N/A', 'N/A']], columns=['Name','RMSE', 'r^2 score','Features', 'Parameters'])

   #create df for model scores on the train scores
   model_scores = pd.DataFrame({}, columns=['Model', 'r^2 score'])
   model_scores = model_scores.set_index('Model')
   model_descriptions = pd.concat([model_descriptions, pf_descriptions, lars_descriptions, olf_descriptions, glm_descriptions])
   model_descriptions = model_descriptions.set_index('Name')
   for idx, model in enumerate(y_train.drop(columns='logerror').columns):
      model_descriptions.loc[model, 'r^2 score'] = explained_variance_score(y_train['logerror'], y_train[model])

   return round(model_descriptions,2)

In [19]:
train_score(X_train, y_train)

Unnamed: 0_level_0,RMSE,r^2 score,Features,Parameters
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pred_median,0.175289,0.0,,
Polynomial_1,0.122787,0.51,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 2
Polynomial_2,0.110753,0.6,"[basementsqft, bathrooms, bedrooms, area, gara...",Degree: 3
Polynomial_3,0.174689,0.0,"[area, bedrooms, bathrooms]",Degree: 2
Polynomial_4,0.174609,0.0,"[area, bedrooms, bathrooms]",Degree: 3
Polynomial_5,0.174722,0.0,"[est_tax_rate, area]",Degree: 2
Polynomial_6,0.174654,0.0,"[est_tax_rate, area]",Degree: 3
Polynomial_7,0.174601,0.0,"[est_tax_rate, area, age]",Degree: 2
Polynomial_8,0.174475,0.0,"[est_tax_rate, area, age]",Degree: 3
Polynomial_9,0.174518,0.0,"[est_tax_rate, area, age, county_Orange County...",Degree: 2
