In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PolynomialFeatures, Normalizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the data

In [2]:
data = pd.read_csv("./train.csv")

In [3]:
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Data Analysis

Checking for missing data and dropping 'Id' column since its just an index column

In [4]:
data.drop(columns=["Id"], inplace=True)

In [5]:
def get_missing_percentages(df):
  missing = df.isnull().sum() * 100 / len(df)
  return missing[missing > 0].sort_values(ascending=False)

In [6]:
missing_percentages = get_missing_percentages(data)

In [7]:
missing_percentages[missing_percentages > 10]

PoolQC         99.520548
MiscFeature    96.301370
Alley          93.767123
Fence          80.753425
FireplaceQu    47.260274
LotFrontage    17.739726
dtype: float64

In [8]:
data[missing_percentages[missing_percentages > 10].index]

Unnamed: 0,PoolQC,MiscFeature,Alley,Fence,FireplaceQu,LotFrontage
0,,,,,,65.0
1,,,,,TA,80.0
2,,,,,TA,68.0
3,,,,,Gd,60.0
4,,,,,TA,84.0
...,...,...,...,...,...,...
1455,,,,,TA,62.0
1456,,,,MnPrv,TA,85.0
1457,,Shed,,GdPrv,Gd,66.0
1458,,,,,,68.0


After reading the feature descriptions for each feature here, we can see that all features but 'LotFrontage' includes NaN as an actual value that represents something, so instead of deleting them, we'll just fill them with some string so they don't count and missing values

In [9]:
column_nan_has_value = missing_percentages[missing_percentages > 10].index[:-1]
column_nan_has_value

Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], dtype='object')

In [10]:
def fill_columns_with_nan_value(df):
  return df[column_nan_has_value].fillna("None")

In [11]:
data[column_nan_has_value] = fill_columns_with_nan_value(data)
data[column_nan_has_value]

Unnamed: 0,PoolQC,MiscFeature,Alley,Fence,FireplaceQu
0,,,,,
1,,,,,TA
2,,,,,TA
3,,,,,Gd
4,,,,,TA
...,...,...,...,...,...
1455,,,,,TA
1456,,,,MnPrv,TA
1457,,Shed,,GdPrv,Gd
1458,,,,,


The last missing feature with "high" missing percentage is 'LotFrontage', its missing percentage is not too high, so we'll just fill the missing values with the mean of the feature

In [12]:
data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mean())

### Now taking a look at features with "low" missing percentages
we can see that there's groups forming in these features, meaning the same rows missing the same columns, so lets get unique values and see our missing percentage

In [13]:
missing_percentages[missing_percentages < 10]

GarageType      5.547945
GarageYrBlt     5.547945
GarageFinish    5.547945
GarageQual      5.547945
GarageCond      5.547945
BsmtExposure    2.602740
BsmtFinType2    2.602740
BsmtFinType1    2.534247
BsmtCond        2.534247
BsmtQual        2.534247
MasVnrArea      0.547945
MasVnrType      0.547945
Electrical      0.068493
dtype: float64

Now we can see our distinct groups, they don't seem to be a lot so we'll just delete the rows with missing data

In [14]:
missing_percentages[missing_percentages < 10].unique()

array([5.54794521, 2.60273973, 2.53424658, 0.54794521, 0.06849315])

In [15]:
def delete_missing_less_than(df, threshhold):
  percentages = get_missing_percentages(df)
  for column in percentages[percentages < threshhold].index:
    df = df[~df[column].isna()]
    df.reset_index(drop=True, inplace=True)
  return df

In [16]:
data = delete_missing_less_than(data, 10)
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1334,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1335,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1336,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


## Dealing with skewed data
we'll set our threshold to count a feature as "skewed" to 0.5

In [17]:
skewed_data = data.skew(numeric_only=True)
skewed_above_half = skewed_data[skewed_data > 0.5]
skewed_above_half

MSSubClass        1.391901
LotFrontage       2.479517
LotArea          11.938124
OverallCond       0.897541
MasVnrArea        2.582685
BsmtFinSF1        1.693397
BsmtFinSF2        4.146519
BsmtUnfSF         0.935363
TotalBsmtSF       2.214029
1stFlrSF          1.397431
2ndFlrSF          0.769147
LowQualFinSF     10.566815
GrLivArea         1.430307
BsmtHalfBath      3.847909
HalfBath          0.554013
KitchenAbvGr      5.943561
TotRmsAbvGrd      0.673364
Fireplaces        0.570838
GarageArea        0.807078
WoodDeckSF        1.481740
OpenPorchSF       2.250952
EnclosedPorch     3.205286
3SsnPorch        10.096553
ScreenPorch       3.916848
PoolArea         14.187832
MiscVal          24.632578
SalePrice         1.943686
dtype: float64

getting the skewed columns to "unskew" them

In [18]:
columns_to_log_transform = skewed_above_half.index
columns_to_log_transform

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtHalfBath', 'HalfBath',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice'],
      dtype='object')

Applying Log(x + 1) to the missing features

In [19]:
for column in columns_to_log_transform:
  data[column] = np.log1p(data[column])

In [20]:
data[columns_to_log_transform]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,Fireplaces,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SalePrice
0,4.110874,4.189655,9.042040,1.791759,5.283204,6.561031,0.000000,5.017280,6.753438,6.753438,...,0.000000,6.308098,0.000000,4.127134,0.000000,0.0,0.0,0.0,0.000000,12.247699
1,3.044522,4.394449,9.169623,2.197225,0.000000,6.886532,0.000000,5.652489,7.141245,7.141245,...,0.693147,6.133398,5.700444,0.000000,0.000000,0.0,0.0,0.0,0.000000,12.109016
2,4.110874,4.234107,9.328212,1.791759,5.093750,6.188264,0.000000,6.075346,6.825460,6.825460,...,0.693147,6.411818,0.000000,3.761200,0.000000,0.0,0.0,0.0,0.000000,12.317171
3,4.262680,4.110874,9.164401,1.791759,0.000000,5.379897,0.000000,6.293419,6.629363,6.869014,...,0.693147,6.466145,0.000000,3.583519,5.609472,0.0,0.0,0.0,0.000000,11.849405
4,4.110874,4.442651,9.565284,1.791759,5.860786,6.486161,0.000000,6.196444,7.044033,7.044033,...,0.693147,6.729824,5.262690,4.442651,0.000000,0.0,0.0,0.0,0.000000,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,4.110874,4.143135,8.976894,1.791759,0.000000,0.000000,0.000000,6.860664,6.860664,6.860664,...,0.693147,6.133398,0.000000,3.713572,0.000000,0.0,0.0,0.0,0.000000,12.072547
1334,3.044522,4.454347,9.486152,1.945910,4.787492,6.673298,5.099866,6.380123,7.341484,7.637234,...,1.098612,6.216606,5.857933,0.000000,0.000000,0.0,0.0,0.0,0.000000,12.254868
1335,4.262680,4.204693,9.109746,2.302585,0.000000,5.620401,0.000000,6.777647,7.050123,7.080868,...,1.098612,5.533389,0.000000,4.110874,0.000000,0.0,0.0,0.0,7.824446,12.493133
1336,3.044522,4.234107,9.181735,1.945910,0.000000,3.912023,6.937314,0.000000,6.983790,6.983790,...,0.000000,5.484797,5.905362,0.000000,4.727388,0.0,0.0,0.0,0.000000,11.864469


## Dealing with categorical data

Getting all categorical data

In [21]:
cat_columns = data.select_dtypes(include='object').columns
cat_columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

Defining our ordinal features

In [22]:
ordinal_columns = ['LotShape', 'LandContour', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',\
  'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',\
  'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'Electrical']

Getting the rest of the features as nominal features

In [23]:
nominal_columns = cat_columns[~np.isin(cat_columns, ordinal_columns)]
nominal_columns

Index(['MSZoning', 'Street', 'Alley', 'Utilities', 'LotConfig', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
       'Heating', 'CentralAir', 'GarageType', 'MiscFeature', 'SaleType',
       'SaleCondition'],
      dtype='object')

Saving all possible values for each nominal feature, this is done because some features in the training dataset don't have all the possible values they can get, so we need to supply them manually

In [24]:
condition_values = ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe']
exterior_covering_values = ['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other',\
  'Plywood', 'PreCase', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng', 'WdShing', 'BrkComm']
unique_nominal_values = [
  ['A', 'C (all)', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
  ['Grvl', 'Pave'],
  ['Grvl', 'Pave', 'None'],
  ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
  ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
  ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',\
    'NAmes', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
  condition_values,
  condition_values,
  ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
  ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
  ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'],
  ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'],
  exterior_covering_values,
  exterior_covering_values,
  ['BrkCmn', 'BrkFace', 'CBlock', 'None', 'Stone'],
  ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'],
  ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
  ['Y', 'N'],
  ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'None'],
  ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'None'],
  ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI','ConLD', 'Oth'],
  ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']
]

Saving all possible values for ordinal features, same reason as above

In [25]:
quality_values = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
quality_values_nan = [*quality_values, 'None']
base_fin_type_values = ['ALQ', 'GLQ', 'Unf', 'BLQ', 'LwQ', 'Rec', 'None']

unique_ordinal_values = [
  ['Reg', 'IR1', 'IR2', 'IR3'],
  ['Lvl', 'Bnk', 'HLS', 'Low'],
  ['Gtl', 'Mod', 'Sev'],
  quality_values,
  quality_values,
  quality_values_nan,
  quality_values_nan,
  ['Gd', 'Av', 'Mn', 'No', 'None'],
  base_fin_type_values,
  base_fin_type_values,
  quality_values,
  quality_values,
  ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
  quality_values_nan,
  ['Fin', 'RFn', 'Unf', 'None'],
  quality_values_nan,
  quality_values_nan,
  ['Y', 'P', 'N'],
  quality_values_nan,
  ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'None'],
  ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']
]

Fitting a one hot encoder on the nominal features

In [26]:
one_hot_encoder = OneHotEncoder(dtype=np.int16, categories=unique_nominal_values)
one_hot_encoder.fit(data[nominal_columns])

OneHotEncoder(categories=[['A', 'C (all)', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
                          ['Grvl', 'Pave'], ['Grvl', 'Pave', 'None'],
                          ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
                          ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
                          ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr',
                           'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR',
                           'MeadowV', 'Mitchel', 'NAmes', 'NoRidge', 'NPkVill',
                           'NridgHt', 'NWAmes...
                          ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone',
                           'Wood'],
                          ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
                          ['Y', 'N'],
                          ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort',
                           'Detchd', 'None'],
                          ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'None

Fitting an ordinal encoder on the ordinal features

In [27]:
ordinal_encoder = OrdinalEncoder(dtype=np.int16, categories=unique_ordinal_values)
ordinal_encoder.fit(data[ordinal_columns])

OrdinalEncoder(categories=[['Reg', 'IR1', 'IR2', 'IR3'],
                           ['Lvl', 'Bnk', 'HLS', 'Low'], ['Gtl', 'Mod', 'Sev'],
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'],
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'],
                           ['Gd', 'Av', 'Mn', 'No', 'None'],
                           ['ALQ', 'GLQ', 'Unf', 'BLQ', 'LwQ', 'Rec', 'None'],
                           ['ALQ', 'GLQ', 'Unf', 'BLQ', 'LwQ', 'Rec', 'None'],
                           ['Ex', 'Gd'...
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
                           ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev',
                            'Sal'],
                           ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'None'],
                           ['Fin', 'RFn', 'Unf', 'None'],
                           ['Ex', 'Gd', 'TA', 'Fa

Fitting a column transformer that includes the one hot encoder and ordinal encoder

In [28]:
column_transformer = make_column_transformer(
  (one_hot_encoder, nominal_columns),
  (ordinal_encoder, ordinal_columns),
  remainder='passthrough'
)

column_transformer.fit(data.drop(columns=['SalePrice']), data['SalePrice'])

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(categories=[['A', 'C (all)',
                                                            'FV', 'I', 'RH',
                                                            'RL', 'RP', 'RM'],
                                                           ['Grvl', 'Pave'],
                                                           ['Grvl', 'Pave',
                                                            'None'],
                                                           ['AllPub', 'NoSewr',
                                                            'NoSeWa', 'ELO'],
                                                           ['Inside', 'Corner',
                                                            'CulDSac', 'FR2',
                                                            'FR3'],
                                                           ['Blmngtn'

Preparing our pipelines to test

In [29]:
gbt_pipeline = make_pipeline(
  column_transformer,
  GradientBoostingRegressor(random_state=2002)
)

rf_pipeline = make_pipeline(
  column_transformer,
  RandomForestRegressor(n_jobs=-1, random_state=2002)
)

knn_pipeline = make_pipeline(
  column_transformer,
  KNeighborsRegressor(n_jobs=-1)
)

poly_pipeline = make_pipeline(
  column_transformer,
  Normalizer(),
  PolynomialFeatures(interaction_only=True),
  LinearRegression(n_jobs=-1)
)

In [30]:
gbt_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'gradientboostingregressor', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__ordinalencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'columntransformer__ordinalencoder__categories', 'columntransformer__ordinalencoder__dtype', 'columntransformer__ordinalencoder__handle_unknown', 'columntransformer__ordinalencoder__unknown_value', 'gradientboostingregressor__alpha', 'gradientboostingregressor__ccp_alpha', 'gradientboostingregressor__criterion', 'gradientboostingregressor__init', 'gr

In [31]:
rf_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'randomforestregressor', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__ordinalencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'columntransformer__ordinalencoder__categories', 'columntransformer__ordinalencoder__dtype', 'columntransformer__ordinalencoder__handle_unknown', 'columntransformer__ordinalencoder__unknown_value', 'randomforestregressor__bootstrap', 'randomforestregressor__ccp_alpha', 'randomforestregressor__criterion', 'randomforestregressor__max_depth', 'randomforestr

In [32]:
knn_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'kneighborsregressor', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__ordinalencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'columntransformer__ordinalencoder__categories', 'columntransformer__ordinalencoder__dtype', 'columntransformer__ordinalencoder__handle_unknown', 'columntransformer__ordinalencoder__unknown_value', 'kneighborsregressor__algorithm', 'kneighborsregressor__leaf_size', 'kneighborsregressor__metric', 'kneighborsregressor__metric_params', 'kneighborsregressor__n

In [33]:
poly_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'normalizer', 'polynomialfeatures', 'linearregression', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__ordinalencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'columntransformer__ordinalencoder__categories', 'columntransformer__ordinalencoder__dtype', 'columntransformer__ordinalencoder__handle_unknown', 'columntransformer__ordinalencoder__unknown_value', 'normalizer__copy', 'normalizer__norm', 'polynomialfeatures__degree', 'polynomialfeatures__include_bias', 'polynomialfeatures__

Setting the search grid for all pipelines

In [34]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
max_depth = [int(x) for x in np.linspace(5, 74, num = 10)]
max_depth.append(None)
min_samples_split = [5, 10, 16]
min_samples_leaf = [2, 4, 7]
learning_rate = [0.01, 0.1]

gbt_random_grid = {'gradientboostingregressor__n_estimators': n_estimators,
               'gradientboostingregressor__max_depth': max_depth,
               'gradientboostingregressor__min_samples_split': min_samples_split,
               'gradientboostingregressor__min_samples_leaf': min_samples_leaf,
               'gradientboostingregressor__learning_rate': learning_rate}

max_features = [40, 50, 60, 70]

rf_random_grid = {'randomforestregressor__n_estimators': n_estimators,
               'randomforestregressor__max_depth': max_depth,
               'randomforestregressor__min_samples_split': min_samples_split,
               'randomforestregressor__min_samples_leaf': min_samples_leaf,
               'randomforestregressor__max_features': max_features}

knn_random_grid = {
  "kneighborsregressor__n_neighbors": [1, 3, 5, 7, 9, 11]
}

poly_random_grid = {
  "polynomialfeatures__degree": [1]
}


In [35]:
def nested_cv(data, model, params, target, num_of_folds = 5, num_of_iters = 10, debug_messages = True, debug_param_messages = False):
  best_model = None
  best_score = 0
  best_params = None
  count = 1
  X = data.drop(columns=[target])
  y = data[target]

  for train_index, test_index in KFold(n_splits=num_of_folds).split(data):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    inner_best_model = RandomizedSearchCV(
      estimator=model,
      n_iter=num_of_iters,
      param_distributions=params,
      verbose=0,
      cv = 5,
      random_state=2002,
      n_jobs=-1
    )
    inner_best_model.fit(X_train, y_train)

    best_params = inner_best_model.best_params_
    current_model = model.set_params(**best_params)
    current_model.fit(X_train, y_train)
    score = current_model.score(X_test, y_test)

    if score > best_score:
      best_score = score
      best_params = best_params
      best_model = inner_best_model.best_estimator_
      
    if(debug_messages):
      print(f"[Fold {count}/{num_of_folds}] Params {best_params} with score of {score}")
    count += 1

  return {
    "best_model": best_model,
    "best_score": best_score,
    "best_params": best_params
  }

In [36]:
pipelines = [(poly_pipeline, poly_random_grid), (gbt_pipeline, gbt_random_grid), (rf_pipeline, rf_random_grid), (knn_pipeline, knn_random_grid)]

Applying nested CV to all pipelines and saving the best one

In [37]:
best_model = None
for pipeline in pipelines:
  current_pipeline = nested_cv(data, pipeline[0], pipeline[1], 'SalePrice', 5, 5, True, True)
  if best_model == None or best_model["best_score"] < current_pipeline["best_score"]:
    best_model = current_pipeline



[Fold 1/5] Params {'polynomialfeatures__degree': 1} with score of -64108098957202.62




[Fold 2/5] Params {'polynomialfeatures__degree': 1} with score of -361189910700091.3




[Fold 3/5] Params {'polynomialfeatures__degree': 1} with score of -411078667084631.44




[Fold 4/5] Params {'polynomialfeatures__degree': 1} with score of -2.2262920636615212e+16




[Fold 5/5] Params {'polynomialfeatures__degree': 1} with score of -3647274017052627.0
[Fold 1/5] Params {'gradientboostingregressor__n_estimators': 500, 'gradientboostingregressor__min_samples_split': 16, 'gradientboostingregressor__min_samples_leaf': 2, 'gradientboostingregressor__max_depth': 58, 'gradientboostingregressor__learning_rate': 0.01} with score of 0.8680043000375807
[Fold 2/5] Params {'gradientboostingregressor__n_estimators': 500, 'gradientboostingregressor__min_samples_split': 16, 'gradientboostingregressor__min_samples_leaf': 2, 'gradientboostingregressor__max_depth': 58, 'gradientboostingregressor__learning_rate': 0.01} with score of 0.8715327665682651
[Fold 3/5] Params {'gradientboostingregressor__n_estimators': 500, 'gradientboostingregressor__min_samples_split': 16, 'gradientboostingregressor__min_samples_leaf': 2, 'gradientboostingregressor__max_depth': 58, 'gradientboostingregressor__learning_rate': 0.01} with score of 0.8712015327632391
[Fold 4/5] Params {'gradie

In [38]:
best_model["best_score"]

0.8987141287117575

In [39]:
model = best_model["best_model"]

Retrain the best model on all the training data

In [40]:
model.fit(data.drop(columns=['SalePrice']), data['SalePrice'])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories=[['A',
                                                                             'C '
                                                                             '(all)',
                                                                             'FV',
                                                                             'I',
                                                                             'RH',
                                                                             'RL',
                                                                             'RP',
                                                                             'RM'],
                                                                            ['Grvl',
  

Pickle the model

In [41]:
import pickle

pickle.dump(model, open('./pipeline.model', 'wb'))