# Housing Prices: Model Selection

## 1. Preprocessing Pipeline

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config
from sklearn.compose import ColumnTransformer

In [33]:
# Load the training data from a shared Google Drive file.
# The share link carries the file id as its second-to-last path segment;
# the direct-download endpoint only needs that id.
url = "https://drive.google.com/file/d/1NFHZhCOxgW1bu5q32OqVIVBDtSo2Alkh/view?usp=sharing"
path = "https://drive.google.com/uc?export=download&id=" + url.rsplit("/", 2)[-2]
df = pd.read_csv(path)
data = df  # both names point at the same frame
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [34]:
# Per-column dtype and non-null count of the raw training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

In [35]:
# Number of missing values in each column of the raw training frame.
data.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 81, dtype: int64

In [36]:
# Separate the target ("Expensive") from the predictors.
# "Id" is left out of X as well, so it is never used as a feature.
y = data["Expensive"].copy()
X = data.drop(columns=["Id", "Expensive"])

In [37]:
# Confirm that "Id" and the target are gone from the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   MSZoning       1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Heating        1460 non-null   object 
 12  Street         1460 non-null   object 
 13  CentralAir     1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  ExterQual      1460 non-null   object 
 16  ExterCond      1460 non-null   object 
 17  BsmtQual       1423 non-null   object 
 18  BsmtCond

In [38]:
# Peek at the target vector.
y

0       0
1       0
2       0
3       0
4       0
       ..
1455    0
1456    0
1457    1
1458    0
1459    0
Name: Expensive, Length: 1460, dtype: int64

In [39]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

In [40]:
# Dtypes and non-null counts of the training portion only.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 772 to 1391
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1168 non-null   int64  
 1   LotFrontage    966 non-null    float64
 2   TotalBsmtSF    1168 non-null   int64  
 3   BedroomAbvGr   1168 non-null   int64  
 4   Fireplaces     1168 non-null   int64  
 5   PoolArea       1168 non-null   int64  
 6   GarageCars     1168 non-null   int64  
 7   WoodDeckSF     1168 non-null   int64  
 8   ScreenPorch    1168 non-null   int64  
 9   MSZoning       1168 non-null   object 
 10  Condition1     1168 non-null   object 
 11  Heating        1168 non-null   object 
 12  Street         1168 non-null   object 
 13  CentralAir     1168 non-null   object 
 14  Foundation     1168 non-null   object 
 15  ExterQual      1168 non-null   object 
 16  ExterCond      1168 non-null   object 
 17  BsmtQual       1141 non-null   object 
 18  BsmtCo

In [None]:
# X.select_dtypes(exclude="number").columns[[6,7,8,9,10,11,12,13,15,29,30,34,35,36,38,39]]

In [41]:
# split the features into categorical (non-numeric) and numerical columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

In [42]:
# Names of the numerical feature columns.
X_num.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold'],
      dtype='object')

In [43]:
# Names of the categorical feature columns.
X_cat.columns

Index(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')

In [44]:
# How many categorical columns the encoders have to cover.
len(X_cat.columns)

43

**Competition Data**

In [45]:
# Load the competition (unlabelled) data from a shared Google Drive file.
# As with the training data, the file id is the second-to-last path segment
# of the share link.
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path_comp = "https://drive.google.com/uc?export=download&id=" + url.rsplit("/", 2)[-2]
df = pd.read_csv(path_comp)
data_comp = df  # both names point at the same frame
data_comp

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [46]:
# Per-column dtype and non-null count of the competition frame.
data_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1459 non-null   int64  
 1   LotFrontage    1232 non-null   float64
 2   TotalBsmtSF    1458 non-null   float64
 3   BedroomAbvGr   1459 non-null   int64  
 4   Fireplaces     1459 non-null   int64  
 5   PoolArea       1459 non-null   int64  
 6   GarageCars     1458 non-null   float64
 7   WoodDeckSF     1459 non-null   int64  
 8   ScreenPorch    1459 non-null   int64  
 9   MSZoning       1455 non-null   object 
 10  Condition1     1459 non-null   object 
 11  Heating        1459 non-null   object 
 12  Street         1459 non-null   object 
 13  CentralAir     1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  ExterQual      1459 non-null   object 
 16  ExterCond      1459 non-null   object 
 17  BsmtQual       1415 non-null   object 
 18  BsmtCond

In [47]:
# Keep the competition Ids for the submission file and build the feature frame
# without them. A non-mutating drop (instead of pop) leaves data_comp intact,
# so this cell can be re-run without a KeyError on "Id", and X_test_comp is a
# separate frame rather than an alias of data_comp.
id_comp = data_comp["Id"].copy()

X_test_comp = data_comp.drop(columns="Id")

In [48]:
# Competition features, without the "Id" column.
X_test_comp

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [49]:
# Dtypes and non-null counts of the competition features.
X_test_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1459 non-null   int64  
 1   LotFrontage    1232 non-null   float64
 2   TotalBsmtSF    1458 non-null   float64
 3   BedroomAbvGr   1459 non-null   int64  
 4   Fireplaces     1459 non-null   int64  
 5   PoolArea       1459 non-null   int64  
 6   GarageCars     1458 non-null   float64
 7   WoodDeckSF     1459 non-null   int64  
 8   ScreenPorch    1459 non-null   int64  
 9   MSZoning       1455 non-null   object 
 10  Condition1     1459 non-null   object 
 11  Heating        1459 non-null   object 
 12  Street         1459 non-null   object 
 13  CentralAir     1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  ExterQual      1459 non-null   object 
 16  ExterCond      1459 non-null   object 
 17  BsmtQual       1415 non-null   object 
 18  BsmtCond

In [50]:
# --- numerical branch ---------------------------------------------------------
# Impute missing numeric values. With strategy="constant" and no fill_value,
# SimpleImputer fills numeric data with 0. The search grids later switch this
# strategy between "mean" and "constant".
numeric_pipe = make_pipeline(SimpleImputer(strategy="constant"))

# --- categorical branch -------------------------------------------------------
# The categorical imputer at the head of categorical_pipe returns a NumPy array,
# so column names are gone by the time the encoders run. The encoders therefore
# pick their columns by position within X_cat, which .get_indexer() provides.

# Ordered categories (lowest -> highest) for every ordinal feature.
# "N_A" is the placeholder the categorical imputer writes for missing values,
# so it is always the lowest level.
# NOTE(review): pandas' default CSV parsing treats the string "NA" as missing,
# so the literal "NA" levels below are presumably never observed — confirm.
ExterQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["N_A", "NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["N_A", "NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["N_A", "NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["N_A", "NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
BsmtFinType2_cats = ["N_A", "NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["N_A", "NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["N_A", "IR3", "IR2", "IR1", "Reg"]
Utilities_cats = ["N_A", "ELO", "NoSeWa", "NoSewr", "AllPub"]
HeatingQC_cats = ["N_A", "Po", "Fa", "TA", "Gd", "Ex"]
CentralAir_cats = ["N_A", "N", "Y"]
Functional_cats = ["N_A", "Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"]
GarageFinish_cats = ["N_A", "NA", "Unf", "RFn", "Fin"]
GarageQual_cats = ["N_A", "NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["N_A", "NA", "Po", "Fa", "TA", "Gd", "Ex"]
PavedDrive_cats = ["N_A", "N", "P", "Y"]
PoolQC_cats = ["N_A", "NA", "Fa", "TA", "Gd", "Ex"]
Fence_cats = ["N_A", "NA", "MnWw", "GdWo", "MnPrv", "GdPrv"]

# One mapping from ordinal column to its ordered categories. OrdinalEncoder
# pairs columns and category lists purely by position, so deriving both from the
# same dict keeps them from drifting out of step.
ordinal_categories = {
    "ExterQual": ExterQual_cats,
    "ExterCond": ExterCond_cats,
    "BsmtQual": BsmtQual_cats,
    "BsmtCond": BsmtCond_cats,
    "BsmtExposure": BsmtExposure_cats,
    "BsmtFinType1": BsmtFinType1_cats,
    "BsmtFinType2": BsmtFinType2_cats,
    "KitchenQual": KitchenQual_cats,
    "FireplaceQu": FireplaceQu_cats,
    "LotShape": LotShape_cats,
    "Utilities": Utilities_cats,
    "HeatingQC": HeatingQC_cats,
    "CentralAir": CentralAir_cats,
    "Functional": Functional_cats,
    "GarageFinish": GarageFinish_cats,
    "GarageQual": GarageQual_cats,
    "GarageCond": GarageCond_cats,
    "PavedDrive": PavedDrive_cats,
    "PoolQC": PoolQC_cats,
    "Fence": Fence_cats,
}
ordinal_names = list(ordinal_categories)

# Nominal features: no meaningful order, so they are one-hot encoded.
onehot_names = [
    "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "LandSlope",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "Electrical", "GarageType", "MiscFeature",
    "SaleType", "SaleCondition",
]

# get_indexer() returns -1 for a label it cannot find, and -1 used as a column
# position silently selects the LAST column. Fail loudly on a typo instead.
_unknown = [c for c in ordinal_names + onehot_names if c not in X_cat.columns]
if _unknown:
    raise KeyError(f"Columns not found in X_cat: {_unknown}")

# A categorical column assigned to neither encoder would be dropped silently by
# the ColumnTransformer (its default is remainder="drop").
_unassigned = [
    c for c in X_cat.columns if c not in ordinal_names and c not in onehot_names
]
if _unassigned:
    raise ValueError(f"Categorical columns without an encoder: {_unassigned}")

ordinal_cols = X_cat.columns.get_indexer(ordinal_names)
onehot_cols = X_cat.columns.get_indexer(onehot_names)
cats_ord = list(ordinal_categories.values())

# categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=cats_ord), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),
    ]
)

# categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    categorical_encoder,
)

# full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

## 2. Modelling 

### 2.1 Decision Tree

In [51]:
# full pipeline: preprocessor + scaler + model
# random_state fixes how the tree breaks ties between equally good splits, so
# re-running this cell gives the same search result (same seed as the
# train/test split).
full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    DecisionTreeClassifier(random_state=31416),
)

# hyperparameters to try; the keys follow make_pipeline's auto-generated step names
# NOTE(review): tree splits depend only on the ordering of feature values, so the
# two scaler switches most likely just multiply the number of fits by four.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
}

# exhaustive search, scored by accuracy over 5 cross-validation folds
search = GridSearchCV(
    full_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
)

# fit
search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models
scores = {"dtree": search.best_score_}

scores

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


{'dtree': 0.9315102160595721}

In [52]:
# Parameter combination with the best cross-validated accuracy in the grid search.
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'constant',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 3,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': False}

In [53]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, search.predict(X_test))

0.928082191780822

In [57]:
# full pipeline: preprocessor + scaler + model
# random_state on the tree makes each fit repeatable (same seed as the
# train/test split).
full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    DecisionTreeClassifier(random_state=31416),
)

# candidate values to sample from; keys follow make_pipeline's step names
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
    "decisiontreeclassifier__min_samples_split": range(3, 40, 5),
    "decisiontreeclassifier__criterion": ["gini", "entropy"],
}

# define RandomizedSearchCV (n_iter is left at its default of 10 candidates)
# random_state fixes which candidates are drawn, so the reported score and
# best_params_ do not change from run to run.
search_rand = RandomizedSearchCV(
    full_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    random_state=31416,
)

# fit
search_rand.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models
scores_rand = {"dtree": search_rand.best_score_}

scores_rand

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'dtree': 0.9272257070540333}

In [58]:
# Best parameter combination among the randomly sampled candidates.
search_rand.best_params_

{'standardscaler__with_std': False,
 'standardscaler__with_mean': True,
 'decisiontreeclassifier__min_samples_split': 18,
 'decisiontreeclassifier__min_samples_leaf': 9,
 'decisiontreeclassifier__max_depth': 10,
 'decisiontreeclassifier__criterion': 'gini',
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [63]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, search_rand.predict(X_test))

0.928082191780822

**Competition**

In [64]:
# Predict the competition labels with the best estimator of the randomized decision-tree search.
y_test_pred = search_rand.predict(X_test_comp)

In [65]:
# Peek at the predicted labels.
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [66]:
# Submission table: each competition Id next to its predicted label.
my_sub_df = id_comp.to_frame().assign(Expensive=y_test_pred)
my_sub_df

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [67]:
# Write the submission without the row index, leaving only the Id and Expensive columns.
my_sub_df.to_csv("my_sub_dt_1.csv", index=False)

In [68]:
# Colab-only step: google.colab is importable only inside a Colab runtime.
# NOTE(review): files.download presumably hands the CSV to the browser for download — confirm.
from google.colab import files
files.download("my_sub_dt_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 2.2 KNN Model

In [None]:
# Competition Ids kept aside for the submission files.
id_comp

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [None]:
# full pipeline: preprocessor + scaler + model
knn_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    KNeighborsClassifier(),
)

# hyperparameters to try; keys follow make_pipeline's auto-generated step names
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# exhaustive search, scored by accuracy over 5 cross-validation folds.
# The grid expands to 768 candidates (3840 fits); n_jobs=-1 spreads the fits
# over all available cores so the cell can finish without being interrupted.
# Parallelism does not change which candidate wins.
knn_search = GridSearchCV(
    knn_full_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1,
)

# fit
knn_search.fit(X_train, y_train)

# add to dictionary
scores["knn"] = knn_search.best_score_

scores

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


KeyboardInterrupt: ignored

In [None]:
# Parameter combination with the best cross-validated accuracy in the KNN grid search.
knn_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'constant',
 'kneighborsclassifier__n_neighbors': 9,
 'kneighborsclassifier__weights': 'distance',
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True}

In [None]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, knn_search.predict(X_test))



0.9452054794520548

In [None]:
# full pipeline: preprocessor + scaler + model
knn_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    KNeighborsClassifier(),
)

# candidate values to sample from; keys follow make_pipeline's step names
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# define RandomizedSearchCV (n_iter is left at its default of 10 candidates)
# random_state fixes which candidates are drawn, so the reported score and
# best_params_ do not change from run to run.
knn_search_rand = RandomizedSearchCV(
    knn_full_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    random_state=31416,
)

# fit
knn_search_rand.fit(X_train, y_train)

# add to dictionary
scores_rand["knn"] = knn_search_rand.best_score_

scores_rand

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'dtree': 0.9272257070540333, 'knn': 0.9255236418326547}

In [None]:
# Best parameter combination among the randomly sampled KNN candidates.
knn_search_rand.best_params_

{'standardscaler__with_std': True,
 'standardscaler__with_mean': False,
 'kneighborsclassifier__weights': 'distance',
 'kneighborsclassifier__n_neighbors': 14,
 'columntransformer__num_pipe__simpleimputer__strategy': 'constant'}

In [None]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, knn_search_rand.predict(X_test))

0.9417808219178082

In [None]:
# Predict the competition labels with the best estimator of the randomized KNN search.
y_test_pred_knn = knn_search_rand.predict(X_test_comp)

In [None]:
# Submission table: each competition Id next to its KNN prediction.
my_sub_df_1 = id_comp.to_frame().assign(Expensive=y_test_pred_knn)

In [None]:
# Inspect the KNN submission table.
my_sub_df_1

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [None]:
# Write the KNN submission without the row index, leaving only the Id and Expensive columns.
my_sub_df_1.to_csv("my_sub_knn_1.csv", index=False)

In [None]:
# Colab-only step: google.colab is importable only inside a Colab runtime.
# NOTE(review): files.download presumably hands the CSV to the browser for download — confirm.
from google.colab import files
files.download("my_sub_knn_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 2.3 Random Forest

In [69]:
# full pipeline: preprocessor + scaler + model
# random_state makes the forest's bootstrap samples and feature draws repeatable
# (same seed as the train/test split).
forest_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    RandomForestClassifier(random_state=31416),
)

# hyperparameters to try; keys follow make_pipeline's auto-generated step names
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestclassifier__n_estimators": [100, 200],
    "randomforestclassifier__max_depth": range(2, 14),
    "randomforestclassifier__min_samples_leaf": range(2, 10),
}

# exhaustive search, scored by accuracy over 5 cross-validation folds.
# The grid expands to 1536 candidates (7680 fits); n_jobs=-1 spreads the fits
# over all available cores so the cell can finish without being interrupted.
forest_search = GridSearchCV(
    forest_full_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1,
)

# fit
forest_search.fit(X_train, y_train)

# add to dictionary
scores["forest"] = forest_search.best_score_

scores

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits


KeyboardInterrupt: ignored

In [None]:
# Parameter combination with the best cross-validated accuracy in the forest grid search.
forest_search.best_params_

In [None]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, forest_search.predict(X_test))



0.9486301369863014

In [None]:
# full pipeline: preprocessor + scaler + model
# random_state makes the forest's bootstrap samples and feature draws repeatable
# (same seed as the train/test split).
forest_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    RandomForestClassifier(random_state=31416),
)

# candidate values to sample from; keys follow make_pipeline's step names
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "constant"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestclassifier__n_estimators": [10, 20],
    "randomforestclassifier__max_depth": range(2, 14),
    "randomforestclassifier__min_samples_leaf": range(2, 10),
    "randomforestclassifier__criterion": ["gini", "entropy"],
}

# define RandomizedSearchCV: 20 sampled candidates, 5-fold CV each.
# random_state fixes which candidates are drawn, so the reported score and
# best_params_ do not change from run to run.
forest_search_rand = RandomizedSearchCV(
    forest_full_pipeline,
    param_grid,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    verbose=1,
    random_state=31416,
)

# fit
forest_search_rand.fit(X_train, y_train)

# add to dictionary
scores_rand["forest"] = forest_search_rand.best_score_

scores_rand

Fitting 5 folds for each of 20 candidates, totalling 100 fits


{'dtree': 0.9272257070540333,
 'knn': 0.9255236418326547,
 'forest': 0.9460768130296028}

In [None]:
# Best parameter combination among the randomly sampled forest candidates.
forest_search_rand.best_params_

{'standardscaler__with_std': True,
 'standardscaler__with_mean': False,
 'randomforestclassifier__n_estimators': 200,
 'randomforestclassifier__min_samples_leaf': 9,
 'randomforestclassifier__max_depth': 12,
 'randomforestclassifier__criterion': 'entropy',
 'columntransformer__num_pipe__simpleimputer__strategy': 'constant'}

In [None]:
# test accuracy (accuracy_score expects y_true first, then y_pred)
accuracy_score(y_test, forest_search_rand.predict(X_test))

0.9486301369863014

In [None]:
# Predict the competition labels with the best estimator of the randomized forest search.
y_test_pred_rf = forest_search_rand.predict(X_test_comp)

In [None]:
# Submission table: each competition Id next to its random-forest prediction.
my_sub_df_2 = id_comp.to_frame().assign(Expensive=y_test_pred_rf)

In [None]:
# Inspect the random-forest submission table.
my_sub_df_2

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [None]:
# Write the random-forest submission without the row index, leaving only the Id and Expensive columns.
my_sub_df_2.to_csv("my_sub_rf_1.csv", index=False)

In [None]:
# Colab-only step: google.colab is importable only inside a Colab runtime.
# NOTE(review): files.download presumably hands the CSV to the browser for download — confirm.
from google.colab import files
files.download("my_sub_rf_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**GridSearch**

In [None]:
# One-row table of the best cross-validated accuracy per model from the grid searches.
pd.DataFrame(scores, index=["best_score_"])

Unnamed: 0,dtree,knn,forest
best_score_,0.930659,0.92808,0.952925


**RandomizedSearch**

In [None]:
# One-row table of the best cross-validated accuracy per model from the randomized searches.
pd.DataFrame(scores_rand, index=["best_score_"])

Unnamed: 0,dtree,knn,forest
best_score_,0.929786,0.922094,0.947779
