# Housing 6: Model Selection

## 1. Preprocessing Pipeline

## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

## Reading the CSV File

In [None]:
# Load the dataset: turn the Google Drive share link into a direct-download
# link, then read it as CSV.
url = "https://drive.google.com/file/d/19URNvJqMVhZH9A_l9q8XU31klPD1px3K/view?usp=sharing"
file_id = url.split('/')[-2]  # second-to-last path segment of the share link
path = f"https://drive.google.com/uc?export=download&id={file_id}"
data = pd.read_csv(path)

## Exploring Dataset

In [None]:
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

In [None]:
# Work on a copy so later column removals on data1 do not alter `data`.
data1 = data.copy()

In [None]:
# Remove the "Id" column from data1 and keep it as a separate Series.
# NOTE(review): the name `id` shadows Python's built-in id(). Because pop()
# mutates data1, re-running this cell without first re-creating data1 raises
# KeyError.
id = data1.pop("Id")

In [None]:
id

0          1
1          2
2          3
3          4
4          5
        ... 
1455    1456
1456    1457
1457    1458
1458    1459
1459    1460
Name: Id, Length: 1460, dtype: int64

In [None]:
data1.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'B

## X and y Creation, Splitting and building pipeline

In [None]:
# X and y creation
# Build X with drop() rather than popping the target out of data1 in place:
# data1 keeps all of its columns, so this cell can be re-run without a KeyError.
y = data1["Expensive"]
X = data1.drop(columns="Expensive")

# data splitting: hold out 20% of the rows for testing; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# PREPROCESSOR

# Ordinal Encoding

In [None]:
# building the pipeline
# Split the feature columns by dtype: numeric columns vs. everything else.
X_num = X.select_dtypes(include="number").copy()
X_cat = X.select_dtypes(exclude="number").copy()

# Numeric branch: fill missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: fill missing values with the constant "NA", then one-hot
# encode; categories unseen during fit are ignored at transform time.
categoric_pipe = make_pipeline(
    SimpleImputer(fill_value="NA", strategy="constant"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [None]:
# Ordered category lists (lowest -> highest) for the ordinal-encoded columns.
# Every list starts with "NA" because the categorical pipeline replaces missing
# values with the string "NA" before encoding; a list without it would make the
# OrdinalEncoder fail on any row where that column was missing.
ExterQual_cats =    ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats =    ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
# "NA" added here for consistency with the other lists (it was the only one
# without it).
KitchenQual_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats =     ["NA", 'IR3', 'IR2', 'IR1', 'Reg']
HeatingQC_cats =    ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA', 'Unf', 'RFn', 'Fin']
GarageQual_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats =       ["NA", "Fa", "TA", "Gd", "Ex"]
Fence_cats =        ["NA", 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
Utilities_cats =    ["NA", "ELO", "NoSeWa", "NoSewr", "AllPub"]
CentralAir_cats =   ["NA", "N", "Y"]
Functional_cats =   ["NA", "Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"]
BsmtFinType2_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
LandContour_cats =  ["NA", "Low", "HLS", "Bnk", "Lvl"]

# The order of this list must match the order of the column names handed to the
# OrdinalEncoder: entry i holds the categories for ordinal column i.
ordinal_cats1 = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats,
                 BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats,
                 FireplaceQu_cats, LotShape_cats, HeatingQC_cats, GarageFinish_cats,
                 GarageQual_cats, GarageCond_cats, PoolQC_cats,
                 Fence_cats, Utilities_cats, CentralAir_cats,
                 Functional_cats, BsmtFinType2_cats, LandContour_cats]

In [None]:
### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
# Names of the ordinal columns, in the same order as the category lists in
# ordinal_cats1 (entry i of ordinal_cats1 describes column i of this list).
ordinal_col_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                     'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape',
                     'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond',
                     'PoolQC', 'Fence', 'Utilities', 'CentralAir', 'Functional',
                     'BsmtFinType2', 'LandContour']

# get_indexer() returns -1 for a name it cannot find, which would silently
# select the last column; fail loudly instead.
missing_cols = [col for col in ordinal_col_names if col not in X_cat.columns]
if missing_cols:
    raise KeyError(f"Ordinal columns not found in X_cat: {missing_cols}")

# Positional indices (not names) are used because this encoder sits behind a
# SimpleImputer, which by default passes on a plain array without column names.
ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)

# All remaining categorical columns are one-hot encoded. The exclusion is done
# on column NAMES: subtracting the integer positions in ordinal_cols from a set
# of names removes nothing, which would one-hot encode the ordinal columns too.
# A list comprehension (rather than a set) also keeps the column order stable.
ohe_col_names = [col for col in X_cat.columns if col not in ordinal_col_names]
ohe_cols = X_cat.columns.get_indexer(ohe_col_names)


categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats1), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

## 3.3. categorical pipeline = "NA" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

## 2. Modelling (Decision Tree)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# random_state is fixed so that repeated runs of this cell give the same
# cross-validation score; without it the tree is not reproducible (the scores
# recorded in this notebook differ between runs).
full_pipeline = make_pipeline(full_preprocessing,
                              scaler,
                              DecisionTreeClassifier(random_state=1))

# Keys follow make_pipeline's naming: lowercase class name of each step, with
# "__" separating nested steps and the parameter name.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2)
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
dec_search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      scoring='accuracy',
                      verbose=1)

dec_search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores = {"dtree" : dec_search.best_score_}

scores

Fitting 5 folds for each of 60 candidates, totalling 300 fits


{'dtree': 0.9392061919958914}

In [None]:
dec_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 3}

In [None]:
# Predict on the held-out test split with the fitted decision-tree grid search
# and report the accuracy against the true labels.
y_test_pred_dec = dec_search.predict(X_test)
accuracy_score(y_test, y_test_pred_dec)

0.9246575342465754

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
scaler = StandardScaler()

# random_state is fixed so the forest (bootstrap samples and feature subsets)
# is the same on every run; without it the cross-validation score changes
# between runs (the scores recorded in this notebook differ).
# max_depth here is only a starting value: the grid below sets it per candidate.
rfc = RandomForestClassifier(n_estimators=3,
                             max_depth=2,
                             random_state=1
                             )

rfc_full_pipeline = make_pipeline(full_preprocessing,
                                  scaler,
                                  rfc
                                 )

# Keys follow make_pipeline's naming: lowercase class name of each step, with
# "__" separating nested steps and the parameter name.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "randomforestclassifier__max_depth": range(2, 14, 2),
    "randomforestclassifier__min_samples_leaf": range(3, 12, 2),
    "randomforestclassifier__criterion": ["gini", "entropy"]
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
rfc_search = GridSearchCV(rfc_full_pipeline,
                      param_grid,
                      cv=5,
                      scoring='accuracy',
                      verbose=1)

rfc_search.fit(X_train, y_train)
# Record the best mean cross-validation accuracy next to the other models.
scores["rfc"] = rfc_search.best_score_

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [None]:
rfc_search.best_score_

0.9383624958732254

In [None]:
# Predict on the held-out test split with the fitted random-forest grid search
# and report the accuracy against the true labels.
y_test_pred_rfc = rfc_search.predict(X_test)
accuracy_score(y_test, y_test_pred_rfc)

0.9486301369863014

# KNN

In [None]:
# solution
from sklearn.neighbors import KNeighborsClassifier

# Preprocess -> standardise -> k-nearest-neighbours classifier.
knn_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    KNeighborsClassifier(),
)

# Search space: imputation strategy for numeric columns, number of neighbours
# (2..49) and the neighbour weighting scheme.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# Exhaustive search with 5-fold cross-validation, scored by accuracy.
knn_search = GridSearchCV(
    knn_full_pipeline,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
knn_search.fit(X_train, y_train)

# Record the best mean cross-validation accuracy next to the other models.
scores["knn"] = knn_search.best_score_

Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [None]:
knn_search.best_score_

0.9298008143501706

In [None]:
knn_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'kneighborsclassifier__n_neighbors': 6,
 'kneighborsclassifier__weights': 'distance'}

In [None]:
# Predict on the held-out test split with the fitted KNN grid search and report
# the accuracy against the true labels.
y_test_pred_knn = knn_search.predict(X_test)

accuracy_score(y_test, y_test_pred_knn)

0.9623287671232876

In [None]:
# Show the collected cross-validation scores side by side, one column per model.
pd.DataFrame(scores, index=["best_score_"])

Unnamed: 0,dtree,rfc,knn
best_score_,0.93834,0.935769,0.929801


In [None]:
#other models:
#from sklearn.model_selection import train_test_split
#from sklearn.datasets import make_moons, make_circles, make_classification

#from sklearn.neural_network import MLPClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# Named svm_full_pipeline (like rfc_full_pipeline / knn_full_pipeline) so the
# decision-tree pipeline stored in full_pipeline is not overwritten.
svm_full_pipeline = make_pipeline(full_preprocessing,
                                  scaler,
                                  SVC(gamma='auto'))

# The grid keys must address steps that exist in THIS pipeline. make_pipeline
# names the SVC step "svc", so its parameters are "svc__<param>"; the
# "decisiontreeclassifier__*" keys of the tree model would make
# GridSearchCV.fit raise an "Invalid parameter" error here.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "svc__C": [0.1, 1, 10, 100],
    "svc__kernel": ["linear", "rbf"]
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by accuracy.
svm_search = GridSearchCV(svm_full_pipeline,
                      param_grid,
                      cv=5,
                      scoring='accuracy',
                      verbose=1)

svm_search.fit(X_train, y_train)