# FLOW BY LIANE
1. copy existing notebook (so that you have your models)
2. import the competition_data with pd.read_csv() (or with the url etc.)
3. drop the column "id" and store it in a variable
4. run the code that you would also run for X_test
5. there should be a y_pred come out of these operations
6. merge the id with y_pred into a dataframe
7. store this dataframe as a file pd.to_csv()
8. upload manually to Bens Website

# 1. Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

## 1.1. Preprocessing

In [2]:
# reading
url = "https://drive.google.com/file/d/19URNvJqMVhZH9A_l9q8XU31klPD1px3K/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)


# Dropping ID
data1 = data.copy()
id = data1.pop("Id")

# X and y creation
X = data1
y = X.pop("Expensive")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Competition data

## Reading of Competition data:

take out the Id column

In [None]:
# reading
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
comp_data1 = pd.read_csv(path)

In [None]:
comp_data = comp_data1.copy()

In [None]:
X_comp = comp_data


In [None]:
house_id = X_comp.pop("Id")

In [None]:
X_comp.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSZoning',
       'Condition1', 'Heating', 'Street', 'CentralAir', 'Foundation',
       'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtFinType2',

ordinal_col_names = ['ExterQual','ExterCond','BsmtQual', 'BsmtCond','BsmtExposure','BsmtFinType1', 
'KitchenQual','FireplaceQu','LotShape','BsmtFinType2',
'HeatingQC','GarageFinish','GarageQual','GarageCond', 
'PoolQC', 'Fence']

['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1', 'KitchenQual', 'FireplaceQu','LotShape','HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond','PoolQC', 'Fence', 'Utilities', 'CentralAir', 'Functional', 'BsmtFinType2', 'LandContour' ]

In [None]:
X_num = X.select_dtypes(include="number")

In [None]:
X_num.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold'],
      dtype='object')

## Preprocessing

In [None]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)


# Ordinal Encoding - creation of categorical data
ExterQual_cats =    ["Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats =    ["Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats =  ["NA","Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats =     ['IR3','IR2','IR1','Reg']
BsmtFinType2_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
HeatingQC_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats =       ["NA","Fa", "TA", "Gd", "Ex"]
Fence_cats =        ["NA",'MnWw','GdWo','MnPrv','GdPrv']



ordinal_cats1 = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, BsmtExposure_cats, 
                 BsmtFinType1_cats, KitchenQual_cats, 
                 FireplaceQu_cats, LotShape_cats, BsmtFinType2_cats, 
                 HeatingQC_cats, 
                 GarageFinish_cats, GarageQual_cats, GarageCond_cats, 
                 PoolQC_cats, Fence_cats] 


### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
ordinal_cols_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape','BsmtFinType2',
       'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond', 
       'PoolQC', 'Fence']


ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_cols_names)))





categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats1), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

## 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)


## 1. decision tree model: GridsearchCV

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# full pipeline: preprocessor + model

full_pipeline = make_pipeline(full_preprocessing, StandardScaler(with_mean=False), DecisionTreeClassifier(random_state=123))

# define parameter grid
param_grid = {
    #"columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
    #"standardscaler__with_mean":[True, False],
   # "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
    #"decisiontreeclassifier__min_samples_split": range(3, 40, 5),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

# define GridSearchCV
search = RandomizedSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      scoring="accuracy",
                      verbose=1,
                      n_jobs=-2, 
                      n_iter=100)

# fit
search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores = {"dtree" : search.best_score_}

scores



Fitting 5 folds for each of 60 candidates, totalling 300 fits


{'dtree': 0.9332086130369392}

#### Decision Tree Randomised

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# full pipeline: preprocessor + model
full_pipeline = make_pipeline(full_preprocessing,
                              StandardScaler(with_mean=False), #with_mean=False suggested by Liane
                              DecisionTreeClassifier(random_state=123))

# define parameter grid
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median", "mean"],
   # "standardscaler__with_mean":[True, False],
    #"standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 14, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12, 2),
    "decisiontreeclassifier__min_samples_split": range(3, 40, 5),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

# define GridSearchCV
search_d_r = RandomizedSearchCV(full_pipeline,
                      param_grid,
                      n_iter = 100,
                      cv=5,
                      scoring="accuracy",
                      verbose=1)

# fit
search_d_r.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores = {"dtree" : search.best_score_}

scores

Fitting 5 folds for each of 100 candidates, totalling 500 fits


{'dtree': 0.9332086130369392}

#### CSV FILE

In [None]:
y_test_pred_comp = search.predict(X_comp)

In [None]:
y_test_pred_comp

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
df_comp = pd.DataFrame(house_id)
df_comp["Expensive"] = y_test_pred_comp
df_comp

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,1
1457,2918,0


In [None]:
df_comp.to_csv("df_comp_sai_dec.csv", index=False)

In [None]:
from google.colab import files
files.download("df_comp_sai_dec.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Random Forest Model: Randomised

In [None]:
from sklearn.ensemble import RandomForestClassifier
scaler = StandardScaler()


rfc_full_pipeline = make_pipeline(preprocessor, 
                          StandardScaler(with_mean=False),
                          RandomForestClassifier()
                             )
param_grid = {  
    "randomforestclassifier__n_estimators": [10, 20],
    "randomforestclassifier__max_depth": range(2, 14),
    "randomforestclassifier__min_samples_leaf": range(3, 12),
    "randomforestclassifier__criterion":["gini", "entropy"]

}

rfc_search = RandomizedSearchCV(rfc_full_pipeline,
                      param_grid,
                      cv=5,
                      n_jobs=-2, 
                      n_iter=50,
                      scoring='accuracy',
                      verbose=1)

rfc_search.fit(X_train, y_train)
scores["rfc"] = rfc_search.best_score_
rfc_search.best_score_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0.9537617842338871

In [None]:
y_test_pred_comp_1 = rfc_search.predict(X_comp)

#### CSV File

In [None]:
df_comp_r = pd.DataFrame(house_id)
df_comp_r["Expensive"] = y_test_pred_comp_1
df_comp_r

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [None]:
df_comp_r.to_csv("df_comp_sai_r.csv", index=False)

In [None]:
from google.colab import files
files.download("df_comp_sai_r.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>