In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [51]:
url = "https://drive.google.com/file/d/1t3Rxpb5Hr0baI1KZWSvrAujv68A_CdnK/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

In [52]:
# Dropping ID
data1 = data.copy()
id = data1.pop("Id")

# X and y creation
X = data1
y = X.pop("SalePrice")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
from sklearn.feature_selection import VarianceThreshold
import warnings
warnings.filterwarnings('ignore')

In [61]:
X_train.var()

MSSubClass       1.808959e+03
LotFrontage      6.198616e+02
LotArea          1.157640e+08
OverallQual      1.870382e+00
OverallCond      1.245594e+00
YearBuilt        9.409860e+02
YearRemodAdd     4.298969e+02
MasVnrArea       2.994016e+04
BsmtFinSF1       2.107462e+05
BsmtFinSF2       2.503278e+04
BsmtUnfSF        1.992413e+05
TotalBsmtSF      1.941956e+05
1stFlrSF         1.495178e+05
2ndFlrSF         1.932226e+05
LowQualFinSF     2.293175e+03
GrLivArea        2.750296e+05
BsmtFullBath     2.703682e-01
BsmtHalfBath     5.583263e-02
FullBath         2.991132e-01
HalfBath         2.499530e-01
BedroomAbvGr     6.477914e-01
KitchenAbvGr     5.082697e-02
TotRmsAbvGrd     2.623446e+00
Fireplaces       4.161913e-01
GarageYrBlt      6.115204e+02
GarageCars       5.478388e-01
GarageArea       4.456126e+04
WoodDeckSF       1.681844e+04
OpenPorchSF      4.821022e+03
EnclosedPorch    3.854327e+03
3SsnPorch        9.934892e+02
ScreenPorch      3.122703e+03
PoolArea         1.734598e+03
MiscVal   

In [59]:
constant_feature_selector = VarianceThreshold(threshold = 0.25 )

In [46]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

numeric_pipe = make_pipeline(SimpleImputer())



### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
ordinal_cols_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                      'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape', 
                      'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 
                      'Fence','Utilities', 'CentralAir', 'Functional', 'BsmtFinType2', 'LandContour']

ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)
ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_cols_names)))

X_cat_ordinal = X_cat.columns[ordinal_cols]
X_cat_ohe = X_cat.columns[ohe_cols]

# Ordinal Encoding - creation of categorical data
ExterQual_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats =  ["Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats =     ["NA",'IR3','IR2','IR1','Reg']
HeatingQC_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats =       ["NA","Fa", "TA", "Gd", "Ex"]
Fence_cats =        ["NA",'MnWw','GdWo','MnPrv','GdPrv']
Utilities_cats =    ["NA","ELO","NoSeWa","NoSewr","AllPub"]
CentralAir_cats =   ["NA","N","Y"]
Functional_cats =   ["NA", "Sal", "Sev","Maj2","Maj1", "Mod", "Min2", "Min1", "Typ"]
BsmtFinType2_cats = ["NA", "Unf","LwQ","Rec","BLQ","ALQ", "GLQ"]
LandContour_cats =  ["NA","Low", "HLS", "Bnk", "Lvl"]

ordinal_cats1 = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
                 BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, 
                 FireplaceQu_cats, LotShape_cats, 
                 HeatingQC_cats, GarageFinish_cats, GarageQual_cats, GarageCond_cats, 
                 PoolQC_cats, Fence_cats, Utilities_cats, CentralAir_cats, 
                 Functional_cats, BsmtFinType2_cats, LandContour_cats] 


        ### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats1), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

## 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder)

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

In [58]:
len(X_num.columns)

36

# Variance Threshold

NOTE: Where to add variance threshold thing in the pipeline

final_pipe = make_pipeline(
    cat_num_preprocessor,
    StandardScaler(),
    VarianceThreshold(),
    SelectKBest(f_regression),
    Ridge()
)

In [None]:
X_train

In [49]:
constant_feature_selector.fit(X_num)

VarianceThreshold(threshold=1.0)

In [33]:
constant_feature_selector.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False, False, False, False,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

#### Note: 

StandardScaler removes the mean and scales each feature/variable to unit variance. This operation is performed feature-wise in an independent way. StandardScaler can be influenced by outliers (if they exist in the dataset) since it involves the estimation of the empirical mean and standard deviation of each feature

# Decision Tree Regressor

In [62]:
from sklearn.feature_selection import VarianceThreshold

In [72]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression

In [118]:
full_pipeline = make_pipeline(full_preprocessing, 
                              VarianceThreshold(threshold = 0.00124444),
                             # StandardScaler(),
                              SelectKBest(score_func=f_regression, k=20),  
                              DecisionTreeRegressor(random_state = 123))

param_grid = {
   "decisiontreeregressor__max_depth": range(2, 10),
  # "decisiontreeregressor_min_samples_leaf": range(2,10),
   "decisiontreeregressor__criterion":["squared_error", "absolute_error", "poisson"]
}

search = RandomizedSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="neg_root_mean_squared_error", #and all other metrics
                      random_state=0,
                      n_iter=100)

# fit
search.fit(X_train, y_train)


scores = {"dtree" :search.best_score_}
best_params = {"dtree_bestparams": search.best_params_}

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [116]:
predicted_price = search.predict(X_test)
dt_mae = mean_absolute_error(y_true = y_test, y_pred = predicted_price)
dt_rmse = mean_squared_error(y_true = y_test, y_pred = predicted_price, squared=False)
dt_mape = mean_absolute_percentage_error(y_true = y_test, y_pred = predicted_price)
dt_r2 = r2_score(y_true = y_test, y_pred = predicted_price)

In [117]:
pd.DataFrame({"MAE": [dt_mae],
              "RMSE": [ dt_rmse],
              "MAPE": [ dt_mape],
              "R2": [dt_r2]},
             index=["decision_tree"])

Unnamed: 0,MAE,RMSE,MAPE,R2
decision_tree,26698.684932,42814.888636,0.158608,0.761012


# KNN Regressor