In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

In [27]:
url = "https://drive.google.com/file/d/1t3Rxpb5Hr0baI1KZWSvrAujv68A_CdnK/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

In [28]:
data.SalePrice

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [29]:
# Dropping ID
data1 = data.copy()
id = data1.pop("Id")

# X and y creation
X = data1
y = X.pop("SalePrice")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
X.shape

(1460, 79)

In [31]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)


# Ordinal Encoding - creation of categorical data
ExterQual_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats =  ["Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats =     ["NA",'IR3','IR2','IR1','Reg']
HeatingQC_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats =       ["NA","Fa", "TA", "Gd", "Ex"]
Fence_cats =        ["NA",'MnWw','GdWo','MnPrv','GdPrv']
Utilities_cats =    ["NA","ELO","NoSeWa","NoSewr","AllPub"]
CentralAir_cats =   ["NA","N","Y"]
Functional_cats =   ["NA", "Sal", "Sev","Maj2","Maj1", "Mod", "Min2", "Min1", "Typ"]
BsmtFinType2_cats = ["NA", "Unf","LwQ","Rec","BLQ","ALQ", "GLQ"]
LandContour_cats =  ["NA","Low", "HLS", "Bnk", "Lvl"]

ordinal_cats1 = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
                 BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, 
                 FireplaceQu_cats, LotShape_cats, 
                 HeatingQC_cats, GarageFinish_cats, GarageQual_cats, GarageCond_cats, 
                 PoolQC_cats, Fence_cats, Utilities_cats, CentralAir_cats, 
                 Functional_cats, BsmtFinType2_cats, LandContour_cats] 


### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
ordinal_cols_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                      'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape', 
                      'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 
                      'Fence','Utilities', 'CentralAir', 'Functional', 'BsmtFinType2', 'LandContour']

ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_cols_names)))



categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats1), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

## 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

# Decision Tree Regressor

In [36]:
full_pipeline = make_pipeline(full_preprocessing,
                              StandardScaler(with_mean=False),                         
                              DecisionTreeRegressor(random_state = 123)
                                 )


param_grid = {
   "decisiontreeregressor__max_depth": range(2, 10),
 #  "decisiontreeregressor_min_samples_leaf": range(2,10),
    "decisiontreeregressor__criterion":["squared_error", "absolute_error", "poisson"]
}

search = RandomizedSearchCV(full_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="neg_root_mean_squared_error", #and all other metrics
                      random_state=0,
                      n_iter=100
)

# fit
search.fit(X_train, y_train)


scores = {"dtree" :search.best_score_}
best_params = {"dtree_bestparams": search.best_params_}



Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [37]:
predicted_price = search.predict(X_test)

In [38]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [40]:
dt_mae = mean_absolute_error(y_true = y_test, y_pred = predicted_price)
dt_rmse = mean_squared_error(y_true = y_test, y_pred = predicted_price, squared=False)
dt_mape = mean_absolute_percentage_error(y_true = y_test, y_pred = predicted_price)
dt_r2 = r2_score(y_true = y_test, y_pred = predicted_price)

In [41]:
pd.DataFrame({"MAE": [dt_mae],
              "RMSE": [ dt_rmse],
              "MAPE": [ dt_mape],
              "R2": [dt_r2]},
             index=["decision_tree"])

Unnamed: 0,MAE,RMSE,MAPE,R2
decision_tree,26813.566796,37906.126204,0.167015,0.812671


# SGDRegressor

In [52]:
from sklearn.linear_model import SGDRegressor

sgd_pipeline = make_pipeline(preprocessor, 
                            StandardScaler(with_mean=False),
                            SGDRegressor())



#sgd_pipeline.fit(X_train, y_train)


param_grid = {
 #  "sgdregressor__max_depth": range(2, 10),
 # "sgdregressor_min_samples_leaf": range(2,10),
 # "sgdregressor__criterion":["squared_error", "absolute_error", "poisson"]
}

sgd_search = RandomizedSearchCV(sgd_pipeline,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="neg_root_mean_squared_error", #and all other metrics
                      random_state=0,
                      n_iter=100
)

# fit
sgd_search.fit(X_train, y_train)


scores = {"sgd_" :sgd_search.best_score_}
best_params = {"sgd_bestparams": sgd_search.best_params_}





Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [53]:
sgd_predictions = sgd_search.predict(X_test)

In [54]:
sgd_mae = mean_absolute_error(y_true = y_test, y_pred = sgd_predictions)
sgd_rmse = mean_squared_error(y_true = y_test, y_pred = sgd_predictions, squared=False)
sgd_mape = mean_absolute_percentage_error(y_true = y_test, y_pred = sgd_predictions)
sgd_r2 = r2_score(y_true = y_test, y_pred = sgd_predictions)

# LinearRegressor

In [56]:
from sklearn.linear_model import LinearRegression

lr_pipeline = make_pipeline(preprocessor, 
                            StandardScaler(with_mean=False),
                            LinearRegression())

lr_pipeline.fit(X_train, y_train)

lr_predictions = lr_pipeline.predict(X_test)

lr_mae = mean_absolute_error(y_true = y_test, y_pred = lr_predictions)
lr_rmse = mean_squared_error(y_true = y_test, y_pred = lr_predictions, squared=False)
lr_mape = mean_absolute_percentage_error(y_true = y_test, y_pred = lr_predictions)
lr_r2 = r2_score(y_true = y_test, y_pred = lr_predictions)

In [57]:
comparing_models_df = pd.DataFrame({"MAE": [dt_mae, sgd_mae, lr_mae],
                                    "RMSE": [dt_rmse, sgd_rmse, lr_rmse],
                                    "MAPE": [dt_mape, sgd_mape, lr_mape],
                                    "R2": [dt_r2, sgd_r2, lr_r2]},
                                    index=["decision_tree", "sgd", "linear_regression"])

comparing_models_df

Unnamed: 0,MAE,RMSE,MAPE,R2
decision_tree,26813.57,37906.13,0.1670153,0.812671
sgd,464065700000000.0,464199800000000.0,3153452000.0,-2.809287e+19
linear_regression,21088.21,65291.9,0.1282577,0.4442176
