In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [124]:
url = "https://drive.google.com/file/d/1c5210z-HSQ-Kv4EExbYIjjPq6lT0UYZW/view?usp=sharing" #housing prices iter 7
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
house = pd.read_csv(path)
house = house.sample(100).copy() # small sample to quickly test the pipeline

X = house.drop(columns=['Id','SalePrice','MoSold', 'YrSold'])
y = house['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=31421)

In [125]:
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [126]:
# building the pipeline
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

### 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
ordinal_cols_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                      'BsmtFinType1', 'KitchenQual', 'FireplaceQu', 'LotShape', 
                      'HeatingQC', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 
                      'Fence','Utilities', 'CentralAir', 'Functional', 'BsmtFinType2', 'LandContour']

ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)
ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_cols_names)))

X_cat_ordinal = X_cat.columns[ordinal_cols]
X_cat_ohe = X_cat.columns[ohe_cols]

# Ordinal Encoding - creation of categorical data
ExterQual_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats =     ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats =  ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats =     ["NA",'IR3','IR2','IR1','Reg']
HeatingQC_cats =    ["NA","Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats =   ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats =       ["NA","Fa", "TA", "Gd", "Ex"]
Fence_cats =        ["NA",'MnWw','GdWo','MnPrv','GdPrv']
Utilities_cats =    ["NA","ELO","NoSeWa","NoSewr","AllPub"]
CentralAir_cats =   ["NA","N","Y"]
Functional_cats =   ["NA", "Sal", "Sev","Maj2","Maj1", "Mod", "Min2", "Min1", "Typ"]
BsmtFinType2_cats = ["NA", "Unf","LwQ","Rec","BLQ","ALQ", "GLQ"]
LandContour_cats =  ["NA","Low", "HLS", "Bnk", "Lvl"]

ordinal_cats1 = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
                 BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, 
                 FireplaceQu_cats, LotShape_cats, 
                 HeatingQC_cats, GarageFinish_cats, GarageQual_cats, GarageCond_cats, 
                 PoolQC_cats, Fence_cats, Utilities_cats, CentralAir_cats, 
                 Functional_cats, BsmtFinType2_cats, LandContour_cats] 


ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

ohe_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_cols_names)))

categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats1), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ]
)

## 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

In [127]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression

In [128]:
param_grid = {
    "columntransformernum_pipesimpleimputerstrategy":["mean", "median", "constant"],
    "standardscalerwith_mean":[True, False],
    "standardscaler__with_std":[True, False],
    #'var_thresholdthreshold': [0, 0.01, 0.02]
    #"pcan_components": range(10, 310, 20),
    "selectkbestk": range(5, 85, 5),
    "gradientboostingregressorloss": ["squared_error", "absolute_error", "huber", "quantile"],
    "gradientboostingregressorcriterion": ["friedman_mse", "squared_error"],
    "gradientboostingregressor__n_estimators": range(50, 150, 20)
}

In [129]:

full_pipline = make_pipeline(full_preprocessing,
                              StandardScaler(with_mean=False),
                              VarianceThreshold(),
                              SelectKBest(f_regression),
                              GradientBoostingRegressor(random_state = 1))

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median", "constant"],
    "standardscaler__with_mean":[True, False],
    "gradientboostingregressor__loss": ["squared_error", "absolute_error", "huber", "quantile"],
    "gradientboostingregressor__criterion": ["friedman_mse", "squared_error"],
    "gradientboostingregressor__n_estimators": range(50, 150, 20),
    "selectkbest__k": range(15, 85, 5)
}

gb_search = RandomizedSearchCV(full_pipline,
                              param_grid,
                              cv=5,
                              n_iter = 200,
                              scoring='neg_root_mean_squared_error',
                              verbose=1)

gb_search.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='constant'))]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF',...
                                                                                                 'constant'],
                                        'gradientboostingregressor__criterion': ['friedman_mse',
                                                                                 'squared_

In [130]:
gb_search.best_score_

-34718.07438190753

In [131]:
gb_search.best_params_

{'standardscaler__with_mean': False,
 'selectkbest__k': 25,
 'gradientboostingregressor__n_estimators': 110,
 'gradientboostingregressor__loss': 'huber',
 'gradientboostingregressor__criterion': 'friedman_mse',
 'columntransformer__num_pipe__simpleimputer__strategy': 'constant'}

In [132]:
gb_search.estimator.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'standardscaler', 'variancethreshold', 'selectkbest', 'gradientboostingregressor', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__num_pipe', 'columntransformer__cat_pipe', 'columntransformer__num_pipe__memory', 'columntransformer__num_pipe__steps', 'columntransformer__num_pipe__verbose', 'columntransformer__num_pipe__simpleimputer', 'columntransformer__num_pipe__simpleimputer__add_indicator', 'columntransformer__num_pipe__simpleimputer__copy', 'columntransformer__num_pipe__simpleimputer__fill_value', 'columntransformer__num_pipe__simpleimputer__missing_values', 'columntransformer__num_pipe__simpleimputer__strategy', 'columntransformer__num_pipe__simpleimputer__verbose', 'columntransformer__cat_pipe__m

## CSV FILE FOR GRADIENT BOOSTING REGRESSOR

In [135]:
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_data = pd.read_csv(path)

In [136]:
my_test_X = competition_data.drop(columns=["Id"])

In [137]:
my_submission = pd.DataFrame(competition_data["Id"])
my_submission["SalePrice"] = gb_search.predict(my_test_X)

my_submission.to_csv('my_submission_1.csv', index=False)

# Extras needed on colab
from google.colab import files
files.download("my_submission_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# DECISION TREE REGRESSOR

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression

final_pipe_gb = make_pipeline(full_preprocessing,
                              StandardScaler(with_mean=False),
                              SelectKBest(score_func=f_regression),
                              VarianceThreshold(),
                               DecisionTreeRegressor(random_state = 123))

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
   "decisiontreeregressor__max_depth": range(2, 10),
    "decisiontreeregressor__criterion":["squared_error", "absolute_error", "poisson"],
  # "decisiontreeregressor__min_samples_leaf": range(50, 150, 20),
    "selectkbest__k": range(5, 85, 5)
}

dec_search = RandomizedSearchCV(final_pipe_gb,
                              param_grid,
                              cv=5,
                             n_iter = 100,
                              scoring='neg_root_mean_squared_error',
                              verbose=1)

dec_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
 

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='constant'))]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF',...
                                              DecisionTreeRegressor(random_state=123))]),
                   n_iter=100,
                   param_distributions={'columntransformer__num_pipe__simpleimputer__strategy': ['mean',
                                                                       

In [62]:
dec_search.best_score_

-60047.34870149944

In [63]:
dec_search.best_params_

{'selectkbest__k': 5,
 'decisiontreeregressor__max_depth': 4,
 'decisiontreeregressor__criterion': 'squared_error',
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

## CSV FILE FOR DECISION TREE

In [64]:
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_data = pd.read_csv(path)

In [None]:
my_test_X = competition_data.drop(columns=["Id"])

In [None]:
my_submission_dec = pd.DataFrame(competition_data["Id"])
my_submission_dec["SalePrice"] = dec_search.predict(my_test_X)

my_submission_dec.to_csv('my_submission_dec.csv', index=False)

# Extras needed on colab
from google.colab import files
files.download("my_submission_dec.csv")

# RANDOM FOREST

In [65]:
from sklearn.ensemble import RandomForestRegressor

In [68]:
rforest_full_pipe = make_pipeline(full_preprocessing, 
                                 StandardScaler(with_mean=False),
                                 SelectKBest(score_func=f_regression),
                                 VarianceThreshold(),
                                 RandomForestRegressor()
                             )


# create parameter grid
param_grid = {
    "randomforestregressor__n_estimators": [100, 200],
    "randomforestregressor__max_depth": range(2, 14),
    "randomforestregressor__min_samples_leaf": range(2, 10),
    "randomforestregressor__criterion":["squared_error", "absolute_error", "poisson"],
    "selectkbest__k": range(5, 85, 5)
}

# define cross validation
rforest_search = RandomizedSearchCV(rforest_full_pipe,
                                    param_grid,
                                    cv=5,
                                    n_iter = 100,
                                    scoring='neg_root_mean_squared_error',
                                    verbose=1
                                    )

# fit
rforest_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
 

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='constant'))]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF',...
                                              RandomForestRegressor())]),
                   n_iter=100,
                   param_distributions={'randomforestregressor__criterion': ['squared_error',
                                                                             'absolute_error',
   

In [69]:
gb_search.best_score_

-45737.20712689988

In [70]:
gb_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'gradientboostingregressor__criterion': 'friedman_mse',
 'gradientboostingregressor__loss': 'huber',
 'gradientboostingregressor__n_estimators': 130,
 'selectkbest__k': 45}

# XG BOOSTER

In [71]:
from xgboost import XGBRegressor

In [80]:
xgb_full_pipe = make_pipeline(full_preprocessing, 
                                 StandardScaler(with_mean=False),
                                 SelectKBest(score_func=f_regression),
                                 VarianceThreshold(),
                                 XGBRegressor()
                             )


# create parameter grid
param_grid = {
   "xgbregressor__n_estimators": [100, 200],
    "xgbregressor__max_depth": range(2, 14, 2),
    "xgbregressor__max_leafs": range(2, 12, 2),
    "xgbregressor__eval_metric":["rmse", "poisson-nloglik"],
    "selectkbest__k": range(5, 85, 5)
}

# define cross validation
xgb_search = RandomizedSearchCV(xgb_full_pipe,
                                    param_grid,
                                    cv=5,
                                    n_iter = 50,
                                    scoring='neg_root_mean_squared_error',
                                    verbose=1
                                    )

# fit
xgb_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms




  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom




RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num_pipe',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='constant'))]),
                                                                               Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF',...
                                             ('variancethreshold',
                                              VarianceThreshold()),
                                             ('xgbregressor', XGBRegressor())]),
                   n_iter=50,
                   param_distributions={'selectkbes

In [81]:
xgb_search.best_score_

-48083.72843497737

In [82]:
xgb_search.best_params_

{'xgbregressor__n_estimators': 100,
 'xgbregressor__max_leafs': 2,
 'xgbregressor__max_depth': 2,
 'xgbregressor__eval_metric': 'rmse',
 'selectkbest__k': 15}

In [83]:
url = "https://drive.google.com/file/d/1jnn7sVeWjrKyWe2DDkpbtGpM-vCmWsnW/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
competition_data = pd.read_csv(path)
my_test_X = competition_data.drop(columns=["Id"])

In [84]:
my_submission_dec = pd.DataFrame(competition_data["Id"])
my_submission_dec["SalePrice"] = dec_search.predict(my_test_X)

my_submission_dec.to_csv('my_submission_dec.csv', index=False)

# Extras needed on colab
from google.colab import files
files.download("my_submission_dec.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>