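* Uses a scikit-learn `Pipeline` to bundle preprocessing and modeling.
* Trains three regressors (RandomForestRegressor, GradientBoostingRegressor, CatBoostRegressor) and averages their predictions.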

* **Importing** 

In [40]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)



/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


* **Read CSV files**

In [41]:
# Load the train and test tables, using the 'Id' column as the row index.
X_train = pd.read_csv(
    "/kaggle/input/home-data-for-ml-course/train.csv",
    index_col='Id',
)
X_test = pd.read_csv(
    "/kaggle/input/home-data-for-ml-course/test.csv",
    index_col='Id',
)

In [42]:
# Show the training frame as the cell output (the target column SalePrice is
# still part of the frame at this point).
X_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


 * **Shape of the dataset**

In [43]:
# (rows, columns) of the training frame, target column included.
X_train.shape 


(1460, 80)

In [44]:
# (rows, columns) of the test frame.
X_test.shape


(1459, 79)

* **Preprocessing**
Remove rows with missing target, separate target from predictors

In [45]:
# Keep only rows that have a target value, then split the target (y) off
# from the predictor columns.
X_train = X_train.dropna(axis=0, subset=["SalePrice"])
y = X_train["SalePrice"]
X_train = X_train.drop(columns=["SalePrice"])

**Break off validation set from training data**

In [66]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_train,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [67]:
# Shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train_full, X_valid_full, y_train, y_valid))

((1168, 79), (292, 79), (1168,), (292,))

**"Cardinality" means the number of unique values in a column**

**Select categorical columns with relatively low cardinality (convenient but arbitrary)**

In [68]:
# Object-typed columns with fewer than 10 distinct values in the training split.
categorical_cols = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[col].nunique() < 10
]

**Selecting numerical columns**

In [69]:
# Columns of the training split stored as 64-bit integers or floats.
numerical_cols = [
    col
    for col, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

**Keep only the selected columns**

In [70]:
# Restrict the train, validation and test frames to the chosen feature
# columns; each result is an independent copy.
my_cols = categorical_cols + numerical_cols
X_Train, X_valid, X_Test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test)
)

**Implement Pipeline**

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

**Preprocessing for numerical data**

**Use SimpleImputer to fill all missing values in numerical columns with a constant**

In [72]:
# Imputer for the numerical columns. strategy='constant' is used without an
# explicit fill_value, so the replacement value is SimpleImputer's library
# default (documented by scikit-learn as 0 for numerical data).
numerical_transformer = SimpleImputer(strategy='constant')

**Preprocessing for categorical data**

In [73]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' is set so categories not seen during fit
# do not raise at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

**Bundle preprocessing for numerical and categorical data**

In [74]:
# Route each column group through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

**Model : 1**

**CatBoostRegressor**

In [75]:
import catboost as cb

# CatBoost regressor with an RMSE loss, a fixed seed and verbose output off.
model1 = cb.CatBoostRegressor(
    loss_function='RMSE',
    random_state=42,
    verbose=False,
)

# Chain the preprocessing and the model so they are fitted and applied together.
pipeline1_steps = [
    ('preprocessor', preprocessor),
    ('model', model1),
]
my_pipeline1 = Pipeline(steps=pipeline1_steps)

# Fit preprocessing + model on the training split.
my_pipeline1.fit(X_Train, y_train)

# Predict the validation split (preprocessing is applied by the pipeline).
preds1 = my_pipeline1.predict(X_valid)


**Model : 2**

**GradientBoostingRegressor**

In [76]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 1000 boosting stages and a fixed seed.
model2 = GradientBoostingRegressor(
    n_estimators=1000,
    random_state=42,
)

# Chain the preprocessing and the model so they are fitted and applied together.
pipeline2_steps = [
    ('preprocessor', preprocessor),
    ('model', model2),
]
my_pipeline2 = Pipeline(steps=pipeline2_steps)

# Fit preprocessing + model on the training split.
my_pipeline2.fit(X_Train, y_train)

# Predict the validation split (preprocessing is applied by the pipeline).
preds2 = my_pipeline2.predict(X_valid)

**Model : 3**

 **RandomForestRegressor**

In [77]:
# Random forest with 5000 trees and a fixed seed.
model3 = RandomForestRegressor(
    n_estimators=5000,
    random_state=42,
)

# Chain the preprocessing and the model so they are fitted and applied together.
pipeline3_steps = [
    ('preprocessor', preprocessor),
    ('model', model3),
]
my_pipeline3 = Pipeline(steps=pipeline3_steps)

# Fit preprocessing + model on the training split.
my_pipeline3.fit(X_Train, y_train)

# Predict the validation split (preprocessing is applied by the pipeline).
preds3 = my_pipeline3.predict(X_valid)

**MAE of each model**

In [78]:
# Validation MAE for each model, printed in order: CatBoost, gradient
# boosting, random forest.
for model_preds in (preds1, preds2, preds3):
    score = mean_absolute_error(y_valid, model_preds)
    print('MAE:', score)

MAE: 15323.553678422251
MAE: 15897.237760657878
MAE: 17309.65928630137


**Average of their predictions**

In [79]:
# Simple ensemble: unweighted mean of the three models' validation predictions.
preds_sum = preds1 + preds2 + preds3
preds = preds_sum / 3

# Validation MAE of the averaged predictions.
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 15392.055543207098


**Predict Test set**

In [80]:
# Predict the test set with each already-fitted pipeline (nothing is refitted
# here; the pipelines only apply their fitted preprocessing and model).
# X_Test is the test frame restricted to my_cols, i.e. prepared the same way
# as the X_Train / X_valid frames the pipelines were fitted and validated on.
preds_test1 = my_pipeline1.predict(X_Test)
preds_test2 = my_pipeline2.predict(X_Test)
preds_test3 = my_pipeline3.predict(X_Test)

# Ensemble: unweighted mean of the three models' test predictions.
preds_test = (preds_test1 + preds_test2 + preds_test3) / 3



In [81]:
# Save the averaged (ensemble) test predictions to file.
# The submission uses preds_test, the mean of the three models, rather than
# a single model's output, so the ensemble computed above is what gets written.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('sn.csv', index=False)