# Model Training

Importing the Required Packages

In [79]:
# Basic Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute  import SimpleImputer
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.simplefilter(action= 'ignore')


Importing Train Data

In [80]:
df = pd.read_csv('data/train.csv')

Reading Top 5 Rows of Dataset

In [81]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Checking the Dataset Shape

In [82]:
df.shape

(1460, 81)

Checking the Different Variables in every Features

In [83]:
for features in df.select_dtypes(include="object").columns:
    print(features, len(df[features].unique()))

MSZoning 5
Street 2
Alley 3
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 5
BsmtCond 5
BsmtExposure 5
BsmtFinType1 7
BsmtFinType2 7
Heating 6
HeatingQC 5
CentralAir 2
Electrical 6
KitchenQual 4
Functional 7
FireplaceQu 6
GarageType 7
GarageFinish 4
GarageQual 6
GarageCond 6
PavedDrive 3
PoolQC 4
Fence 5
MiscFeature 5
SaleType 9
SaleCondition 6


In [84]:
for features in df.select_dtypes(include="object").columns:
    print("different Categories in",features,"variables", df[features].unique())

different Categories in MSZoning variables ['RL' 'RM' 'C (all)' 'FV' 'RH']
different Categories in Street variables ['Pave' 'Grvl']
different Categories in Alley variables [nan 'Grvl' 'Pave']
different Categories in LotShape variables ['Reg' 'IR1' 'IR2' 'IR3']
different Categories in LandContour variables ['Lvl' 'Bnk' 'Low' 'HLS']
different Categories in Utilities variables ['AllPub' 'NoSeWa']
different Categories in LotConfig variables ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
different Categories in LandSlope variables ['Gtl' 'Mod' 'Sev']
different Categories in Neighborhood variables ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
different Categories in Condition1 variables ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
different Categories in Condition2 variables ['

Checking the missing values in every clolumn

In [85]:
a = df.columns
b = df.isna().sum()
print(list(zip(a, b)))
for na in list(zip(a,b)):
    if na[1] > 500:
        print(na)

[('Id', 0), ('MSSubClass', 0), ('MSZoning', 0), ('LotFrontage', 259), ('LotArea', 0), ('Street', 0), ('Alley', 1369), ('LotShape', 0), ('LandContour', 0), ('Utilities', 0), ('LotConfig', 0), ('LandSlope', 0), ('Neighborhood', 0), ('Condition1', 0), ('Condition2', 0), ('BldgType', 0), ('HouseStyle', 0), ('OverallQual', 0), ('OverallCond', 0), ('YearBuilt', 0), ('YearRemodAdd', 0), ('RoofStyle', 0), ('RoofMatl', 0), ('Exterior1st', 0), ('Exterior2nd', 0), ('MasVnrType', 872), ('MasVnrArea', 8), ('ExterQual', 0), ('ExterCond', 0), ('Foundation', 0), ('BsmtQual', 37), ('BsmtCond', 37), ('BsmtExposure', 38), ('BsmtFinType1', 37), ('BsmtFinSF1', 0), ('BsmtFinType2', 38), ('BsmtFinSF2', 0), ('BsmtUnfSF', 0), ('TotalBsmtSF', 0), ('Heating', 0), ('HeatingQC', 0), ('CentralAir', 0), ('Electrical', 1), ('1stFlrSF', 0), ('2ndFlrSF', 0), ('LowQualFinSF', 0), ('GrLivArea', 0), ('BsmtFullBath', 0), ('BsmtHalfBath', 0), ('FullBath', 0), ('HalfBath', 0), ('BedroomAbvGr', 0), ('KitchenAbvGr', 0), ('Kitc

Handling the Missing Values in Dataset

In [86]:
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].mean())
df['BsmtQual'] = df['BsmtQual'].fillna(df['BsmtQual'].mode()[0])
df['BsmtCond'] = df['BsmtCond'].fillna(df['BsmtCond'].mode()[0])
df['BsmtExposure'] = df['BsmtExposure'].fillna(df['BsmtExposure'].mode()[0])
df['BsmtFinType1'] = df['BsmtFinType1'].fillna(df['BsmtFinType1'].mode()[0])
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(df['BsmtFinType2'].mode()[0])
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])
df['GarageType'] = df['GarageType'].fillna(df['GarageType'].mode()[0])
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean())
df['GarageFinish'] = df['GarageFinish'].fillna(df['GarageFinish'].mode()[0])
df['GarageQual'] = df['GarageQual'].fillna(df['GarageQual'].mode()[0])
df['GarageCond'] = df['GarageCond'].fillna(df['GarageCond'].mode()[0])


Checking for the missing values

In [87]:
a = df.columns
b = df.isna().sum()
print(list(zip(a, b)))
for na in list(zip(a,b)):
    if na[1] > 500:
        print(na)


[('Id', 0), ('MSSubClass', 0), ('MSZoning', 0), ('LotFrontage', 0), ('LotArea', 0), ('Street', 0), ('Alley', 1369), ('LotShape', 0), ('LandContour', 0), ('Utilities', 0), ('LotConfig', 0), ('LandSlope', 0), ('Neighborhood', 0), ('Condition1', 0), ('Condition2', 0), ('BldgType', 0), ('HouseStyle', 0), ('OverallQual', 0), ('OverallCond', 0), ('YearBuilt', 0), ('YearRemodAdd', 0), ('RoofStyle', 0), ('RoofMatl', 0), ('Exterior1st', 0), ('Exterior2nd', 0), ('MasVnrType', 872), ('MasVnrArea', 0), ('ExterQual', 0), ('ExterCond', 0), ('Foundation', 0), ('BsmtQual', 0), ('BsmtCond', 0), ('BsmtExposure', 0), ('BsmtFinType1', 0), ('BsmtFinSF1', 0), ('BsmtFinType2', 0), ('BsmtFinSF2', 0), ('BsmtUnfSF', 0), ('TotalBsmtSF', 0), ('Heating', 0), ('HeatingQC', 0), ('CentralAir', 0), ('Electrical', 0), ('1stFlrSF', 0), ('2ndFlrSF', 0), ('LowQualFinSF', 0), ('GrLivArea', 0), ('BsmtFullBath', 0), ('BsmtHalfBath', 0), ('FullBath', 0), ('HalfBath', 0), ('BedroomAbvGr', 0), ('KitchenAbvGr', 0), ('KitchenQual

Imporintg the Test Data

In [88]:
test_data = pd.read_csv('data/test.csv')

Reading top Rows

In [89]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


Shape of DataSet

In [90]:
test_data.shape

(1459, 80)

Checking different variables in every features

In [91]:
for features in test_data.select_dtypes(include="object").columns:
    print(features, len(test_data[features].unique()))

MSZoning 6
Street 2
Alley 3
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 5
BldgType 5
HouseStyle 7
RoofStyle 6
RoofMatl 4
Exterior1st 14
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 5
BsmtCond 5
BsmtExposure 5
BsmtFinType1 7
BsmtFinType2 7
Heating 4
HeatingQC 5
CentralAir 2
Electrical 4
KitchenQual 5
Functional 8
FireplaceQu 6
GarageType 7
GarageFinish 4
GarageQual 5
GarageCond 6
PavedDrive 3
PoolQC 3
Fence 5
MiscFeature 4
SaleType 10
SaleCondition 6


In [92]:
for features in test_data.select_dtypes(include="object").columns:
    print("different Categories in",features,"variables", test_data[features].unique())

different Categories in MSZoning variables ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]
different Categories in Street variables ['Pave' 'Grvl']
different Categories in Alley variables [nan 'Pave' 'Grvl']
different Categories in LotShape variables ['Reg' 'IR1' 'IR2' 'IR3']
different Categories in LandContour variables ['Lvl' 'HLS' 'Bnk' 'Low']
different Categories in Utilities variables ['AllPub' nan]
different Categories in LotConfig variables ['Inside' 'Corner' 'FR2' 'CulDSac' 'FR3']
different Categories in LandSlope variables ['Gtl' 'Mod' 'Sev']
different Categories in Neighborhood variables ['NAmes' 'Gilbert' 'StoneBr' 'BrDale' 'NPkVill' 'NridgHt' 'Blmngtn'
 'NoRidge' 'Somerst' 'SawyerW' 'Sawyer' 'NWAmes' 'OldTown' 'BrkSide'
 'ClearCr' 'SWISU' 'Edwards' 'CollgCr' 'Crawfor' 'Blueste' 'IDOTRR'
 'Mitchel' 'Timber' 'MeadowV' 'Veenker']
different Categories in Condition1 variables ['Feedr' 'Norm' 'PosN' 'RRNe' 'Artery' 'RRNn' 'PosA' 'RRAn' 'RRAe']
different Categories in Condition2 variables ['N

Checking Missing Vlaues in every column

In [93]:
a = test_data.columns
b = test_data.isna().sum()
print(list(zip(a,b)))
for na in list(zip(a, b)):
    if na[1] > 500:
        print(na)

[('Id', 0), ('MSSubClass', 0), ('MSZoning', 4), ('LotFrontage', 227), ('LotArea', 0), ('Street', 0), ('Alley', 1352), ('LotShape', 0), ('LandContour', 0), ('Utilities', 2), ('LotConfig', 0), ('LandSlope', 0), ('Neighborhood', 0), ('Condition1', 0), ('Condition2', 0), ('BldgType', 0), ('HouseStyle', 0), ('OverallQual', 0), ('OverallCond', 0), ('YearBuilt', 0), ('YearRemodAdd', 0), ('RoofStyle', 0), ('RoofMatl', 0), ('Exterior1st', 1), ('Exterior2nd', 1), ('MasVnrType', 894), ('MasVnrArea', 15), ('ExterQual', 0), ('ExterCond', 0), ('Foundation', 0), ('BsmtQual', 44), ('BsmtCond', 45), ('BsmtExposure', 44), ('BsmtFinType1', 42), ('BsmtFinSF1', 1), ('BsmtFinType2', 42), ('BsmtFinSF2', 1), ('BsmtUnfSF', 1), ('TotalBsmtSF', 1), ('Heating', 0), ('HeatingQC', 0), ('CentralAir', 0), ('Electrical', 0), ('1stFlrSF', 0), ('2ndFlrSF', 0), ('LowQualFinSF', 0), ('GrLivArea', 0), ('BsmtFullBath', 2), ('BsmtHalfBath', 2), ('FullBath', 0), ('HalfBath', 0), ('BedroomAbvGr', 0), ('KitchenAbvGr', 0), ('Kit

Handling the Missing values in Dataset

In [94]:
test_data['MSZoning'] = test_data['MSZoning'].fillna(value=test_data['MSZoning'].mode()[0])
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(test_data['LotFrontage'].mean())
test_data['Utilities'] = test_data['Utilities'].fillna(test_data['Utilities'].mode()[0])
test_data['Exterior1st'] = test_data['Exterior1st'].fillna(test_data['Exterior1st'].mode()[0])
test_data['Exterior2nd'] = test_data['Exterior2nd'].fillna(test_data['Exterior2nd'].mode()[0])
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(test_data['MasVnrArea'].mean())
test_data['BsmtQual'] = test_data['BsmtQual'].fillna(test_data['BsmtQual'].mode()[0])
test_data['BsmtCond'] = test_data['BsmtCond'].fillna(test_data['BsmtCond'].mode()[0])
test_data['BsmtExposure'] = test_data['BsmtExposure'].fillna(test_data['BsmtExposure'].mode()[0])
test_data['BsmtFinType1'] = test_data['BsmtFinType1'].fillna(test_data['BsmtFinType1'].mode()[0])
test_data['BsmtFinSF1'] = test_data['BsmtFinSF1'].fillna(test_data['BsmtFinSF1'].mean())
test_data['BsmtFinType2'] = test_data['BsmtFinType2'].fillna(test_data['BsmtFinType2'].mode()[0])
test_data['BsmtFinSF2'] = test_data['BsmtFinSF2'].fillna(test_data['BsmtFinSF2'].mode()[0])
test_data['BsmtUnfSF'] = test_data['BsmtUnfSF'].fillna(test_data['BsmtUnfSF'].mean())
test_data['TotalBsmtSF'] = test_data['TotalBsmtSF'].fillna(test_data['TotalBsmtSF'].mean())
test_data['BsmtFullBath'] = test_data['BsmtFullBath'].fillna(test_data['BsmtFullBath'].mean())
test_data['BsmtHalfBath'] = test_data['BsmtHalfBath'].fillna(test_data['BsmtHalfBath'].mean())
test_data['KitchenQual'] = test_data['KitchenQual'].fillna(test_data['KitchenQual'].mode()[0])
test_data['Functional'] = test_data['Functional'].fillna(test_data['Functional'].mode()[0])
test_data['GarageType'] = test_data['GarageType'].fillna(test_data['GarageType'].mode()[0])
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(test_data['GarageYrBlt'].mean())
test_data['GarageFinish'] = test_data['GarageFinish'].fillna(test_data['GarageFinish'].mode()[0])
test_data['GarageCars'] = test_data['GarageCars'].fillna(test_data['GarageCars'].mean())
test_data['GarageArea'] = test_data['GarageArea'].fillna(test_data['GarageArea'].mean())
test_data['GarageQual'] = test_data['GarageQual'].fillna(test_data['GarageQual'].mode()[0])
test_data['GarageCond'] = test_data['GarageCond'].fillna(test_data['GarageCond'].mode()[0])
test_data['SaleType'] = test_data['SaleType'].fillna(test_data['SaleType'].mode()[0])

Checking the Missing values

In [95]:
a = test_data.columns
b = test_data.isna().sum()
print(list(zip(a,b)))
for na in list(zip(a, b)):
    if na[1] > 500:
        print(na)

[('Id', 0), ('MSSubClass', 0), ('MSZoning', 0), ('LotFrontage', 0), ('LotArea', 0), ('Street', 0), ('Alley', 1352), ('LotShape', 0), ('LandContour', 0), ('Utilities', 0), ('LotConfig', 0), ('LandSlope', 0), ('Neighborhood', 0), ('Condition1', 0), ('Condition2', 0), ('BldgType', 0), ('HouseStyle', 0), ('OverallQual', 0), ('OverallCond', 0), ('YearBuilt', 0), ('YearRemodAdd', 0), ('RoofStyle', 0), ('RoofMatl', 0), ('Exterior1st', 0), ('Exterior2nd', 0), ('MasVnrType', 894), ('MasVnrArea', 0), ('ExterQual', 0), ('ExterCond', 0), ('Foundation', 0), ('BsmtQual', 0), ('BsmtCond', 0), ('BsmtExposure', 0), ('BsmtFinType1', 0), ('BsmtFinSF1', 0), ('BsmtFinType2', 0), ('BsmtFinSF2', 0), ('BsmtUnfSF', 0), ('TotalBsmtSF', 0), ('Heating', 0), ('HeatingQC', 0), ('CentralAir', 0), ('Electrical', 0), ('1stFlrSF', 0), ('2ndFlrSF', 0), ('LowQualFinSF', 0), ('GrLivArea', 0), ('BsmtFullBath', 0), ('BsmtHalfBath', 0), ('FullBath', 0), ('HalfBath', 0), ('BedroomAbvGr', 0), ('KitchenAbvGr', 0), ('KitchenQual

Creating the X and droping the Features with Missing values Greater than 500

In [96]:
X = df.drop(['Id', 'SalePrice', 'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

shape of X after droping the Features

In [97]:
X.shape

(1460, 73)

Creating the test set and droping the features with missing values greater than 500

In [98]:
test_X = test_data.drop(['Id', 'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

checking the shape of the Testset after droping the features

In [99]:
test_X.shape

(1459, 73)

Concateneting the Both Datasets for 

In [100]:
concat_set = pd.concat([X, test_X], axis=0)

In [101]:
concat_set.shape

(2919, 73)

In [102]:
ohc_set = pd.get_dummies(concat_set)

In [103]:
ohc_set.shape

(2919, 266)

In [104]:
dp_set = ohc_set.T.drop_duplicates().T

In [105]:
dp_set.shape

(2919, 266)

In [106]:
train_set = dp_set.iloc[:1460,:]
test_set = dp_set.iloc[1460:,:]

In [107]:
X = train_set
y = df['SalePrice']
test_X = test_set

In [108]:
X.shape

(1460, 266)

In [109]:
test_X.shape

(1459, 266)

Transforming the train data

In [110]:
"""cat = X.select_dtypes(include='object').columns
num = X.select_dtypes(exclude='object').columns

num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_incoder', OneHotEncoder()),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

preprocessor_1 = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num),
        ('cat_piprline', cat_pipeline, cat)
    ]
)"""

"cat = X.select_dtypes(include='object').columns\nnum = X.select_dtypes(exclude='object').columns\n\nnum_pipeline = Pipeline(\n    steps=[\n        ('imputer', SimpleImputer(strategy='mean')),\n        ('scaler', StandardScaler())\n    ]\n)\n\ncat_pipeline = Pipeline(\n    steps=[\n        ('imputer', SimpleImputer(strategy='most_frequent')),\n        ('one_hot_incoder', OneHotEncoder()),\n        ('scaler', StandardScaler(with_mean=False))\n    ]\n)\n\npreprocessor_1 = ColumnTransformer(\n    [\n        ('num_pipeline', num_pipeline, num),\n        ('cat_piprline', cat_pipeline, cat)\n    ]\n)"

Transforming the test data

In [111]:
"""test_cat = test_X.select_dtypes(include='object').columns
test_num = test_X.select_dtypes(exclude='object').columns


num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps= [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoder', OneHotEncoder()),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

preprocessor_2 = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, test_num),
        ('cat_pipeline', cat_pipeline, test_cat),
        
    ]
)"""


"test_cat = test_X.select_dtypes(include='object').columns\ntest_num = test_X.select_dtypes(exclude='object').columns\n\n\nnum_pipeline = Pipeline(\n    steps=[\n        ('imputer', SimpleImputer(strategy='mean')),\n        ('scaler', StandardScaler())\n    ]\n)\n\ncat_pipeline = Pipeline(\n    steps= [\n        ('imputer', SimpleImputer(strategy='most_frequent')),\n        ('one_hot_encoder', OneHotEncoder()),\n        ('scaler', StandardScaler(with_mean=False))\n    ]\n)\n\npreprocessor_2 = ColumnTransformer(\n    [\n        ('num_pipeline', num_pipeline, test_num),\n        ('cat_pipeline', cat_pipeline, test_cat),\n        \n    ]\n)"

Concaneting data

Splitng the Data into Train and Test 

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((1022, 266), (438, 266))

Creating the Evaluation Function to give all metrics after model training

In [113]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    r2 = r2_score(true, predicted)
    return mae, r2

Finding the best Model for dataset

In [121]:
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    #"XGBRegressor": XGBRegressor(),
    "CatBoostRegressor": CatBoostRegressor(),
}

model_list = []
r2_list = []

# Training Modle
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

# Giving data for prediction
    pred = model.predict(X_test)

# Evaluate the train data set
    model_mae, model_r2 = evaluate_model(y_test, pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print(f"mean_absolute_error: {model_mae}")
    print(f"r2_score: {model_r2}")
    r2_list.append(model_r2)


LinearRegression
mean_absolute_error: 18044.995748499776
r2_score: 0.8889528343145544
DecisionTreeRegressor
mean_absolute_error: 27035.08904109589
r2_score: 0.6985010880054472
KNeighborsRegressor
mean_absolute_error: 28533.59863013699
r2_score: 0.6869902055160204
RandomForestRegressor
mean_absolute_error: 16910.01319634703
r2_score: 0.8914712589224467
AdaBoostRegressor
mean_absolute_error: 23822.86636567128
r2_score: 0.8403854517257777
Lasso
mean_absolute_error: 17507.690601302053
r2_score: 0.8988018505662816
Ridge
mean_absolute_error: 19004.443781815487
r2_score: 0.8848742524588291
Learning rate set to 0.041084
0:	learn: 75734.2149081	total: 17.4ms	remaining: 17.4s
1:	learn: 73765.8500610	total: 37.7ms	remaining: 18.8s
2:	learn: 72018.8837532	total: 55.8ms	remaining: 18.5s
3:	learn: 70328.8239700	total: 72.7ms	remaining: 18.1s
4:	learn: 68651.4107833	total: 88.6ms	remaining: 17.6s
5:	learn: 66901.1913028	total: 104ms	remaining: 17.2s
6:	learn: 65364.6695815	total: 118ms	remaining: 16.

In [115]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=["models", "r2_score"])

Unnamed: 0,models,r2_score
0,LinearRegression,0.888953
1,DecisionTreeRegressor,0.774631
2,KNeighborsRegressor,0.68699
3,RandomForestRegressor,0.889878
4,AdaBoostRegressor,0.84066
5,Lasso,0.898802
6,Ridge,0.884874
7,CatBoostRegressor,0.920575


In [116]:
pd.DataFrame({'Actual Value':y_test,'Predicted Value':pred,'Difference':y_test-pred})

Unnamed: 0,Actual Value,Predicted Value,Difference
892,154500,144942.922381,9557.077619
1105,325000,351089.396749,-26089.396749
413,115000,111996.391236,3003.608764
522,159000,151259.806397,7740.193603
1036,315500,330443.717546,-14943.717546
...,...,...,...
331,139000,134661.130203,4338.869797
323,126175,124447.570932,1727.429068
650,205950,207775.446068,-1825.446068
439,110000,130252.020028,-20252.020028


In [117]:
cbr = CatBoostRegressor()
cbr.fit(X_train, y_train)
pred = cbr.predict(X_test)
r2 = r2_score(y_test, pred)
mea = mean_absolute_error(y_test, pred)
print("r2_score: ", r2)
print("mean_absolute_error: ", mea)

"""
r2_score:  0.8981830914967855
mean_absolute_error:  16495.53817060199
"""
"""{'depth': 6, 'iterations': 100, 'learning_rate': 0.1}"""

Learning rate set to 0.041084
0:	learn: 75734.2149081	total: 17.5ms	remaining: 17.4s
1:	learn: 73765.8500610	total: 36.2ms	remaining: 18.1s
2:	learn: 72018.8837532	total: 60.3ms	remaining: 20s
3:	learn: 70328.8239700	total: 81.6ms	remaining: 20.3s
4:	learn: 68651.4107833	total: 103ms	remaining: 20.5s
5:	learn: 66901.1913028	total: 124ms	remaining: 20.5s
6:	learn: 65364.6695815	total: 140ms	remaining: 19.8s
7:	learn: 63751.0801589	total: 156ms	remaining: 19.3s
8:	learn: 62327.9052622	total: 171ms	remaining: 18.8s
9:	learn: 60874.0685331	total: 185ms	remaining: 18.3s
10:	learn: 59433.0744096	total: 201ms	remaining: 18s
11:	learn: 58057.9235083	total: 215ms	remaining: 17.7s
12:	learn: 56925.7475980	total: 230ms	remaining: 17.5s
13:	learn: 55812.6441432	total: 245ms	remaining: 17.3s
14:	learn: 54647.8035404	total: 256ms	remaining: 16.8s
15:	learn: 53703.4400201	total: 270ms	remaining: 16.6s
16:	learn: 52656.2076470	total: 285ms	remaining: 16.5s
17:	learn: 51655.5514009	total: 298ms	remaini

"{'depth': 6, 'iterations': 100, 'learning_rate': 0.1}"

In [118]:
paramater = {'depth':[3,1,2,4,5],
          'iterations':[50,100, 150, 200],
          'learning_rate':[0.03,0.001,0.01,0.1,], 
          'l2_leaf_reg':[3,1,5,10,50],
          'border_count':[32,5,10,20,50,],
          'loss_function': ['RME', 'RSME']}
"""
grid = GridSearchCV(estimator= cbr, param_grid= paramater, cv=2, scoring='accuracy')
grid.fit(X_train, y_train)
grid.best_estimator_
grid.best_params_"""

"\ngrid = GridSearchCV(estimator= cbr, param_grid= paramater, cv=2, scoring='accuracy')\ngrid.fit(X_train, y_train)\ngrid.best_estimator_\ngrid.best_params_"

In [119]:
pred = cbr.predict(test_X)
print(pred)

[125049.05522105 165444.55917865 186636.78400951 ... 156765.11625251
 119943.46553952 238128.85748449]


In [120]:
"""rs = pd.DataFrame(list(zip(test_data['Id'], pred)), columns=['Id', 'SalePrice'])
rs.to_csv('sample_submission3.csv', index=False)"""

"rs = pd.DataFrame(list(zip(test_data['Id'], pred)), columns=['Id', 'SalePrice'])\nrs.to_csv('sample_submission3.csv', index=False)"