In [605]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [606]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [607]:
train.shape

(1460, 81)

In [608]:
test.shape

(1459, 80)

In [609]:
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

(2919, 81)

In [610]:
df = df.drop('Id', axis=1)

In [611]:
df_cat = df.select_dtypes('object')
df_cat = df.select_dtypes('object')
missing = (100 * (df_cat.isna().sum())/len(df_cat)).sort_values(ascending=False)
large_missing_vals = [col for col in missing.index if missing[col] > 30]
large_missing_vals

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']

In [612]:
df = df.drop(large_missing_vals, axis=1)

In [613]:
df.shape

(2919, 74)

In [614]:
important_cat = ["MSZoning", "Utilities","BldgType","Heating","KitchenQual","SaleCondition","LandSlope"]

In [615]:
df.isna().sum()

MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
Street              0
                 ... 
MoSold              0
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
Length: 74, dtype: int64

In [616]:
df_cat_cols = df.select_dtypes('object')

In [617]:
for col in df_cat_cols.columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [618]:
df_num = df.select_dtypes(exclude='object')
df_num.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,61,0,0,0,0,0,2,2008,208500.0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,298,0,0,0,0,0,0,5,2007,181500.0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,42,0,0,0,0,0,9,2008,223500.0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,35,272,0,0,0,0,2,2006,140000.0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,192,84,0,0,0,0,0,12,2008,250000.0


In [619]:
for col in df_num.columns:
    df[col] = df[col].fillna(df[col].mean())

In [620]:
df.isna().sum().sort_values(ascending=False)

MSSubClass      0
GarageYrBlt     0
Fireplaces      0
Functional      0
TotRmsAbvGrd    0
               ..
ExterQual       0
MasVnrArea      0
Exterior2nd     0
Exterior1st     0
SalePrice       0
Length: 74, dtype: int64

In [621]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500.0
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500.0
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500.0
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000.0
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000.0


In [623]:
df = pd.get_dummies(df, dtype='int')
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [624]:
train_df = df.iloc[:1460, :]
train_df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790.0,163.0,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49.0,1029.0,...,0,0,0,1,0,0,0,0,1,0


In [625]:
test_df = df.iloc[1460:, :]
test_df

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1460,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1461,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1462,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1463,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1464,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2915,160,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,...,0,0,0,1,1,0,0,0,0,0
2916,20,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,...,0,0,0,1,1,0,0,0,0,0
2917,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [626]:
to_drop = ['GarageYrBlt','1stFlrSF','GarageArea','TotRmsAbvGrd']
train_df = train_df.drop(to_drop, axis=1)
test_df = test_df.drop(to_drop, axis=1)

In [627]:
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

In [628]:
# to_drop = ['GarageYrBlt','1stFlrSF','GarageArea','TotRmsAbvGrd']
# X = X.drop(to_drop, axis=1)
X.shape

(1460, 262)

In [629]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [630]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_prediction = lr.predict(X_test)

In [631]:
print('MAE: ', mean_absolute_error(y_test, lr_prediction))
print('MSE: ', mean_squared_error(y_test, lr_prediction))
print('R2 score', r2_score(y_test, lr_prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test, lr_prediction)))

MAE:  17987.724140119124
MSE:  762683215.9161973
R2 score 0.8907032146536342
RMSE: 27616.719861638117


In [632]:
from sklearn.ensemble import RandomForestRegressor
clf_random_tree = RandomForestRegressor(max_depth=30, random_state=1,n_estimators=2000)
clf_random_tree.fit(X_train,y_train)

random_pred = clf_random_tree.predict(X_test)

In [633]:
print('MAE: ', mean_absolute_error(y_test, random_pred))
print('MSE: ', mean_squared_error(y_test, random_pred))
print('R2 score', r2_score(y_test, random_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, random_pred)))

MAE:  16876.252687214615
MSE:  715501595.6332835
R2 score 0.8974646056436806
RMSE: 26748.861576397667


In [579]:
# from sklearn.model_selection import train_test_split
# from catboost import CatBoostRegressor


# # Define CatBoostRegressor model
# clf_catboost = CatBoostRegressor(iterations=1000,
#                                  learning_rate=0.08251,
#                                  depth=5,
#                                  l2_leaf_reg=3,
#                                  loss_function='RMSE',
#                                  random_seed=0)

# # Train the model
# clf_catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

# # Predict on test data
# catboost_pred = clf_catboost.predict(X_test)

0:	learn: 74297.0430622	test: 79963.9572426	best: 79963.9572426 (0)	total: 2.99ms	remaining: 2.99s
100:	learn: 18389.4692492	test: 25237.8006244	best: 25237.8006244 (100)	total: 235ms	remaining: 2.09s
200:	learn: 13636.0907678	test: 23586.6713156	best: 23586.4916700 (199)	total: 464ms	remaining: 1.84s
300:	learn: 10865.1295462	test: 22965.5184778	best: 22965.5184778 (300)	total: 692ms	remaining: 1.61s
400:	learn: 8967.6077210	test: 22753.1887348	best: 22753.1887348 (400)	total: 918ms	remaining: 1.37s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 22721.88222
bestIteration = 408

Shrink model to first 409 iterations.


In [634]:
#for test split 0.3 is the best !!!
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor


# Define CatBoostRegressor model
clf_catboost = CatBoostRegressor(iterations=1000,
                                 learning_rate=0.0734305,
                                 depth=4,
                                 l2_leaf_reg=1,
                                 loss_function='RMSE',
                                 random_seed=0)

# Train the model
clf_catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

# Predict on test data
catboost_pred = clf_catboost.predict(X_test)

0:	learn: 74007.9338405	test: 79851.0093219	best: 79851.0093219 (0)	total: 2.1ms	remaining: 2.09s
100:	learn: 18990.6687600	test: 25386.5026044	best: 25386.5026044 (100)	total: 191ms	remaining: 1.7s
200:	learn: 14426.4379354	test: 23205.7853533	best: 23192.7789424 (199)	total: 401ms	remaining: 1.59s
300:	learn: 11824.1220458	test: 22784.5867759	best: 22777.2651832 (297)	total: 580ms	remaining: 1.35s
400:	learn: 10002.5656278	test: 22514.5531493	best: 22514.5531493 (400)	total: 754ms	remaining: 1.13s
500:	learn: 8634.1597836	test: 22355.8681934	best: 22355.8681934 (500)	total: 923ms	remaining: 920ms
600:	learn: 7600.1621371	test: 22314.5893817	best: 22312.1170395 (592)	total: 1.09s	remaining: 724ms
700:	learn: 6711.0394559	test: 22265.0877668	best: 22264.6371954 (699)	total: 1.26s	remaining: 537ms
800:	learn: 5996.6159770	test: 22220.6564800	best: 22217.0333964 (794)	total: 1.43s	remaining: 355ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 22214.72369
bestIteration

In [635]:
print('MAE: ', mean_absolute_error(y_test, catboost_pred))
print('MSE: ', mean_squared_error(y_test, catboost_pred))
print('R2 score', r2_score(y_test, catboost_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, catboost_pred)))

MAE:  14141.08012134367
MSE:  493493948.80778575
R2 score 0.929279547435981
RMSE: 22214.72369415802


In [636]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

In [637]:
print('MAE: ', mean_absolute_error(y_test, xgb_pred))
print('MSE: ', mean_squared_error(y_test, xgb_pred))
print('R2 score', r2_score(y_test, xgb_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, xgb_pred)))

MAE:  15808.30698844178
MSE:  615058127.9477854
R2 score 0.9118587182948665
RMSE: 24800.365480125194


In [638]:
test_df = test_df.drop('SalePrice', axis=1)

In [639]:
test_df.shape

(1459, 262)

In [642]:
test_id = test['Id']
test_id
test_predictions_catboost = clf_catboost.predict(test_df)
# test_predictions_catboost
test_predictions_catboost_df = pd.DataFrame(test_predictions_catboost, columns=['SalePrice'])
submit_df = pd.concat([test_id, test_predictions_catboost_df], axis=1)
submit_df.to_csv('submission.csv', index=False)

In [643]:
submit_df.head()

Unnamed: 0,Id,SalePrice
0,1461,124814.03034
1,1462,163880.708971
2,1463,189442.214266
3,1464,185266.379218
4,1465,198892.851707
