In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
train = pd.read_csv('train.csv')

In [None]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
def num_list(df):
    """Return the names of the non-object columns of ``df`` that are usable as features.

    The identifier column ``'Id'`` and the target ``'SalePrice'`` are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. The result depends only on this argument
        (the previous version silently inspected the global ``train`` frame).

    Returns
    -------
    list
        Column names in the order they appear in ``df``.
    """
    excluded = {'Id', 'SalePrice'}
    # Iterate the columns directly instead of going through a set so the
    # returned order is deterministic and follows the frame.
    return [col for col in df.select_dtypes(exclude='object').columns
            if col not in excluded]

In [None]:
def cat_list(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    Names are returned in column order.
    """
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    return object_columns

In [None]:
def drop_Columns(df,col):
    """Remove one column, or a list-like of columns, from ``df`` in place.

    ``col`` is passed straight to ``DataFrame.drop``; nothing is returned.
    """
    df.drop(columns=col, inplace=True)

In [None]:
def fills_missing_catvalue(df):
    """Fill missing values in every object-dtype column of ``df`` with that column's mode.

    The columns are those reported by ``cat_list(df)``. ``df`` is modified in
    place and nothing is returned.

    A column that has no non-null value has no mode; it is left untouched
    instead of raising on the empty ``mode()`` result.
    """
    # NOTE(review): every missing category becomes the most frequent label, even
    # where a missing value may mean "feature not present" - confirm this is the
    # intended treatment for such columns.
    for col in cat_list(df):
        modes = df[col].mode()
        if modes.empty:
            # Nothing to impute from: the column is entirely missing.
            continue
        # Several values can tie for the mode; the first one returned is used.
        df[col] = df[col].fillna(modes.iloc[0])

In [None]:
def fills_missing_numvalue(df):
    """Replace missing values in the columns named by ``num_list(df)`` with the column mean.

    ``df`` is modified in place; nothing is returned.
    """
    for name in num_list(df):
        column = df[name]
        df[name] = column.fillna(column.mean())

In [None]:
missing_value = pd.DataFrame({
        'Column Name' : train.columns,
        'Data Types'  : train.dtypes.values,
        'Null Point'  : train.isnull().sum()
})

missing_value[missing_value['Null Point'] > 0].sort_values(by=['Data Types', 'Null Point'])

Unnamed: 0,Column Name,Data Types,Null Point
MasVnrArea,MasVnrArea,float64,8
GarageYrBlt,GarageYrBlt,float64,81
LotFrontage,LotFrontage,float64,259
Electrical,Electrical,object,1
MasVnrType,MasVnrType,object,8
BsmtQual,BsmtQual,object,37
BsmtCond,BsmtCond,object,37
BsmtFinType1,BsmtFinType1,object,37
BsmtExposure,BsmtExposure,object,38
BsmtFinType2,BsmtFinType2,object,38


In [None]:
drop_Columns(train, 'Id')

In [None]:
fills_missing_numvalue(train)

In [None]:
fills_missing_catvalue(train)

In [None]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000


In [None]:
numerical = num_list(train)

In [None]:
correlations = train[numerical].apply(lambda x: x.corr(train['SalePrice']))
print(correlations)

TotalBsmtSF      0.613581
BsmtHalfBath    -0.016844
MoSold           0.046432
LotFrontage      0.334901
GrLivArea        0.708624
KitchenAbvGr    -0.135907
BsmtUnfSF        0.214479
GarageArea       0.623431
1stFlrSF         0.605852
LowQualFinSF    -0.025606
FullBath         0.560664
2ndFlrSF         0.319334
3SsnPorch        0.044584
ScreenPorch      0.111447
YearBuilt        0.522897
OverallCond     -0.077856
EnclosedPorch   -0.128578
MSSubClass      -0.084284
PoolArea         0.092404
BsmtFinSF2      -0.011378
YearRemodAdd     0.507101
BsmtFinSF1       0.386420
BedroomAbvGr     0.168213
TotRmsAbvGrd     0.533723
OverallQual      0.790982
GarageYrBlt      0.470177
WoodDeckSF       0.324413
Fireplaces       0.466929
GarageCars       0.640409
MiscVal         -0.021190
BsmtFullBath     0.227122
LotArea          0.263843
YrSold          -0.028923
OpenPorchSF      0.315856
HalfBath         0.284108
MasVnrArea       0.475241
dtype: float64


In [None]:
# Drop numeric features that are only weakly related to SalePrice.
# The magnitude of the correlation is compared: a plain `correlations < 0.2`
# would also throw away a feature with a strong *negative* correlation.
# For the correlations printed above, the selected columns are unchanged.
drop_Columns(train, correlations[correlations.abs() < 0.2].index)

In [None]:
unique_count = pd.DataFrame(
    {
        'Column Name' : train.columns,
        'Data Types'  : train.dtypes.values,
        'Unique Count'  : train.nunique().values
    }
)

unique_count[unique_count['Data Types'] == 'object'][['Column Name', 'Unique Count']]

Unnamed: 0,Column Name,Unique Count
0,MSZoning,5
3,Street,2
4,Alley,2
5,LotShape,4
6,LandContour,4
7,Utilities,2
8,LotConfig,5
9,LandSlope,3
10,Neighborhood,25
11,Condition1,9


In [None]:
categorical = cat_list(train)

In [None]:
# Integer-encode every categorical column of the training frame.
# A separate encoder is kept per column: refitting one shared encoder in the
# loop would overwrite the category -> code mapping of every column but the last,
# so it could neither be inverted nor reapplied to other data.
label_encoders = {}
for col in categorical:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

In [None]:
train[categorical]

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,3,1,0,3,3,0,4,0,5,2,...,1,1,4,4,2,2,2,2,8,4
1,3,1,0,3,3,0,2,0,24,1,...,1,1,4,4,2,2,2,2,8,4
2,3,1,0,0,3,0,4,0,5,2,...,1,1,4,4,2,2,2,2,8,4
3,3,1,0,0,3,0,0,0,6,2,...,5,2,4,4,2,2,2,2,8,0
4,3,1,0,0,3,0,2,0,15,2,...,1,1,4,4,2,2,2,2,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,3,1,0,3,3,0,4,0,8,2,...,1,1,4,4,2,2,2,2,8,4
1456,3,1,0,3,3,0,4,0,14,2,...,1,2,4,4,2,2,2,2,8,4
1457,3,1,0,3,3,0,4,0,6,2,...,1,1,4,4,2,2,0,2,8,4
1458,3,1,0,3,3,0,4,0,12,2,...,1,2,4,4,2,2,2,2,8,4


In [None]:
test = pd.read_csv('test.csv')

In [None]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal




In [None]:
def num_list_test(df):
    """Return the names of the non-object columns of ``df``, excluding ``'Id'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. The result depends only on this argument
        (the previous version silently inspected the global ``test`` frame).

    Returns
    -------
    list
        Column names in the order they appear in ``df``.
    """
    return [col for col in df.select_dtypes(exclude='object').columns
            if col != 'Id']

In [None]:
def fills_missing_numvalue_test(df):
    """Replace missing values in the columns named by ``num_list_test(df)`` with the column mean.

    ``df`` is modified in place; nothing is returned.
    """
    for name in num_list_test(df):
        column = df[name]
        df[name] = column.fillna(column.mean())

In [None]:
missing_value = pd.DataFrame({
        'Column Name' : test.columns,
        'Data Types'  : test.dtypes.values,
        'Null Point'  : test.isnull().sum()
})

missing_value[missing_value['Null Point'] > 0].sort_values(by=['Data Types', 'Null Point'])

Unnamed: 0,Column Name,Data Types,Null Point


In [None]:
fills_missing_catvalue(test)

In [None]:
fills_missing_numvalue_test(test)

In [None]:
# Remove from the test frame exactly the numeric features that the correlation
# filter removed from the training frame, so both frames keep the same features.
# The list is derived rather than hard-coded: every name in `correlations` that
# is no longer a column of `train` was dropped by that filter.
dropped_from_train = [col for col in correlations.index if col not in train.columns]
drop_Columns(test, dropped_from_train)

In [None]:
categorical_test = cat_list(test)

In [None]:
# Encode the test categoricals with the SAME category -> integer mapping that
# the training frame received. Fitting a fresh LabelEncoder on the test frame
# numbers the categories by the labels that happen to occur in test.csv, so the
# same label can end up with a different code than the one the models saw.
# `train` is already encoded at this point, so the training labels are re-read
# from the raw file; dropping nulls gives the label set the training encoder
# saw after mode imputation.
train_labels = pd.read_csv('train.csv', usecols=categorical_test)
le = LabelEncoder()
for col in categorical_test:
    le.fit(train_labels[col].dropna())
    # Codes follow the order of le.classes_; a label that never occurred in
    # training is encoded as -1 instead of raising.
    test[col] = pd.Categorical(test[col], categories=le.classes_).codes

In [None]:
test[categorical_test]

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,2,1,0,3,3,0,4,0,12,1,...,1,2,3,4,2,0,2,2,8,4
1,3,1,0,0,3,0,0,0,12,2,...,1,2,3,4,2,0,2,0,8,4
2,3,1,0,0,3,0,4,0,8,2,...,1,0,3,4,2,0,2,2,8,4
3,3,1,0,0,3,0,4,0,8,2,...,1,0,3,4,2,0,2,2,8,4
4,3,1,0,0,1,0,4,0,22,2,...,1,1,3,4,2,0,2,2,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,4,1,0,3,3,0,4,0,10,2,...,1,2,3,4,2,0,2,2,8,4
1455,4,1,0,3,3,0,4,0,10,2,...,4,2,3,4,2,0,2,2,8,0
1456,3,1,0,3,3,0,4,0,11,2,...,5,2,3,4,2,0,2,2,8,0
1457,3,1,0,3,3,0,4,0,11,2,...,1,2,3,4,2,0,2,2,8,4


**Modelling**

In [None]:
test.shape

(1459, 66)

In [None]:
train.shape

(1460, 66)

In [None]:
# Split the training frame into the target column and the remaining features.
y_train = train.loc[:, 'SalePrice']
X_train = train.drop(columns=['SalePrice'])


Models with explicitly set hyperparameters

In [None]:
# Histogram-based gradient boosting; constructed and fitted in one expression
# (fit returns the estimator itself).
hist_model = HistGradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=31,
    random_state=42,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
hist_model.score(X_train, y_train)

0.9799333062025563

In [None]:
# XGBoost regressor; constructed and fitted in one expression
# (fit returns the estimator itself).
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
xgb_model.score(X_train, y_train)

0.9963028237481739

In [None]:
# Gradient boosting regressor; constructed and fitted in one expression
# (fit returns the estimator itself).
gradient_model = GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
gradient_model.score(X_train, y_train)

0.9623487684282996

In [None]:
# Random forest regressor; constructed and fitted in one expression
# (fit returns the estimator itself).
forest_model = RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    random_state=42,
    n_jobs=1,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
forest_model.score(X_train, y_train)

0.9812918262431861

In [None]:
# k-nearest-neighbours regressor; constructed and fitted in one expression
# (fit returns the estimator itself).
neighbor_model = KNeighborsRegressor(
    n_neighbors=5,
    weights='distance',
    algorithm='brute',
    n_jobs=1,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
neighbor_model.score(X_train, y_train)

0.9999969455613761

In [None]:
# Ordinary least squares with an intercept; constructed and fitted in one
# expression (fit returns the estimator itself).
linear_model = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
linear_model.score(X_train, y_train)

0.8474481510796218

In [None]:
# Single decision tree; constructed and fitted in one expression
# (fit returns the estimator itself).
tree_model = DecisionTreeRegressor(
    criterion='squared_error',
    splitter='best',
    random_state=42,
).fit(X_train, y_train)

# Scored on the same rows the model was fitted on.
tree_model.score(X_train, y_train)

0.9999969455613789

**Prediction HistGradientBoostingRegressor**

In [None]:
# Build the test feature matrix from the training feature names rather than
# "everything except Id": this guarantees the same features in the same order
# as X_train and raises a KeyError if the test frame lacks one of them.
X_test = test[X_train.columns]
pred = hist_model.predict(X_test)

In [None]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'Id' : df_test['Id'],
    'SalePrice' : pred
})

In [None]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,127099.542235
1,1462,155978.825374
2,1463,178427.019815
3,1464,182167.100265
4,1465,199503.510041


In [None]:
submission.to_csv('SubmitHistwithparam.csv', index=False)

**Prediction XGBRegressor**

In [None]:
pred = xgb_model.predict(X_test)

In [None]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'Id' : df_test['Id'],
    'SalePrice' : pred
})

In [None]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,129554.453125
1,1462,154730.734375
2,1463,182473.5
3,1464,187700.53125
4,1465,185227.625


In [None]:
submission.to_csv('Submitxgbwithparam.csv', index=False)

**Prediction RandomForestRegressor**

In [None]:
pred = forest_model.predict(X_test)

In [None]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'Id' : df_test['Id'],
    'SalePrice' : pred
})

In [None]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,122621.32
1,1462,156010.9
2,1463,180904.19
3,1464,179829.9
4,1465,199589.97


In [None]:
submission.to_csv('SubmitForesttwithparam.csv', index=False)

**Prediction KNeighborsRegressor**

In [None]:
pred = neighbor_model.predict(X_test)

In [None]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'Id' : df_test['Id'],
    'SalePrice' : pred
})

In [None]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,142148.346297
1,1462,176653.734501
2,1463,176327.509244
3,1464,182782.857877
4,1465,142784.20832


In [None]:
submission.to_csv('SubmitKNeightwithparam.csv', index=False)

**Prediction DecisionTreeRegressor**

In [None]:
pred = tree_model.predict(X_test)

In [None]:
df_test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'Id' : df_test['Id'],
    'SalePrice' : pred
})

In [None]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,129000.0
1,1462,157000.0
2,1463,167500.0
3,1464,187500.0
4,1465,220000.0


In [None]:
submission.to_csv('SubmitTreewithparam.csv', index=False)