# House Prices - My Solution

#### I focused on pre-processing: missing-value imputation, outlier removal, and logarithmic transformation.

#### I used the yeo-johnson transformation, which resulted in an improved score. 

#### I used ElasticNet for the model and the final score was 0.13202.

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

import scipy 
from scipy.stats import norm

from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# EDA

In [None]:
# Directory that holds the competition CSV files.
PATH = '/kaggle/input/house-prices-advanced-regression-techniques/'

# Load the training set, the test set and the sample submission, in that order.
train, test, sample = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

#### Linear regression tends to work better when the objective variable is roughly normally distributed, so I applied a log transformation (`log1p`) to "SalePrice".

In [None]:
# Replace SalePrice by log(1 + SalePrice); predictions made on this scale
# have to be mapped back before they are reported as prices.
train['SalePrice'] = train['SalePrice'].pipe(np.log1p)

In [None]:
# Distribution of the transformed SalePrice with a fitted normal curve drawn
# on top for comparison.
fig, ax = plt.subplots(figsize=(10, 4))
sns.distplot(
    train['SalePrice'],
    fit=norm,
    fit_kws={'color': 'tomato', 'label': 'norm'},
    ax=ax,
)
ax.set_ylabel('')
ax.legend()
plt.show()

#### Some of the missing values make sense; I read the description and completed them one by one.

In [None]:
# Text columns whose missing entries are replaced by the literal string 'NaN'.
train_text_cols = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
# Numeric columns whose missing entries are replaced by zero.
train_zero_fill = {'LotFrontage': 0.0, 'GarageYrBlt': 0, 'MasVnrArea': 0.0}

for col in train_text_cols:
    train[col] = train[col].fillna('NaN')
for col, fill in train_zero_fill.items():
    train[col] = train[col].fillna(fill)

In [None]:
# Fill value per column for the test set: the string 'NaN' for text columns,
# zero for numeric ones. BsmtFinSF1 and TotalBsmtSF are filled only here,
# not in the corresponding training-set cell.
test_fill_values = {
    'LotFrontage': 0.0,
    'Alley': 'NaN',
    'BsmtQual': 'NaN',
    'BsmtCond': 'NaN',
    'BsmtExposure': 'NaN',
    'BsmtFinType1': 'NaN',
    'BsmtFinType2': 'NaN',
    'FireplaceQu': 'NaN',
    'GarageType': 'NaN',
    'GarageYrBlt': 0,
    'GarageFinish': 'NaN',
    'GarageQual': 'NaN',
    'GarageCond': 'NaN',
    'PoolQC': 'NaN',
    'Fence': 'NaN',
    'MiscFeature': 'NaN',
    'MasVnrArea': 0.0,
    'BsmtFinSF1': 0.0,
    'TotalBsmtSF': 0.0,
}

for col, fill in test_fill_values.items():
    test[col] = test[col].fillna(fill)

In [None]:
# "GarageArea" correlates strongly with "GarageType", so the missing
# "GarageArea" values are completed from rows selected by "GarageType".
# Select the rows whose GarageType is 'Detchd' in each frame.

train_detchd = train.loc[train['GarageType'].eq('Detchd')]
test_detchd  = test.loc[test['GarageType'].eq('Detchd')]

In [None]:
# Fill missing GarageArea values with the mean GarageArea of the 'Detchd'
# rows of the same frame.
detchd_mean_train = train_detchd['GarageArea'].mean()
train['GarageArea'] = train['GarageArea'].fillna(detchd_mean_train)

detchd_mean_test = test_detchd['GarageArea'].mean()
test['GarageArea'] = test['GarageArea'].fillna(detchd_mean_test)

#### I removed the columns that have missing values and a low correlation coefficient with "SalePrice".

In [None]:
# The 15 numeric columns with the smallest absolute correlation to SalePrice.
# train still contains text columns at this point; recent pandas versions
# raise in corr() on them instead of silently skipping them, so restrict the
# frame to numeric columns explicitly.
train.select_dtypes(include=[np.number]).corr()['SalePrice'].abs().sort_values(ascending=True).head(15)

In [None]:
# Number of missing values per test column, showing only columns that have any.
test.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Columns removed from both frames before modelling.
dropped_cols = [
    'Id', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Electrical', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath',
    'BsmtHalfBath', 'Functional', 'SaleType',
]

for frame in (train, test):
    frame.drop(columns=dropped_cols, inplace=True)

In [None]:
# Names of the training columns that still have dtype 'object'.
train.dtypes.loc[lambda kinds: kinds == 'object'].index

In [None]:
# Label Encoding
#
# The encoder is fitted on the training data only and the fitted mapping is
# reused for the test data. Re-fitting on the test data would assign integer
# codes from the categories observed there, so the same label could receive
# different codes in train and test whenever the two category sets differ.

categorical_cols = [
    'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature', 'SaleCondition',
]

# Values that were not seen while fitting (including values still missing in
# the test data) are encoded as NaN instead of raising.
# NOTE(review): any NaN produced here must be imputed before modelling.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

encoded = oe.fit_transform(train[categorical_cols].values)
train[categorical_cols] = encoded

encoded = oe.transform(test[categorical_cols].values)
test[categorical_cols] = encoded

# Further Exploratory Analysis

#### I completed the missing values based on the information in the other columns. 

#### I predicted the missing values by using the column with the missing values as the objective variable and the other columns as explanatory variables.

In [None]:
# I checked the correlation coefficients between columns with missing values and other columns.

def check(df, column):
    """Print the columns of *df* most correlated with *column*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated with each other.
    column : str
        Name of a numeric column of *df*.

    Prints the 13 largest absolute correlation coefficients with *column*
    (the column itself included), in descending order. Returns None.
    """
    # Use the frame that was passed in rather than a global one, and keep only
    # numeric columns so that text/categorical columns cannot break corr().
    numeric = df.select_dtypes(include=[np.number])
    col = numeric.corr()[column].abs()
    print(col.sort_values(ascending=False).head(13))

In [None]:
def missing_value_classify(df,column,column2,column3,column4,column5,column6,column7):
    """Fill the missing values of *column* in place with classifier predictions.

    A random forest classifier is trained on the rows where *column* is
    present, using the six other columns as features, and its predictions
    replace the missing entries of *column* in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Column with missing values; used as the classification target.
    column2 ... column7 : str
        Feature columns used to predict *column*.

    Returns None. The predicted values are printed. When *column* has no
    missing value the frame is left untouched and nothing is printed.
    """
    features = [column2, column3, column4, column5, column6, column7]
    missing = df[column].isnull()

    # Nothing to impute: predicting on an empty array would raise.
    if not missing.any():
        return

    known = df.loc[~missing, [column] + features]
    X = known[features].values
    y = known[column].values

    rf = RandomForestClassifier(random_state=0,n_estimators=1000,n_jobs=-1)
    rf.fit(X,y)

    predict = rf.predict(df.loc[missing, features].values)
    print(predict)
    df.loc[missing, column] = predict

In [None]:
# outlier
def outlier(df, columns):
    """Return *df* without the rows that are outliers in any of *columns*.

    For each column in turn, rows lying three or more standard deviations
    from the column mean are removed. Columns are processed sequentially, so
    the mean and standard deviation of a later column are computed on the
    rows that survived the earlier ones. Rows whose value is missing in a
    processed column are removed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Numeric columns to filter on.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    for col in columns:
        mean, std = df[col].mean(), df[col].std()
        # A constant (std == 0) or too-short (std is NaN) column has no
        # outliers; dividing by such a std would turn every score into
        # NaN/inf and drop all rows.
        if not std > 0:
            continue
        border = (df[col] - mean).abs() / std
        df = df[border < 3.0]
    return df

In [None]:
# Print the columns most correlated with 'KitchenQual'.
check(train, 'KitchenQual')

In [None]:
# Fill the missing 'KitchenQual' values of the test set in place, predicting
# them from the six columns listed after it.
missing_value_classify(test,'KitchenQual','ExterQual','OverallQual','BsmtQual','YearRemodAdd','HeatingQC','TotalBsmtSF')

In [None]:
# Print the columns most correlated with 'GarageCars'.
check(train, 'GarageCars')

In [None]:
# Fill the missing 'GarageCars' values of the test set in place, predicting
# them from the six columns listed after it.
missing_value_classify(test,'GarageCars','GarageArea','OverallQual','GarageYrBlt','YearBuilt','GrLivArea','FullBath')

In [None]:
# Remaining missing values per training column (only columns that have any).
train.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Remaining missing values per test column (only columns that have any).
test.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Convert the year columns into ages, measured back from the most recent
# year present in the column.
# For example: 1999 → 11years (2010-1999)

for year_col in ('YearBuilt', 'YearRemodAdd'):
    train[year_col] = train[year_col].max() - train[year_col]


# Binning: cut the ages into intervals with fixed, explicit edges.

built_edges = [-1,0.1,5,10,20,30,40,50,60,70,80,90,100,110,120,130,140]
remod_edges = [-1,0,5,10,15,20,25,30,35,40,50,55,60,100]

train['YearBuilt_bin']    = pd.cut(train['YearBuilt'], bins=built_edges)
train['YearRemodAdd_bin'] = pd.cut(train['YearRemodAdd'], bins=remod_edges)

In [None]:
# Convert the test year columns into ages and cut them with the same explicit
# edges that are used for the training data.
# NOTE(review): the reference year is the maximum of the test column itself,
# independent of the training data — confirm both maxima agree.
for year_col in ('YearBuilt', 'YearRemodAdd'):
    test[year_col] = test[year_col].max() - test[year_col]


test_built_edges = [-1,0.1,5,10,20,30,40,50,60,70,80,90,100,110,120,130,140]
test_remod_edges = [-1,0,5,10,15,20,25,30,35,40,50,55,60,100]

test['YearBuilt_bin']     = pd.cut(test['YearBuilt'], bins=test_built_edges)
test['YearRemodAdd_bin']  = pd.cut(test['YearRemodAdd'], bins=test_remod_edges)

In [None]:
# Names of the 15 numeric columns with the largest absolute correlation to
# SalePrice. The interval-valued *_bin columns are categorical at this point
# and cannot be correlated, so only numeric columns are considered.
train.select_dtypes(include=[np.number]).corr()['SalePrice'].abs().sort_values(ascending=False).head(15).index

In [None]:
# Display SalePrice next to a hand-picked list of feature columns.
preview_cols = [
    'SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
    'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt', 'ExterQual',
    'BsmtQual', 'YearRemodAdd', 'TotRmsAbvGrd', 'KitchenQual', 'GarageType',
]
train[preview_cols]

In [None]:
# Binning
#
# The ten equal-width bin edges are derived from the training data only and
# reused for the test data, so that a bin code denotes the same value range
# in both frames. Computing the edges separately per frame would make the
# codes incomparable between train and test.

for col in ['GrLivArea', 'GarageArea', 'TotalBsmtSF', '1stFlrSF']:
    _, edges = pd.cut(train[col], 10, retbins=True)
    # Open the outer edges so that test values outside the training range
    # fall into the first/last bin instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    # labels=False yields the bin index directly, so no separate encoder
    # (which would have to be fitted per frame) is needed.
    train[col + '_bin'] = pd.cut(train[col], edges, labels=False).astype(float)
    test[col + '_bin']  = pd.cut(test[col], edges, labels=False).astype(float)

# The year bins were cut with identical explicit edges in both frames, so
# their categorical codes already agree between train and test.
# A value outside every bin gets the code -1.
for col in ['YearBuilt_bin', 'YearRemodAdd_bin']:
    train[col] = train[col].cat.codes.astype(float)
    test[col]  = test[col].cat.codes.astype(float)

In [None]:
# The 15 largest absolute correlations with SalePrice, in descending order.
train.corr()['SalePrice'].abs().sort_values(ascending=False).head(15)

In [None]:
# Total area: row-wise sum of the 1stFlrSF, 2ndFlrSF, TotalBsmtSF and
# GrLivArea columns (a missing value in any of them gives a missing total).

area_parts = ['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea']
train['TotalArea'] = train[area_parts].sum(axis=1, skipna=False)

In [None]:
# Same row-wise total for the test set (a missing part gives a missing total).
test_area_parts = ['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea']
test['TotalArea'] = test[test_area_parts].sum(axis=1, skipna=False)

In [None]:
# Left: SalePrice against TotalArea with a regression line.
# Right: SalePrice distribution per OverallQual level.
fig, (ax_area, ax_qual) = plt.subplots(1, 2, figsize=(14, 8))

sns.regplot(x=train['TotalArea'], y=train['SalePrice'], ax=ax_area)
ax_area.grid(True)

sns.boxplot(x=train['OverallQual'], y=train['SalePrice'], ax=ax_qual)

plt.show()

In [None]:
# outlier
# Keep the rows whose TotalArea is below 12000 or whose SalePrice exceeds
# 12.50, then remove the OverallQual outliers with the outlier() helper.

area_ok = train['TotalArea'].lt(12000)
price_high = train['SalePrice'].gt(12.50)
train = train.loc[area_ok | price_high]

train = outlier(train, columns=['OverallQual'])

In [None]:
# SalePrice against TotalArea once more, after the outlier removal.
fig, ax = plt.subplots(1, 1, figsize=(18, 7))
sns.regplot(x=train['TotalArea'], y=train['SalePrice'], ax=ax)
ax.grid(True)
plt.show()

In [None]:
# Number of rows and columns left in the training set after outlier removal.
train.shape

In [None]:
# To reduce multicollinearity, remove TotalArea and GarageArea from the
# training features.

train.drop(columns=['TotalArea', 'GarageArea'], inplace=True)

In [None]:
# Remove the same two columns from the test features.
test.drop(columns=['TotalArea', 'GarageArea'], inplace=True)

#### I used the yeo-johnson transform, which is a generalization of the logarithmic transform. 

In [None]:
# Yeo-Johnson transform of the three area columns.
#
# For every column the transformer is fitted on the training data only and
# the same fitted transform is then applied to the test data. Fitting a
# second time on the test data would estimate different parameters, so equal
# raw values would map to different model inputs in train and test.
for col in ['GrLivArea', 'TotalBsmtSF', '1stFlrSF']:
    pt = PowerTransformer(method='yeo-johnson')

    # The transformer expects a 2-D array; flatten the result back to 1-D
    # before storing it as a column.
    train[col] = pt.fit_transform(train[col].values.reshape(-1,1)).ravel()
    test[col]  = pt.transform(test[col].values.reshape(-1,1)).ravel()

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# validation (fixed seed for reproducibility).
y = train['SalePrice']
X = train.drop(columns=['SalePrice'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=.2, random_state=0
)

# Modeling

In [None]:
# Fit an ElasticNet model on the training split.
enet_params = {'alpha': 0.005, 'l1_ratio': 0.3}
elastic_net = ElasticNet(**enet_params)
elastic_net.fit(X_train, y_train)

In [None]:
# Root mean squared error on the validation split, on the transformed
# SalePrice scale.
y_valid_pred = elastic_net.predict(X_valid)

rmse_valid = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(rmse_valid)

In [None]:
# Predict on the test set. Select the columns explicitly in the order the
# model was trained on, so a missing or re-ordered column fails loudly here
# instead of silently feeding the wrong feature to a coefficient.
y_test = elastic_net.predict(test[X_train.columns])

In [None]:
# The model was trained on log1p(SalePrice), so predictions are mapped back
# to prices with expm1, the exact inverse of log1p (exp would leave every
# price off by one).
submission = pd.DataFrame({'Id':sample['Id'], 'SalePrice':np.expm1(y_test)})

# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index=False)

In [None]:
# Display the submission frame that was written to 'submission.csv'.
submission