In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder,RobustScaler
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.pipeline import make_pipeline

# Data Anaylsis

In [3]:
house_data=pd.read_csv("house_train.csv")
test_data=pd.read_csv("house_test.csv")
print(house_data.shape,test_data.shape)

(1460, 81) (1459, 80)


In [4]:
X_train=house_data.drop(['SalePrice','Id'], axis=1)
X_test=test_data.drop(['Id'],axis=1)
full_data=pd.concat([X_train,X_test]).reset_index(drop=True)
print(full_data.shape)

(2919, 79)


#Missing values

In [5]:
full_data.info()
print(full_data.isnull().sum().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2919 non-null   int64  
 1   MSZoning       2915 non-null   object 
 2   LotFrontage    2433 non-null   float64
 3   LotArea        2919 non-null   int64  
 4   Street         2919 non-null   object 
 5   Alley          198 non-null    object 
 6   LotShape       2919 non-null   object 
 7   LandContour    2919 non-null   object 
 8   Utilities      2917 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18  YearBuil

In [6]:
corr_matrix=house_data.corr()
print(corr_matrix['SalePrice'].sort_values(ascending=False))

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [7]:
print(full_data.groupby('Neighborhood')['LotFrontage'].mean())
full_data['LotFrontage']=full_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x:x.fillna(x.mean()))

Neighborhood
Blmngtn    46.900000
Blueste    27.300000
BrDale     21.500000
BrkSide    55.789474
ClearCr    88.150000
CollgCr    71.336364
Crawfor    69.951807
Edwards    66.910112
Gilbert    74.207207
IDOTRR     62.241379
MeadowV    25.606061
Mitchel    75.144444
NAmes      75.210667
NPkVill    28.142857
NWAmes     81.517647
NoRidge    91.629630
NridgHt    84.184049
OldTown    61.777293
SWISU      59.068182
Sawyer     74.551020
SawyerW    70.669811
Somerst    64.549383
StoneBr    62.173913
Timber     81.157895
Veenker    72.000000
Name: LotFrontage, dtype: float64


In [8]:
#columns with numerical data
num_dtypes = ['int64', 'float64']
num = []
for i in full_data.columns:
    if full_data[i].dtype in num_dtypes:
        num.append(i)
print(num)
print(full_data[num].isnull().sum().sum())

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
192


In [9]:
#for rest of the numerical data, it consist of negligible missing values and mostly doesnt not have that feature. 
#So replacing all missing values with zero.
full_data[num]=full_data[num].fillna(0)
print(full_data[num].isnull().sum().sum())

0


In [10]:
#this are categorical values in numerical type
full_data[['MSSubClass','MoSold', 'YrSold']]=full_data[['MSSubClass','MoSold', 'YrSold']].astype(str)

In [11]:
print(full_data.groupby(['MSSubClass']).MSZoning)
full_data['MSZoning'] = full_data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001ED0B69E0D0>


In [12]:
obj=[]
for i in full_data.columns:
    if full_data[i].dtype==object:
        obj.append(i)
print(obj)
print(full_data[obj].isnull().sum().sum())

['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
13283


In [13]:
A=[]
for i in obj:
    A.append([i,full_data[i].value_counts()])
print(A)

[['MSSubClass', 20     1079
60      575
50      287
120     182
30      139
70      128
160     128
80      118
90      109
190      61
85       48
75       23
45       18
180      17
40        6
150       1
Name: MSSubClass, dtype: int64], ['MSZoning', RL         2267
RM          462
FV          139
RH           26
C (all)      25
Name: MSZoning, dtype: int64], ['Street', Pave    2907
Grvl      12
Name: Street, dtype: int64], ['Alley', Grvl    120
Pave     78
Name: Alley, dtype: int64], ['LotShape', Reg    1859
IR1     968
IR2      76
IR3      16
Name: LotShape, dtype: int64], ['LandContour', Lvl    2622
HLS     120
Bnk     117
Low      60
Name: LandContour, dtype: int64], ['Utilities', AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64], ['LotConfig', Inside     2133
Corner      511
CulDSac     176
FR2          85
FR3          14
Name: LotConfig, dtype: int64], ['LandSlope', Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64], ['Neighborhood', NAmes      443
C

In [14]:
full_data['Functional'] = full_data['Functional'].fillna('Typ') 
full_data['Electrical'] = full_data['Electrical'].fillna("SBrkr") 
full_data['KitchenQual'] = full_data['KitchenQual'].fillna("TA")
full_data['Exterior1st'] = full_data['Exterior1st'].fillna("VinylSd")
full_data['SaleType'] = full_data['SaleType'].fillna('WD')

In [15]:
#fill remaining object columns with None
full_data[obj]=full_data[obj].fillna('None')

# Feature Engineering

In [16]:
print(house_data.groupby(['Street']).SalePrice.agg([len,min,max]))
print(house_data.groupby(['Alley']).SalePrice.agg([len,min,max]))
print(full_data['Utilities'].value_counts())
print(full_data['PoolQC'].value_counts())

         len    min     max
Street                     
Grvl       6  55993  228950
Pave    1454  34900  755000
       len    min     max
Alley                    
Grvl    50  52500  256000
Pave    41  40000  265979
AllPub    2916
None         2
NoSeWa       1
Name: Utilities, dtype: int64
None    2909
Ex         4
Gd         4
Fa         2
Name: PoolQC, dtype: int64


In [17]:
#Dropping columns with very few data and not very usefull.
full_data=full_data.drop(['Utilities','Alley','PoolQC'],axis=1)

In [18]:
full_data['YrBltAndRemod']=full_data['YearBuilt']+full_data['YearRemodAdd']
full_data['TotalSF']=full_data['TotalBsmtSF'] + full_data['1stFlrSF'] + full_data['2ndFlrSF']

full_data['Total_footage'] = (full_data['BsmtFinSF1'] + full_data['BsmtFinSF2'] +
                                 full_data['1stFlrSF'] + full_data['2ndFlrSF'])

full_data['Total_Bath'] = (full_data['FullBath'] + (0.5 * full_data['HalfBath']) +
                               full_data['BsmtFullBath'] + (0.5 * full_data['BsmtHalfBath']))

full_data['Total_porch_sf'] = (full_data['OpenPorchSF'] + full_data['3SsnPorch'] +
                              full_data['EnclosedPorch'] + full_data['ScreenPorch'] +
                              full_data['WoodDeckSF'])

In [19]:
full_data['haspool'] = full_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
full_data['has2ndfloor'] = full_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
full_data['hasgarage'] = full_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
full_data['hasbsmt'] = full_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
full_data['hasfireplace'] = full_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [20]:
# changing categorical values to numerical values - OneHotEncoding
final_data=pd.get_dummies(full_data).reset_index(drop=True)
final_data.shape

(2919, 334)

In [21]:
y_train=house_data['SalePrice']
X_train=final_data.iloc[:len(y_train),:]
X_test=final_data.iloc[len(y_train):,:]

In [22]:
print(X_train)

      LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
0            65.0     8450            7            5       2003          2003   
1            80.0     9600            6            8       1976          1976   
2            68.0    11250            7            5       2001          2002   
3            60.0     9550            7            5       1915          1970   
4            84.0    14260            8            5       2000          2000   
...           ...      ...          ...          ...        ...           ...   
1455         62.0     7917            6            5       1999          2000   
1456         85.0    13175            6            6       1978          1988   
1457         66.0     9042            7            9       1941          2006   
1458         68.0     9717            5            6       1950          1996   
1459         75.0     9937            5            6       1965          1965   

      MasVnrArea  BsmtFinSF

# Modelling

In [23]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kfolds))
    print(rmse.mean())  

In [24]:
model0=make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))
cv_rmse(model0)

31123.3275014952


In [25]:
model1=make_pipeline(RobustScaler(), LassoCV(cv=kfolds))
cv_rmse(model1)

33242.89916649995


In [26]:
model2=ElasticNetCV(cv=kfolds)
cv_rmse(model2)

47722.71614247949


In [27]:
model3=make_pipeline(RobustScaler(),XGBRegressor(learning_rate=0.01,n_estimators=3000))
cv_rmse(model3)

26319.38280536714


In [28]:
model4 = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, random_state =42)
cv_rmse(model4)

24795.2661996215


In [29]:
model5=make_pipeline(RobustScaler(),RandomForestRegressor(n_estimators = 100))
cv_rmse(model5)

27799.543341443776


In [30]:
stack= StackingCVRegressor(regressors=(model0,model1,model3,model4,model5),
                                meta_regressor=model3,use_features_in_secondary=True)

In [31]:
stack.fit(np.array(X_train),np.array(y_train))

StackingCVRegressor(meta_regressor=Pipeline(steps=[('robustscaler',
                                                    RobustScaler()),
                                                   ('xgbregressor',
                                                    XGBRegressor(base_score=None,
                                                                 booster=None,
                                                                 colsample_bylevel=None,
                                                                 colsample_bynode=None,
                                                                 colsample_bytree=None,
                                                                 enable_categorical=False,
                                                                 gamma=None,
                                                                 gpu_id=None,
                                                                 importance_type=None,
                                    

In [32]:
ridge=model0.fit(X_train,y_train)
lasso=model1.fit(X_train,y_train)
xgb=model3.fit(X_train,y_train)
gradient=model4.fit(X_train,y_train)
forest=model5.fit(X_train,y_train)

In [33]:
def blend_models(X):
    return ((0.05 * lasso.predict(X)) + \
            (0.1 * ridge.predict(X)) + \
            (0.15 * forest.predict(X)) + \
            (0.2 * gradient.predict(X)) + \
            (0.15 * xgb.predict(X)) + \
            ((0.35 * stack.predict(np.array(X)))))

In [34]:
y_pred=blend_models(X_test)

In [35]:
y_sub=pd.DataFrame(y_pred)
y_sub.to_csv('sub1.csv')