In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings 
warnings.filterwarnings("ignore")

In [22]:
# Load the competition train and test CSVs, both indexed by their 'Id' column,
# then stack them into one frame so the cleaning/encoding cells below are
# applied to both splits at once.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# sort=True sorts the non-concatenation (column) axis of the combined frame.
data = pd.concat([train_data, test_data], sort=True)

In [23]:
# Targeted imputation of missing values on the combined train+test frame.
#
# Every fill is assigned back to `data` instead of calling
# `data[col].fillna(..., inplace=True)`: an in-place call on a column selection
# is a chained assignment, which is not guaranteed to write through to `data`
# (and never does under pandas Copy-on-Write).

# Categorical columns whose missing entries are replaced by the literal
# category 'None'.
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2']
data[cols_fillna] = data[cols_fillna].fillna('None')

# Where the garage build year is missing, fall back to the house's YearBuilt.
garage_year_missing = data['GarageYrBlt'].isnull()
data.loc[garage_year_missing, 'GarageYrBlt'] = data.loc[garage_year_missing, 'YearBuilt']

# Numeric columns whose missing entries are filled with 0.
cols_fill_zero = ['MasVnrArea','BsmtFullBath','BsmtHalfBath',
                  'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
                  'GarageArea','GarageCars']
data[cols_fill_zero] = data[cols_fill_zero].fillna(0)

# Series.mode() returns a Series labelled 0..k. Passing that Series to fillna
# aligns it on index labels instead of broadcasting one value, so the most
# frequent value must be extracted as a scalar with [0].
data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mode()[0])
    

In [24]:
# Fill whatever is still missing (in any column except the target) with that
# column's most frequent value.
# SalePrice is excluded from the scan, so its missing entries are left as-is.
cols_with_na = data.drop(columns='SalePrice').isnull().sum()
cols_with_na = cols_with_na[cols_with_na > 0]

for col in cols_with_na.index:
    # Assign back rather than fillna(inplace=True) on a column selection:
    # the chained in-place form is not guaranteed to modify `data`
    # (it is a no-op under pandas Copy-on-Write).
    data[col] = data[col].fillna(data[col].mode()[0])

# Post-condition: no feature column may contain a missing value from here on.
assert not data.drop(columns='SalePrice').isnull().any().any(), \
    "unfilled missing values remain in feature columns"

In [25]:
# Ordinal encoding: replace the string levels of ordered categorical columns
# with integer ranks, using the per-column mappings below.

# Columns sharing the Ex/Gd/TA/Fa/Po scale, with 'None' encoded as 0.
cols_ExGd = ['ExterQual','ExterCond','BsmtQual','BsmtCond',
             'HeatingQC','KitchenQual','FireplaceQu','GarageQual',
             'GarageCond','PoolQC']

dict_ExGd = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'None':0}

# One mapping per column; columns with their own scale are listed explicitly.
ordinal_maps = {col: dict_ExGd for col in cols_ExGd}
ordinal_maps.update({
    'BsmtExposure': {'Gd':4,'Av':3,'Mn':2,'No':1,'None':0},
    'CentralAir':   {'Y':1,'N':0},
    'Functional':   {'Typ':7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,'Maj2':2,'Sev':1,'Sal':0},
    'GarageFinish': {'Fin':3,'RFn':2,'Unf':1,'None':0},
    'LotShape':     {'Reg':3,'IR1':2,'IR2':1,'IR3':0},
    'Utilities':    {'AllPub':3,'NoSewr':2,'NoSeWa':1,'ELO':0},
    'LandSlope':    {'Gtl':2,'Mod':1,'Sev':0},
})

for col, mapping in ordinal_maps.items():
    # Assign back instead of replace(inplace=True) on a column selection:
    # the chained in-place form is not guaranteed to modify `data`.
    # infer_objects() makes the conversion to a numeric dtype explicit rather
    # than relying on replace()'s deprecated implicit downcasting; values not
    # present in the mapping are left untouched, as before.
    data[col] = data[col].replace(mapping).infer_objects()

In [26]:
# Reduce skew: apply log1p to every float column whose absolute skewness
# exceeds `skew_limit`, leaving the SalePrice target untouched.
skew_limit = 0.75

# Only columns stored as float are considered.
mask = data.dtypes == float
float_cols = data.columns[mask]
skew_vals = data[float_cols].skew()

# One-column frame ('Skew'), most positively skewed first, restricted to the
# columns beyond the threshold.
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0: 'Skew'}))
skew_cols = skew_cols[skew_cols['Skew'].abs() > skew_limit]

for col in skew_cols.index:
    if col != "SalePrice":
        data[col] = np.log1p(data[col])

In [27]:
from sklearn.linear_model import BayesianRidge,LinearRegression
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer

# Fixed seed for every stochastic estimator, so that a fresh
# "Restart & Run All" reproduces the same predictions.
SEED = 42

# One-hot encode the remaining categorical columns of the combined frame, then
# split it back into the original train/test rows by their Id index.
data = pd.get_dummies(data)
id_train = train_data.index
id_test = test_data.index

X_train = data.loc[id_train].drop('SalePrice',axis=1)
y_train = data.loc[id_train].SalePrice
X_test = data.loc[id_test].drop('SalePrice',axis=1)

#Creating the Model
# Base learners whose predictions are combined by the stacking final estimator.
base_estimators = [
    ("br", BayesianRidge()),
    ("kn", KNeighborsRegressor()),
    ("rf", RandomForestRegressor(random_state=SEED)),
    ("dt", DecisionTreeRegressor(random_state=SEED)),
    ("gpr", GaussianProcessRegressor()),
]

# PowerTransformer is applied to all features before the stacked ensemble.
model = make_pipeline(
    PowerTransformer(),
    StackingRegressor(
        estimators=base_estimators,
        final_estimator=RandomForestRegressor(random_state=SEED),
    ),
)
model.fit(X_train,y_train)
predictions = model.predict(X_test)

In [28]:
# Pair each test Id with its predicted SalePrice and write the two-column
# submission file (no index column in the CSV).
submissions = (
    pd.Series(predictions, index=id_test, name='SalePrice')
    .rename_axis('Id')
    .reset_index()
)
submissions.to_csv('submissionY1.csv', index=False)