Importing Required Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.preprocessing import OneHotEncoder,StandardScaler

Loading Data

In [None]:
# Read the training and test splits from the CSV files next to the notebook.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Peek at a few random rows. The fixed seed keeps the displayed sample
# identical across "Restart & Run All" runs.
train_data.sample(4, random_state=42)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
599,600,160,RM,24.0,1950,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blueste,Norm,Norm,Twnhs,2Story,6,6,1980,1980,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,Gd,CBlock,Gd,TA,No,LwQ,81,GLQ,612,23,716,GasA,TA,Y,SBrkr,716,840,0,1556,1,0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1980.0,Fin,2,452,TA,TA,Y,161,0,0,0,0,0,,GdPrv,,0,7,2008,COD,Normal,151000
830,831,20,RL,80.0,11900,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1957,1957,Gable,CompShg,HdBoard,HdBoard,BrkFace,387.0,TA,TA,CBlock,TA,TA,No,Rec,1040,Unf,0,352,1392,GasA,TA,Y,FuseA,1392,0,0,1392,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1957.0,RFn,2,458,TA,TA,Y,0,0,0,0,192,0,,,,0,6,2008,WD,Normal,166000
1003,1004,90,RL,,11500,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,Feedr,RRAn,Duplex,1Story,5,6,1976,1976,Gable,CompShg,VinylSd,VinylSd,BrkFace,164.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,1680,1680,GasA,Fa,Y,SBrkr,1680,0,0,1680,0,0,2,0,4,2,TA,8,Typ,0,,Detchd,1976.0,Unf,2,528,TA,TA,Y,0,0,0,0,0,0,,,,0,6,2007,WD,Normal,136905
1322,1323,60,RL,107.0,10186,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NoRidge,Norm,Norm,1Fam,2Story,7,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,GLQ,674,Unf,0,76,750,GasA,Ex,Y,SBrkr,1061,862,0,1923,1,0,2,1,3,1,Gd,8,Typ,1,TA,Attchd,1992.0,RFn,2,564,TA,TA,Y,240,39,0,0,0,0,,,,0,6,2010,WD,Normal,190000


In [None]:
# Per-column count of missing values, restricted to the columns that actually
# have gaps and sorted so the most incomplete columns come first. Printing the
# full Series gets truncated with "...", which hides most of the columns.
missing_counts = train_data.isnull().sum()
missing_counts[missing_counts > 0].sort_values(ascending=False)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [None]:
# Distinct values observed in the SaleCondition column.
sale_conditions = train_data['SaleCondition']
sale_conditions.unique()

array(['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],
      dtype=object)

In [None]:
# Feature matrix: everything except the target. 'Id' is only a row identifier,
# so it is dropped as well to keep the model from fitting on it.
X_train = train_data.drop(columns=['Id', 'SalePrice'])
# Target on the log1p scale (prices are floored at 1 before the transform);
# predictions made from this target need expm1 to get back to prices.
y_train = np.log1p(train_data['SalePrice'].clip(lower=1))

In [None]:
# Names of the object-dtype feature columns (treated as categorical).
cat_cols = X_train.select_dtypes(include=['object']).columns

In [None]:
# Names of all numeric feature columns. 'number' matches every numeric dtype,
# so no numeric column can slip past the numeric pipeline into the
# ColumnTransformer remainder without imputation and scaling.
num_cols = X_train.select_dtypes(include='number').columns

In [None]:
# Numeric preprocessing: fill gaps with the column mean, then standardise.
num_imputer = SimpleImputer(strategy='mean')
num_scaler = StandardScaler()
num_pipe = Pipeline(steps=[
    ('imputer', num_imputer),
    ('sclaor', num_scaler),  # step name kept exactly as originally spelled
])

In [None]:
# Categorical preprocessing: fill gaps with the most frequent level, then
# one-hot encode.
#
# No category is dropped: with handle_unknown='ignore' an unseen level is
# encoded as all zeros, which would be indistinguishable from a dropped
# reference level (and older scikit-learn releases reject the combination
# of `drop` with handle_unknown='ignore').
# NOTE(review): for columns that are mostly empty (e.g. Alley, PoolQC in the
# sample above) 'most_frequent' fills in a rare level -- consider a constant
# placeholder instead; confirm against the data description.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each column group through its own preprocessing pipeline; any column
# in neither group is passed through unchanged.
column_steps = [
    ('trf1', cat_pipe, cat_cols),
    ('trf2', num_pipe, num_cols),
]
transformer = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
# End-to-end estimator: preprocessing followed by a random forest.
# random_state is fixed so that refitting on a fresh kernel reproduces the
# same forest and therefore the same predictions.
pipe = Pipeline([
    ('transformer', transformer),
    ('model', RandomForestRegressor(random_state=42)),
])

In [None]:
# Fit the preprocessing steps and the model on the training features/target.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Apply the already-fitted preprocessing step to both splits.
fitted_transformer = pipe.named_steps['transformer']
X_test_transformed = fitted_transformer.transform(test_data)
# Use the feature frame the pipeline was fitted on, not train_data, which
# still carries the SalePrice target column.
X_train_transformed = fitted_transformer.transform(X_train)


In [None]:
# Predict with the fitted model on the preprocessed test matrix, then undo
# the log1p scaling with expm1.
fitted_model = pipe.named_steps['model']
log_price_pred = fitted_model.predict(X_test_transformed)
y_pred = np.expm1(log_price_pred)

In [None]:
# Build the submission file. Ids are taken from test_data -- the frame the
# predictions were computed from -- instead of re-reading test.csv, so the
# Id column is guaranteed to line up row-for-row with y_pred.
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})
submission.to_csv('Final_Submission.csv', index=False)

In [None]:
# Sanity check: first ten predictions, then the smallest and largest one.
first_ten = y_pred[:10]
print(first_ten)
lowest, highest = y_pred.min(), y_pred.max()
print(lowest, highest)

[126880.28758628 150381.4451799  179222.16637029 180072.99187168
 191523.33278266 183315.02969355 164694.73026117 174922.2500407
 181495.74931059 119253.12181008]
53514.757720877984 499559.56659328664
