In [1]:
!ls ../data/house-prices/

processed_train_df.parquet
test.csv
train.csv


In [2]:
import pandas as pd


# Model Building

## Model Training

### Read Data

In [3]:
DATA_PATH = '../data/house-prices/'
DATASET_PATH = DATA_PATH + 'train.csv'
label_col = 'SalePrice'

In [4]:
df_master = pd.read_csv(DATASET_PATH)
df = df_master.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Filter useless columns

In [5]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

### Missing Values Check

In [7]:
df[useful_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1460 non-null   object
 1   KitchenQual   1460 non-null   object
 2   TotRmsAbvGrd  1460 non-null   int64 
 3   WoodDeckSF    1460 non-null   int64 
 4   YrSold        1460 non-null   int64 
 5   1stFlrSF      1460 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.6+ KB


### Check Duplicates

In [8]:
df[useful_features].duplicated(keep='first').sum()

32

In [9]:
#### Removing Duplicates
df = df[~df[useful_features].duplicated(keep='first')]
df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [10]:
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1423,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1424,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1425,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1426,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1427,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


### Dataset Split

In [11]:
X, y = df.drop(columns=[label_col]), df[label_col]

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
X_train = X_train[useful_features]
X_train.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
266,CBlock,TA,6,370,2007,1113
327,CBlock,TA,10,155,2007,1728
348,CBlock,TA,5,48,2008,616
478,PConc,Ex,7,0,2009,916
59,CBlock,TA,4,196,2008,780


In [14]:
X_test = X_test[useful_features]
X_test.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
123,PConc,Gd,5,0,2008,1199
1189,PConc,Gd,8,635,2010,1055
462,CBlock,TA,5,0,2007,1277
351,PConc,Gd,6,0,2007,1200
1009,CBlock,TA,6,0,2007,912


#### Analysis Part (Kept as it is since it was difficult to understand)

In [15]:
df_tmp = df_master[useful_features+[label_col]]
df_tmp[df_tmp.duplicated(keep=False)]

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
145,PConc,Gd,7,0,2006,970,130000
193,PConc,Gd,7,0,2006,970,130000


In [16]:
df_tmp = df_master[useful_features+[label_col]]
df_tmp[df_tmp[useful_features].duplicated(keep=False)]

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
18,PConc,Gd,6,0,2008,1114,159000
76,CBlock,TA,4,0,2008,952,135750
87,PConc,Gd,4,0,2009,612,164500
89,PConc,TA,5,0,2007,990,123600
102,Slab,TA,8,0,2009,1535,118964
126,CBlock,TA,5,0,2007,958,128000
145,PConc,Gd,7,0,2006,970,130000
193,PConc,Gd,7,0,2006,970,130000
194,CBlock,TA,5,0,2008,864,127000
203,PConc,Gd,3,149,2008,848,149000


In [17]:
df_tmp2 = df_tmp[df_tmp[useful_features].duplicated(keep=False)]
df_tmp2[(df_tmp2['Foundation'] == 'PConc') & (df_tmp2['KitchenQual'] == 'Gd') & (df_tmp2['TotRmsAbvGrd'] == 6)]

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
18,PConc,Gd,6,0,2008,1114,159000
282,PConc,Gd,6,172,2009,1314,207500
511,PConc,Gd,6,132,2006,1368,202665
1157,PConc,Gd,6,172,2009,1314,230000
1306,PConc,Gd,6,132,2006,1368,202500
1341,PConc,Gd,6,0,2008,1114,155000


### Scale continuous features of the Train Set

In [18]:
continuous_columns = X_train[useful_features].select_dtypes(include='number').columns
continuous_columns

Index(['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF'], dtype='object')

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train[continuous_columns])

In [69]:
joblib.dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [20]:
scaled_columns_Xtrain = scaler.transform(X_train[continuous_columns])
scaled_columns_Xtrain

array([[-0.37686857,  2.17600714, -0.62349243, -0.1457359 ],
       [ 2.074059  ,  0.46789767, -0.62349243,  1.40975046],
       [-0.98960046, -0.38218472,  0.13775759, -1.40277122],
       ...,
       [ 0.23586332,  0.80951957, -1.38474244,  1.55897598],
       [ 0.23586332, -0.76353009,  1.66025761, -0.92727295],
       [-0.98960046, -0.76353009,  1.66025761, -1.26113344]])

In [21]:
continuous_features_train_df = pd.DataFrame(data=scaled_columns_Xtrain, columns=continuous_columns)
continuous_features_train_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,-0.376869,2.176007,-0.623492,-0.145736
1,2.074059,0.467898,-0.623492,1.40975
2,-0.9896,-0.382185,0.137758,-1.402771
3,0.235863,-0.76353,0.899008,-0.643997
4,-1.602332,0.79363,0.137758,-0.987975


### Categorical features Encoding For Train Set

In [22]:
categorical_columns = df[useful_features].select_dtypes(include='object').columns
categorical_columns_list = categorical_columns.tolist()
categorical_columns_list

['Foundation', 'KitchenQual']

In [24]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

    
encoder.fit(X_train[categorical_columns])
encoded_train_data = encoder.transform(X_train[categorical_columns_list]).toarray()



In [70]:
joblib.dump(encoder, '../models/encoder.joblib')

['../models/encoder.joblib']

In [25]:
labels = encoder.get_feature_names_out(categorical_columns_list)
labels

array(['Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc',
       'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood',
       'KitchenQual_Ex', 'KitchenQual_Fa', 'KitchenQual_Gd',
       'KitchenQual_TA'], dtype=object)

In [26]:
categorical_features_train_df = pd.DataFrame(data = encoded_train_data, columns= labels)
categorical_features_train_df

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
951,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
952,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
953,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
954,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


- Join dataframes

In [27]:
final_train_df = continuous_features_train_df.join(categorical_features_train_df)
final_train_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.376869,2.176007,-0.623492,-0.145736,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.074059,0.467898,-0.623492,1.40975,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.9896,-0.382185,0.137758,-1.402771,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.235863,-0.76353,0.899008,-0.643997,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-1.602332,0.79363,0.137758,-0.987975,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Linear Regression Model Training

In [28]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [29]:
model.fit(final_train_df, y_train)

In [57]:
import joblib
joblib.dump(model, '../models/model.joblib')

['../models/model.joblib']

In [58]:
!ls ../models

model.joblib


## Model evaluation

### Scale continuous features of the Test Set

In [30]:
scaled_columns_Xtest = scaler.transform(X_test[continuous_columns])
scaled_columns_Xtest

array([[-0.98960046, -0.76353009,  0.13775759,  0.07177926],
       [ 0.84859522,  4.28135138,  1.66025761, -0.29243218],
       [-0.98960046, -0.76353009, -0.62349243,  0.26906046],
       ...,
       [-0.98960046, -0.76353009,  1.66025761, -0.77551819],
       [-0.98960046,  0.06271822, -0.62349243, -1.01832581],
       [-0.98960046, -0.76353009,  0.13775759, -0.97785787]])

In [31]:
continuous_features_test_df = pd.DataFrame(data=scaled_columns_Xtest, columns=continuous_columns)
continuous_features_test_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,-0.9896,-0.76353,0.137758,0.071779
1,0.848595,4.281351,1.660258,-0.292432
2,-0.9896,-0.76353,-0.623492,0.26906
3,-0.376869,-0.76353,-0.623492,0.074309
4,-0.376869,-0.76353,-0.623492,-0.654114


### Categorical features Encoding For Test Set

In [32]:
encoded_test_data = encoder.transform(X_test[categorical_columns_list]).toarray()

In [33]:
categorical_features_test_df = pd.DataFrame(data = encoded_test_data, columns= labels)
categorical_features_test_df

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
467,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
468,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
469,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
470,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
final_test_df = continuous_features_test_df.join(categorical_features_test_df)
final_test_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.9896,-0.76353,0.137758,0.071779,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.848595,4.281351,1.660258,-0.292432,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.9896,-0.76353,-0.623492,0.26906,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.376869,-0.76353,-0.623492,0.074309,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.376869,-0.76353,-0.623492,-0.654114,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
y_pred = model.predict(final_test_df)

In [36]:
y_pred[y_pred < 0] = 0

In [37]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [38]:
compute_rmsle(y_test, y_pred)

0.54

# Model Inference

In [39]:
TEST_DATASET_PATH = DATA_PATH + 'test.csv'

In [64]:
df_test_loaded = pd.read_csv(TEST_DATASET_PATH)
df_test_loaded

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [65]:
df_test = df_test_loaded[useful_features]
df_test

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,CBlock,TA,5,140,2010,896
1,CBlock,Gd,6,393,2010,1329
2,PConc,TA,6,212,2010,928
3,PConc,Gd,7,360,2010,926
4,PConc,Gd,5,0,2010,1280
...,...,...,...,...,...,...
1454,CBlock,TA,5,0,2006,546
1455,CBlock,TA,6,0,2006,546
1456,CBlock,TA,7,474,2006,1224
1457,PConc,TA,6,80,2006,970


In [42]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1459 non-null   object
 1   KitchenQual   1458 non-null   object
 2   TotRmsAbvGrd  1459 non-null   int64 
 3   WoodDeckSF    1459 non-null   int64 
 4   YrSold        1459 non-null   int64 
 5   1stFlrSF      1459 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.5+ KB


In [87]:
df_test[useful_features].duplicated(keep='first').sum()

28

In [88]:
df_test = df_test[~df_test[useful_features].duplicated(keep='first')]
df.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1423,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1424,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1425,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1426,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1427,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [89]:
df_test = df_test.dropna()

In [90]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1430 entries, 0 to 1458
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1430 non-null   object
 1   KitchenQual   1430 non-null   object
 2   TotRmsAbvGrd  1430 non-null   int64 
 3   WoodDeckSF    1430 non-null   int64 
 4   YrSold        1430 non-null   int64 
 5   1stFlrSF      1430 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 78.2+ KB


In [91]:
loaded_scaler = joblib.load('../models/scaler.joblib')
loaded_encoder = joblib.load('../models/encoder.joblib')

In [92]:
scaled_columns_df_test = loaded_scaler.transform(df_test[continuous_columns])
scaled_columns_df_test

array([[-0.98960046,  0.34872724,  1.66025761, -0.69458231],
       [-0.37686857,  2.35873514,  1.66025761,  0.40058126],
       [-0.37686857,  0.9207453 ,  1.66025761, -0.61364643],
       ...,
       [ 0.23586332,  3.00225545, -1.38474244,  0.13501042],
       [-0.37686857, -0.12795447, -1.38474244, -0.5074181 ],
       [ 1.46132711,  0.74596201, -1.38474244, -0.4416577 ]])

In [93]:
df_test_continous_fts = pd.DataFrame(data=scaled_columns_df_test, columns=continuous_columns)
df_test_continous_fts.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,-0.9896,0.348727,1.660258,-0.694582
1,-0.376869,2.358735,1.660258,0.400581
2,-0.376869,0.920745,1.660258,-0.613646
3,0.235863,2.09656,1.660258,-0.618705
4,-0.9896,-0.76353,1.660258,0.276648


In [94]:
test_df_categorical_fts = loaded_encoder.transform(df_test[categorical_columns_list]).toarray()

In [95]:
df_test_categorical = pd.DataFrame(data = test_df_categorical_fts, columns= labels)
df_test_categorical

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1425,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1426,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1427,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1428,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [96]:
dataframe_test = df_test_continous_fts.join(df_test_categorical)
dataframe_test.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.9896,0.348727,1.660258,-0.694582,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.376869,2.358735,1.660258,0.400581,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.376869,0.920745,1.660258,-0.613646,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.235863,2.09656,1.660258,-0.618705,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.9896,-0.76353,1.660258,0.276648,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Loaded model from joblib

In [97]:
loaded_model = joblib.load('../models/model.joblib')

In [98]:
y_pred_inference = loaded_model.predict(dataframe_test)

In [99]:
y_pred_inference

array([116745.48155931, 210038.65668825, 164615.56463793, ...,
       192045.575932  , 159962.4757219 , 209724.41166686])

In [100]:
y_pred_inference.shape

(1430,)

## Testing dataframe equality

In [74]:
!ls ../data/house-prices/

processed_test_df.parquet
processed_train_df.parquet
test.csv
train.csv


In [73]:
#dataframe_test.to_parquet(DATA_PATH + 'processed_test_df.parquet', index=False)

In [101]:
processed_test_df = pd.read_parquet(DATA_PATH + 'processed_test_df.parquet')
processed_test_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.9896,0.348727,1.660258,-0.694582,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.376869,2.358735,1.660258,0.400581,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.376869,0.920745,1.660258,-0.613646,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.235863,2.09656,1.660258,-0.618705,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.9896,-0.76353,1.660258,0.276648,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [102]:
pd.testing.assert_frame_equal(processed_test_df, dataframe_test)