In [1]:
# List the files available in the house-prices data directory.
!ls ../../data/house-prices/

continuous_df.parquet processed_df.csv      test.csv
processed_df..parquet processed_df.parquet  train.csv


In [4]:
import pandas as pd

# Read data

In [1]:
# Location of the dataset on disk and the name of the regression target column.
DATA_PATH = '../../data/house-prices/'
DATASET_PATH = f'{DATA_PATH}train.csv'
label_col = 'SalePrice'

In [2]:
# Feature groups, split by the preprocessing each one receives.
CATEGORICAL_FEATURE_COLUMNS = ['Foundation', 'KitchenQual']
CONTINUOUS_FEATURE_COLUMNS = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
# Every column fed to the model: categorical columns first, then continuous ones.
useful_features = CATEGORICAL_FEATURE_COLUMNS + CONTINUOUS_FEATURE_COLUMNS

In [5]:
# Load the raw training CSV; df_master is kept as the untouched reference frame.
df_master = pd.read_csv(DATASET_PATH)
# Work on a copy so that later column selection on df leaves df_master intact.
df = df_master.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Narrow the working frame to the selected features plus the target column.
df = df.loc[:, useful_features + [label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


- Split dataset

In [7]:
# Separate the target column from the feature matrix.
y = df[label_col]
X = df.drop(columns=[label_col])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Drop the working frame now that X/y and the train/test split exist;
# df_master still holds the full raw data.
del df

# Training

## Preprocessing

### Scale continuous features

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and standardise that split in one step.
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(X_train[CONTINUOUS_FEATURE_COLUMNS])
scaled_columns

array([[-0.3305751 ,  0.17281309,  1.65360578, -0.29259108],
       [-0.3305751 , -0.74014413, -0.60614241, -0.12299393],
       [ 0.28759402,  0.84231506, -1.35939181,  1.15155436],
       ...,
       [-0.94874421, -0.74014413, -1.35939181, -0.78082531],
       [ 0.28759402, -0.74014413, -0.60614241, -0.65748193],
       [ 0.28759402,  0.34779656,  0.90035639,  0.99480548]])

In [11]:
# Inspect the unscaled training features, to compare against the scaled array above.
X_train

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
615,PConc,Gd,6,120,2010,1054
613,PConc,TA,6,0,2007,1120
1303,PConc,Gd,7,208,2006,1616
486,CBlock,TA,6,0,2007,1073
561,CBlock,TA,6,240,2006,1389
...,...,...,...,...,...,...
1095,PConc,Gd,6,0,2007,1314
1130,BrkTil,Gd,7,431,2009,1328
1294,CBlock,TA,5,0,2006,864
860,BrkTil,Gd,7,0,2007,912


In [12]:
# Wrap the scaled array in a DataFrame that carries X_train's row labels,
# so it can later be joined with the encoded categorical columns.
X_train_continuous = pd.DataFrame(
    scaled_columns,
    index=X_train.index,
    columns=CONTINUOUS_FEATURE_COLUMNS,
)
X_train_continuous.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
615,-0.330575,0.172813,1.653606,-0.292591
613,-0.330575,-0.740144,-0.606142,-0.122994
1303,0.287594,0.842315,-1.359392,1.151554
486,-0.330575,-0.740144,-0.606142,-0.243768
561,-0.330575,1.08577,-1.359392,0.568243


### Categorical features

In [13]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': per scikit-learn, categories not seen during fit are
# encoded as all zeros at transform time instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', dtype='int')
# Learn the category vocabulary from the training split only.
one_hot_encoder.fit(X_train[CATEGORICAL_FEATURE_COLUMNS])

In [14]:
# One-hot encode the training categorical columns; the result is a sparse matrix.
X_train_categorical_sparse = one_hot_encoder.transform(X_train[CATEGORICAL_FEATURE_COLUMNS])
X_train_categorical_sparse

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1956 stored elements and shape (978, 10)>

In [15]:
# Turn the sparse one-hot matrix into a DataFrame labelled with the encoder's
# feature names and aligned to X_train's row labels.
X_train_categorical = pd.DataFrame.sparse.from_spmatrix(
    X_train_categorical_sparse,
    index=X_train.index,
    columns=one_hot_encoder.get_feature_names_out(),
)
X_train_categorical

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
615,0,0,1,0,0,0,0,0,1,0
613,0,0,1,0,0,0,0,0,0,1
1303,0,0,1,0,0,0,0,0,1,0
486,0,1,0,0,0,0,0,0,0,1
561,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1095,0,0,1,0,0,0,0,0,1,0
1130,1,0,0,0,0,0,0,0,1,0
1294,0,1,0,0,0,0,0,0,0,1
860,1,0,0,0,0,0,0,0,1,0


- Join dataframes

In [16]:
# Both frames share X_train's index, so an index-aligned left join puts the
# scaled continuous and one-hot columns side by side.
X_train_final = X_train_continuous.join(X_train_categorical, how='left')
X_train_final.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
615,-0.330575,0.172813,1.653606,-0.292591,0,0,1,0,0,0,0,0,1,0
613,-0.330575,-0.740144,-0.606142,-0.122994,0,0,1,0,0,0,0,0,0,1
1303,0.287594,0.842315,-1.359392,1.151554,0,0,1,0,0,0,0,0,1,0
486,-0.330575,-0.740144,-0.606142,-0.243768,0,1,0,0,0,0,0,0,0,1
561,-0.330575,1.08577,-1.359392,0.568243,0,1,0,0,0,0,0,0,0,1


In [17]:
# Sanity check: count missing values per selected feature in the full raw dataset.
df_master[useful_features].isna().sum()

Foundation      0
KitchenQual     0
TotRmsAbvGrd    0
WoodDeckSF      0
YrSold          0
1stFlrSF        0
dtype: int64

In [18]:
# Same missing-value check, restricted to the training split.
X_train.isna().sum()

Foundation      0
KitchenQual     0
TotRmsAbvGrd    0
WoodDeckSF      0
YrSold          0
1stFlrSF        0
dtype: int64

# Model training

In [19]:
# Compare the raw training rows with their scaled / one-hot-encoded counterparts.
display(X_train.head())
X_train_final.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
615,PConc,Gd,6,120,2010,1054
613,PConc,TA,6,0,2007,1120
1303,PConc,Gd,7,208,2006,1616
486,CBlock,TA,6,0,2007,1073
561,CBlock,TA,6,240,2006,1389


Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
615,-0.330575,0.172813,1.653606,-0.292591,0,0,1,0,0,0,0,0,1,0
613,-0.330575,-0.740144,-0.606142,-0.122994,0,0,1,0,0,0,0,0,0,1
1303,0.287594,0.842315,-1.359392,1.151554,0,0,1,0,0,0,0,0,1,0
486,-0.330575,-0.740144,-0.606142,-0.243768,0,1,0,0,0,0,0,0,0,1
561,-0.330575,1.08577,-1.359392,0.568243,0,1,0,0,0,0,0,0,0,1


In [20]:
# NOTE(review): from here on X_train refers to the preprocessed frame, so the raw
# categorical columns are no longer reachable through this name. Re-running the
# earlier encoding cells requires redoing the train/test split first.
X_train = X_train_final

- Train model

In [21]:
from sklearn.linear_model import LinearRegression

# Plain linear regression with scikit-learn's default settings.
model = LinearRegression()

In [22]:
# Fit on the preprocessed training matrix by its own name, so this cell does not
# depend on X_train having been rebound to it beforehand.
model.fit(X_train_final, y_train)



# Model evaluation

In [23]:
# Peek at the raw (not yet preprocessed) held-out features.
X_test.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
892,CBlock,TA,6,192,2006,1068
1105,PConc,Gd,9,186,2010,1500
413,CBlock,TA,5,0,2010,1028
522,CBlock,TA,7,0,2006,1004
1036,PConc,Ex,6,228,2009,1620


## Preprocessing

In [24]:
# Reuse the scaler fitted on the training split (transform only, no refit),
# so the test features are scaled with the training statistics.
X_test_continuous = scaler.transform(X_test[CONTINUOUS_FEATURE_COLUMNS])
X_test_continuous

array([[-0.3305751 ,  0.72058743, -1.35939181, -0.25661593],
       [ 1.52393224,  0.67493957,  1.65360578,  0.85347452],
       [-0.94874421, -0.74014413,  1.65360578, -0.35940208],
       ...,
       [ 0.28759402,  2.02915944,  0.14710699,  0.17508591],
       [-0.94874421, -0.74014413,  0.90035639, -0.78082531],
       [ 2.14210136,  1.32922557,  1.65360578,  1.36483563]])

In [25]:
# Rewrap the scaled array as a DataFrame carrying X_test's row labels.
X_test_continuous = pd.DataFrame(
    X_test_continuous,
    index=X_test.index,
    columns=CONTINUOUS_FEATURE_COLUMNS,
)
X_test_continuous.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
892,-0.330575,0.720587,-1.359392,-0.256616
1105,1.523932,0.67494,1.653606,0.853475
413,-0.948744,-0.740144,1.653606,-0.359402
522,0.287594,-0.740144,-1.359392,-0.421074
1036,-0.330575,0.994475,0.900356,1.161833


In [26]:
# Encode the test categorical columns with the encoder fitted on the training split.
X_test_categorical_sparse = one_hot_encoder.transform(X_test[CATEGORICAL_FEATURE_COLUMNS])
X_test_categorical_sparse

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 964 stored elements and shape (482, 10)>

In [27]:
# Turn the sparse one-hot matrix into a DataFrame labelled with the encoder's
# feature names and aligned to X_test's row labels.
X_test_categorical = pd.DataFrame.sparse.from_spmatrix(
    X_test_categorical_sparse,
    index=X_test.index,
    columns=one_hot_encoder.get_feature_names_out(),
)
X_test_categorical.head()

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
892,0,1,0,0,0,0,0,0,0,1
1105,0,0,1,0,0,0,0,0,1,0
413,0,1,0,0,0,0,0,0,0,1
522,0,1,0,0,0,0,0,0,0,1
1036,0,0,1,0,0,0,1,0,0,0


In [28]:
# Both frames share X_test's index, so an index-aligned left join puts the
# scaled continuous and one-hot columns side by side.
X_test_final = X_test_continuous.join(X_test_categorical, how='left')
X_test_final.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
892,-0.330575,0.720587,-1.359392,-0.256616,0,1,0,0,0,0,0,0,0,1
1105,1.523932,0.67494,1.653606,0.853475,0,0,1,0,0,0,0,0,1,0
413,-0.948744,-0.740144,1.653606,-0.359402,0,1,0,0,0,0,0,0,0,1
522,0.287594,-0.740144,-1.359392,-0.421074,0,1,0,0,0,0,0,0,0,1
1036,-0.330575,0.994475,0.900356,1.161833,0,0,1,0,0,0,1,0,0,0


## Prediction

In [29]:
# NOTE(review): from here on X_test refers to the preprocessed frame, so the raw
# test columns are no longer reachable through this name. Re-running the test
# preprocessing cells requires redoing the train/test split first.
X_test = X_test_final

In [30]:
# Predict from the preprocessed test matrix by its own name, rather than relying
# on X_test having been rebound to it.
y_pred = model.predict(X_test_final)



In [31]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, same length as ``y_test``.
        precision: Number of decimal places to round the score to.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``, rounded
        to ``precision`` decimals.

    Note:
        scikit-learn's ``mean_squared_log_error`` rejects negative values, so
        negative predictions will make this function raise.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

In [32]:
# Score the held-out predictions; the result is rounded to the default 2 decimals.
compute_rmsle(y_test, y_pred)

np.float64(0.22)