In [1]:
!ls ../../data/house-prices/

continuous_df.parquet processed_df.csv      test.csv
processed_df..parquet processed_df.parquet  train.csv


In [2]:
import pandas as pd

# Read data

In [3]:
DATA_PATH = '../../data/house-prices/'
DATASET_PATH = DATA_PATH + 'train.csv'
label_col = 'SalePrice'

In [4]:
df_master = pd.read_csv(DATASET_PATH)
df = df_master.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
df = df[useful_features + [label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


### Scale continuous feautres

In [6]:
CONTINUOUS_FEATURE_COLUMNS = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
CONTINUOUS_FEATURE_COLUMNS

['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df[CONTINUOUS_FEATURE_COLUMNS])
scaled_columns = scaler.transform(df[CONTINUOUS_FEATURE_COLUMNS])
scaled_columns

array([[ 0.91220977, -0.75217584,  0.13877749, -0.79343379],
       [-0.31868327,  1.62619479, -0.61443862,  0.25714043],
       [-0.31868327, -0.75217584,  0.13877749, -0.62782603],
       ...,
       [ 1.52765629, -0.75217584,  1.64520971,  0.06565646],
       [-0.93412978,  2.16891024,  1.64520971, -0.21898188],
       [-0.31868327,  5.12192075,  0.13877749,  0.2416147 ]])

In [8]:
continuous_features_df = pd.DataFrame(data=scaled_columns, columns=CONTINUOUS_FEATURE_COLUMNS, index=df.index)
continuous_features_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,0.91221,-0.752176,0.138777,-0.793434
1,-0.318683,1.626195,-0.614439,0.25714
2,-0.318683,-0.752176,0.138777,-0.627826
3,0.296763,-0.752176,-1.367655,-0.521734
4,1.527656,0.780197,0.138777,-0.045611


In [9]:
continuous_features_df.to_parquet(DATA_PATH + 'continuous_df.parquet', index=False)

### Categorical features

In [10]:
CATEGORICAL_FEATURE_COLUMNS = ['Foundation', 'KitchenQual']
CATEGORICAL_FEATURE_COLUMNS

['Foundation', 'KitchenQual']

In [11]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(handle_unknown='ignore', dtype='int')
one_hot_encoder.fit(df[CATEGORICAL_FEATURE_COLUMNS])

In [12]:
categorical_features_sparse = one_hot_encoder.transform(df[CATEGORICAL_FEATURE_COLUMNS])
categorical_features_sparse

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2920 stored elements and shape (1460, 10)>

In [13]:
categorical_features_df = pd.DataFrame.sparse.from_spmatrix(data=categorical_features_sparse,
                                                            columns=one_hot_encoder.get_feature_names_out(),
                                                            index=df.index)
categorical_features_df

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0,0,1,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1455,0,0,1,0,0,0,0,0,0,1
1456,0,1,0,0,0,0,0,0,0,1
1457,0,0,0,0,1,0,0,0,1,0
1458,0,1,0,0,0,0,0,0,1,0


- Join dataframes

In [14]:
final_df = continuous_features_df.join(categorical_features_df).join(df[label_col])
final_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,0,0,1,0,0,0,0,0,1,0,208500
1,-0.318683,1.626195,-0.614439,0.25714,0,1,0,0,0,0,0,0,0,1,181500
2,-0.318683,-0.752176,0.138777,-0.627826,0,0,1,0,0,0,0,0,1,0,223500
3,0.296763,-0.752176,-1.367655,-0.521734,1,0,0,0,0,0,0,0,1,0,140000
4,1.527656,0.780197,0.138777,-0.045611,0,0,1,0,0,0,0,0,1,0,250000


# Model training

- Split dataset

In [15]:
X, y = final_df.drop(columns=[label_col]), final_df[label_col]

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

- Train model

In [17]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [18]:
model.fit(X_train, y_train)



# Model evaluation

In [19]:
y_pred = model.predict(X_test)



In [20]:
y_pred[y_pred < 0] = 0

In [21]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [22]:
compute_rmsle(y_test, y_pred)

np.float64(0.22)

## Testing dataframe equality

In [23]:
!ls ../../data/house-prices/

continuous_df.parquet  test.csv			 train.csv
processed_df.parquet   test.csv:Zone.Identifier  train.csv:Zone.Identifier


In [24]:
processed_df = pd.read_parquet(DATA_PATH + 'processed_df.parquet')
processed_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,False,False,True,False,False,False,False,False,True,False,208500
1,-0.318683,1.626195,-0.614439,0.25714,False,True,False,False,False,False,False,False,False,True,181500
2,-0.318683,-0.752176,0.138777,-0.627826,False,False,True,False,False,False,False,False,True,False,223500
3,0.296763,-0.752176,-1.367655,-0.521734,True,False,False,False,False,False,False,False,True,False,140000
4,1.527656,0.780197,0.138777,-0.045611,False,False,True,False,False,False,False,False,True,False,250000


In [25]:
pd.testing.assert_frame_equal(processed_df, final_df)

AssertionError: Attributes of DataFrame.iloc[:, 4] (column name="Foundation_BrkTil") are different

Attribute "dtype" are different
[left]:  bool
[right]: Sparse[int64, 0]

In [None]:
pd.testing.assert_frame_equal(processed_df, final_df.drop(columns=[label_col]))