In [3]:
!ls ../data/house-prices/

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [30]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_log_error


# Model Building

## Model Training

### Read Data

In [89]:
DATA_PATH = '../data/house-prices/'
DATASET_PATH = DATA_PATH + 'train.csv'
label_col = 'SalePrice'

In [90]:
df = pd.read_csv(DATASET_PATH)

### Filter useless columns

In [91]:
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

### Missing Values Check

In [9]:
df[useful_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1460 non-null   object
 1   KitchenQual   1460 non-null   object
 2   TotRmsAbvGrd  1460 non-null   int64 
 3   WoodDeckSF    1460 non-null   int64 
 4   YrSold        1460 non-null   int64 
 5   1stFlrSF      1460 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.6+ KB


In [92]:
loaded_scaler = joblib.load('../models/scaler.joblib')
loaded_encoder = joblib.load('../models/encoder.joblib')
loaded_model = joblib.load('../models/model.joblib')

### Drop Duplicates

In [93]:
#### Duplicates Drop function

def drop_duplicates(data: pd.DataFrame) -> pd.DataFrame:
    data = data[~data[useful_features].duplicated(keep='first')]
    data = data.reset_index(drop=True)
    return data


### Dataset Split

In [94]:
#### Dataset Split function

def dataset_train_test_split(
    data: pd.DataFrame,
    test_size: float = 0.33,
    random_state: int = 42) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    features, target = data.drop(columns=[label_col]), data[label_col]
    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=test_size, random_state=random_state)
    return x_train[useful_features], x_test[useful_features],y_train, y_test 
    

### Preprocessing function steps

In [95]:
#### Standard scaling function

def scaler_func(data: pd.DataFrame)-> pd.DataFrame:
    continuous_columns = df[useful_features].select_dtypes(include='number').columns
    scaled_columns = loaded_scaler.transform(data[continuous_columns])
    continuous_features_df = pd.DataFrame(data=scaled_columns, columns=continuous_columns)
    return continuous_features_df


In [96]:
#### One hot encoding function

def encoder_func(data: pd.DataFrame)-> pd.DataFrame:
    categorical_columns = df[useful_features].select_dtypes(include='object').columns
    categorical_columns_list = categorical_columns.tolist()
    labels = loaded_encoder.get_feature_names_out(categorical_columns_list)
    encoded_data = loaded_encoder.transform(data[categorical_columns_list]).toarray()
    categorical_features_df = pd.DataFrame(data=encoded_data, columns=labels)
    return categorical_features_df


In [97]:
#### Function to join continuous scaled and categorical scaled datasets

def join_df(df1: pd.DataFrame, df2: pd.DataFrame)->pd.DataFrame:
    final_df = df1.join(df2)
    return final_df

In [98]:
#### Preprocessing function which scales, encodes and rejoins the dataset

def preprocessor_func(data: pd.DataFrame)-> pd.DataFrame:
    continuous_features_df = scaler_func(data)
    categorical_features_df = encoder_func(data)
    final_df = join_df(continuous_features_df, categorical_features_df)
    return final_df    



## Model evaluation

In [99]:
#### model evaluation function

def compute_rmse(
        y_test: np.ndarray, 
        y_pred: np.ndarray, 
        precision: int = 2
        ) -> float:
    rmse = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmse, precision)


In [102]:
#### Function for the model training part

def build_model(data: pd.DataFrame):
    df = drop_duplicates(data)
    X_train, X_test,y_train, y_test = dataset_train_test_split(df)
    final_train_df = preprocessor_func(X_train)
    final_test_df = preprocessor_func(X_test)
    y_pred = loaded_model.predict(final_test_df)
    y_pred[y_pred < 0] = 0
    rmse = compute_rmse(y_test, y_pred)
    return {'rmse': rmse}


In [103]:
rmse_score = build_model(df)
rmse_score

{'rmse': 0.54}

# Model Inference

In [49]:
TEST_DATASET_PATH = DATA_PATH + 'test.csv'

In [50]:
df_test_loaded = pd.read_csv(TEST_DATASET_PATH)
df_test_loaded

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [83]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    df_test = input_data[useful_features]
    df_test = drop_duplicates(df_test)
    df_test = df_test.dropna()
    final_df = preprocessor_func(df_test)
    return loaded_model.predict(final_df)


In [86]:
y_final = make_predictions(df_test_loaded)
y_final

array([116745.48155931, 210038.65668825, 164615.56463793, ...,
       192045.575932  , 159962.4757219 , 209724.41166686])

## Testing dataframe equality

In [74]:
!ls ../data/house-prices/

processed_test_df.parquet
processed_train_df.parquet
test.csv
train.csv


In [73]:
#dataframe_test.to_parquet(DATA_PATH + 'processed_test_df.parquet', index=False)

In [59]:
processed_test_df = pd.read_parquet(DATA_PATH + 'processed_test_df.parquet')
processed_test_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.9896,0.348727,1.660258,-0.694582,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.376869,2.358735,1.660258,0.400581,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.376869,0.920745,1.660258,-0.613646,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.235863,2.09656,1.660258,-0.618705,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.9896,-0.76353,1.660258,0.276648,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [78]:
#pd.testing.assert_frame_equal(processed_test_df, df_final)