In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import make_scorer
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

In [2]:
train = pd.read_csv('~/Earlygithub/dsp-oluwatimileyin_victor-adedigba/data/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('~/Earlygithub/dsp-oluwatimileyin_victor-adedigba/data/house-prices-advanced-regression-techniques/test.csv')

In [3]:
train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [3]:
feature_list =  ['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BedroomAbvGr', 
                 'KitchenAbvGr', 'GarageArea',
                 '1stFlrSF', 'MSZoning', 'Heating']
target  = ['SalePrice']

In [26]:
#Train data
x_train = train[feature_list]

ytrain = train[target]

In [10]:
#Test data
X_test = test[feature_list]

In [27]:
x_train.duplicated().sum()

0

In [28]:
x_train.isnull().sum()

Id              0
LotArea         0
YearBuilt       0
BsmtFinSF1      0
BedroomAbvGr    0
KitchenAbvGr    0
GarageArea      0
1stFlrSF        0
MSZoning        0
Heating         0
dtype: int64

In [29]:
Xtrain = x_train.drop(['MSZoning', 'Heating'], axis = 1)

In [14]:
Xtrain.corr()

Unnamed: 0,Id,LotArea,YearBuilt,BsmtFinSF1,BedroomAbvGr,KitchenAbvGr,GarageArea,1stFlrSF
Id,1.0,-0.033226,-0.012713,-0.005024,0.037719,0.002951,0.017634,0.010496
LotArea,-0.033226,1.0,0.014228,0.214103,0.11969,-0.017784,0.180403,0.299475
YearBuilt,-0.012713,0.014228,1.0,0.249503,-0.070651,-0.1748,0.478954,0.281986
BsmtFinSF1,-0.005024,0.214103,0.249503,1.0,-0.107355,-0.081007,0.29697,0.445863
BedroomAbvGr,0.037719,0.11969,-0.070651,-0.107355,1.0,0.198597,0.065253,0.127401
KitchenAbvGr,0.002951,-0.017784,-0.1748,-0.081007,0.198597,1.0,-0.064433,0.068101
GarageArea,0.017634,0.180403,0.478954,0.29697,0.065253,-0.064433,1.0,0.489782
1stFlrSF,0.010496,0.299475,0.281986,0.445863,0.127401,0.068101,0.489782,1.0


In [15]:
Xtrain.duplicated().sum()

0

In [16]:
Xtrain

Unnamed: 0,Id,LotArea,YearBuilt,BsmtFinSF1,BedroomAbvGr,KitchenAbvGr,GarageArea,1stFlrSF
0,1,8450,2003,706,3,1,548,856
1,2,9600,1976,978,3,1,460,1262
2,3,11250,2001,486,3,1,608,920
3,4,9550,1915,216,3,1,642,961
4,5,14260,2000,655,4,1,836,1145
...,...,...,...,...,...,...,...,...
1455,1456,7917,1999,0,3,1,460,953
1456,1457,13175,1978,790,3,1,500,2073
1457,1458,9042,1941,275,4,1,252,1188
1458,1459,9717,1950,49,2,1,240,1078


In [17]:
# Define the preprocessing steps for continuous and categorical features
numeric_features = ['Id', 'LotArea', 'YearBuilt', 
                 'BsmtFinSF1', 'BedroomAbvGr',
                    'KitchenAbvGr', 'GarageArea','1stFlrSF']
categorical_features = ['MSZoning', 'Heating']

In [31]:
print(x_train.shape)
print(ytrain.shape)


(1460, 10)
(1460, 1)


In [32]:
X_train, X_val, y_train, y_val = train_test_split(x_train, ytrain, test_size=0.2, random_state=42)

In [33]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # To Scale numeric features
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())  # Encoding the categorical features
])

# Combining the transformers for both types of features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [34]:
# Create the full modeling pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', HistGradientBoostingRegressor())
])


In [35]:
# Fit the model to the training data
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [36]:
# Make predictions on the test set
y_pred = model.predict(X_val)

In [37]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [38]:
compute_rmsle(y_val, y_pred)

0.2

In [40]:
processed_df = pd.concat([x_train, ytrain], axis=1)

In [41]:
processed_df = pd.DataFrame(processed_df)

In [42]:
processed_df.to_parquet('../data/processed_df.parquet', index=False)

In [43]:
correctly_processed_df = pd.read_parquet('../data/processed_df.parquet')

In [44]:
pd.testing.assert_frame_equal(processed_df, correctly_processed_df)