In [1]:
import os
import platform
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Resolve the project root as the parent of the notebook's working directory.
# os.path.dirname uses the separator rules of the running platform and, unlike
# a manual split/join, does not collapse a top-level directory (e.g. "/foo" or
# "C:\foo") into an empty or drive-relative string.
PROJECT_PATH = os.path.dirname(os.getcwd())

DATA_PATH = os.path.join(PROJECT_PATH, 'data')
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train.csv')

# Read the training CSV into a DataFrame and preview its first rows
train_data = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Training & Testing Sets

In [2]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42
target_label = 'SalePrice'

# Separate the target column from the predictors
y = train_data[target_label]
X = train_data.drop(columns=[target_label])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)
print('Train Size: ', X_train.shape)
print('Test Size: ', X_test.shape)

Train Size:  (1022, 80)
Test Size:  (438, 80)


In [3]:
# Build a separate frame to manipulate, without the identifier column.
# `drop` returns a new DataFrame, so X_train itself is left untouched.
X_train_df = X_train.drop(columns=['Id'])

In [4]:
# NOTE(review): the wildcard import hides which names come from utils.core.
# remove_missing_features (used below) and cross_val_regression (used in later
# cells) are presumably both provided by it — confirm and import explicitly.
from utils.core import *

# Rebinds X_train_df to the helper's result. With verbose=True the helper
# printed four columns and their missing-value percentages (see the cell
# output); presumably these mostly-missing columns are the ones removed —
# verify the threshold in utils.core.
X_train_df = remove_missing_features(X_train_df, verbose=True)

Alley         93.542%
PoolQC        99.511%
Fence         80.235%
MiscFeature   96.086%


## Numeric Features

In [5]:
# Keep only the int64 / float64 columns of the training frame
numeric_features = X_train_df.select_dtypes(include=['int64', 'float64'])
numeric_cols = list(numeric_features.columns)
numeric_features.shape

(1022, 36)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Median-impute missing values, then rescale with MinMaxScaler's defaults
numeric_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Apply those steps to the numeric columns only
numeric_pipeline = ColumnTransformer(
    transformers=[('process', numeric_steps, numeric_cols)],
)

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Impute and scale the numeric columns; the transformers are fitted on the
# whole training split here.
# NOTE(review): assuming cross_val_regression runs k-fold CV internally, the
# imputer/scaler statistics above have already seen every validation fold.
# Consider cross-validating a Pipeline on the raw features if the helper
# accepts one — confirm against utils.core.
numeric_training = numeric_pipeline.fit_transform(numeric_features)

rand_forest = RandomForestRegressor(random_state=RANDOM_STATE)
scores = cross_val_regression(rand_forest, numeric_training, y_train)
print('Numeric Random Forest RMSE: {:.6f}'.format(scores['rmse']))
# '{:.6f}' (precision) replaces '{:6f}' (minimum width) to match the RMSE line;
# the 'rmlse' key spelling is kept because it is what the helper's result is read with.
print('Numeric Random Forest RMSLE: {:.6f}'.format(scores['rmlse']))

Numeric Random Forest RMSE: 31600.591887
Numeric Random Forest RMSLE: 0.153450


In [30]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer

class Scalers(object):
    """Stateless helpers that fit a fresh scaler on ``data`` and return the result.

    Every method builds a new transformer on each call and does not keep it,
    so the fitted parameters cannot be reused on another dataset.
    """

    @staticmethod
    def standard_scaler(data):
        """Return ``data`` transformed by a newly fitted ``StandardScaler``."""
        return StandardScaler().fit_transform(data)

    @staticmethod
    def min_max_scaler(data):
        """Return ``data`` transformed by a newly fitted ``MinMaxScaler``."""
        return MinMaxScaler().fit_transform(data)

    @staticmethod
    def log_scaler(data):
        """Return ``data`` passed through ``np.log1p`` via ``FunctionTransformer``."""
        return FunctionTransformer(np.log1p).fit_transform(data)

    @staticmethod
    def apply_scalers(data):
        """Run every scaler above on ``data``.

        Declared as a static method like its siblings so that it works both
        as ``Scalers.apply_scalers(data)`` and on an instance.

        Returns:
            dict: scaled results keyed by 'standard', 'min_max' and 'log'.
        """
        return {
            'standard': Scalers.standard_scaler(data),
            'min_max': Scalers.min_max_scaler(data),
            'log': Scalers.log_scaler(data)
        }

In [40]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score the scaled numeric features with f_regression and keep the top 15
k_best_numeric = SelectKBest(score_func=f_regression, k=15)
k_best_numeric.fit(numeric_training, y_train)
k_best_numeric_data = k_best_numeric.transform(numeric_training)

In [42]:
# Re-evaluate the same random forest on the 15 selected features
scores = cross_val_regression(rand_forest, k_best_numeric_data, y_train)
print('Numeric Random Forest k=15 RMSE: {:.6f}'.format(scores['rmse']))
# '{:.6f}' (precision) replaces '{:6f}' (minimum width) to match the RMSE line
print('Numeric Random Forest k=15 RMSLE: {:.6f}'.format(scores['rmlse']))

Numeric Random Forest k=15 RMSE: 31748.538837
Numeric Random Forest k=15 RMSLE: 0.157814


After selecting the best 15 features, the model's performance has slightly decreased. Next, try PCA on all the numeric features to see whether the same scores are maintained.

In [47]:
from sklearn.decomposition import PCA

# Use the notebook-wide seed (RANDOM_STATE, set with the train/test split)
# instead of a second hard-coded 42. A fractional n_components asks PCA for
# the number of components needed to reach that share of explained variance.
pca = PCA(random_state=RANDOM_STATE, n_components=0.99)
pca_numeric = pca.fit_transform(numeric_training)

In [48]:
# Re-evaluate the same random forest on the PCA-projected numeric features
scores = cross_val_regression(rand_forest, pca_numeric, y_train)
print('Numeric Random Forest PCA RMSE: {:.6f}'.format(scores['rmse']))
# '{:.6f}' (precision) replaces '{:6f}' (minimum width) to match the RMSE line
print('Numeric Random Forest PCA RMSLE: {:.6f}'.format(scores['rmlse']))

Numeric Random Forest PCA RMSE: 36132.672838
Numeric Random Forest PCA RMSLE: 0.170683
