<a href="https://www.kaggle.com/code/vicmangiltafolla/sklearn-ml-vs-keras-dnn-basic-comparison?scriptVersionId=145051388" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# House Prices

## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt #data visualization
import math #mathematical operations
import seaborn as sns #prettier data visualization
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler #Scal numerical values
from sklearn.model_selection import train_test_split, GridSearchCV #Divide dataset
from sklearn.linear_model import LinearRegression, RidgeCV, SGDRegressor #Linear Regressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, VotingRegressor #Ensembre Regressor
import keras #Library for DNNs
keras.utils.set_random_seed(42) #Set seed to minimice randomness
from tensorflow.keras.models import Sequential #Build the Neural Network
from keras import optimizers, layers
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score #Metrics for evaluating the models

## Functions

In [None]:
def score_model(model, X_train, X_val, Y_train, Y_val, model_name):
    '''
    Score a model on the training and validation subsets.

    For each subset the MSE, MAE, MAPE and R2 are computed from
    model.predict() and printed. Returns a DataFrame with the columns
    'model', 'metric', 'value' and 'subset': one row per metric and subset
    (8 rows), training rows first.
    '''
    def _evaluate(heading, target, predicted):
        #Compute the four metrics for one subset, print them and return them in order
        mse = mean_squared_error(target, predicted)
        mae = mean_absolute_error(target, predicted)
        mape = mean_absolute_percentage_error(target, predicted)
        r2 = r2_score(target, predicted)
        print(heading)
        print(f'\t-MSE: {mse:.2f}')
        print(f'\t-MAE: {mae:.2f}')
        #MAPE is a fraction, shown as a percentage
        print(f'\t-MAPE: {mape * 100:.2f}%')
        print(f'\t-R2 score: {r2:.2f}')
        return [mse, mae, mape, r2]

    train_values = _evaluate("Training scores: ", Y_train, model.predict(X_train))
    print()
    val_values = _evaluate("Validation scores: ", Y_val, model.predict(X_val))

    metric_names = ['mse', 'mae', 'mape', 'r2']
    return pd.DataFrame({
        'model': [model_name] * 8,
        'metric': metric_names + metric_names,
        'value': train_values + val_values,
        'subset': ['train'] * 4 + ['val'] * 4,
    })
        
    
def plot_scores(metric):
    '''
    Draw a bar chart of one metric for every model, with one bar per subset.

    metric: value of the 'metric' column to plot (e.g. 'mse').
    The rows are read from the global df_scores DataFrame.
    '''
    plt.figure(figsize=(20, 10))
    plt.grid(True)
    metric_rows = df_scores.loc[df_scores['metric'] == metric]
    chart = sns.barplot(data=metric_rows, x='model', y='value', hue='subset')
    chart.set(title=metric.upper())
    
def make_submission(model, test_set, path='/kaggle/working/submission.csv'):
    '''
    Predict SalePrice for the test set and write the csv file to upload to the competition.

    model: fitted estimator exposing predict()
    test_set: preprocessed test features, in the same row order as df_test
    path: destination of the csv file (defaults to the Kaggle working directory)

    The file has the columns Id and SalePrice, in that order.
    The Id values are read from the global df_test DataFrame.
    '''
    #Flatten the predictions so that (n, 1) and (n,) outputs are handled the same way
    pred = np.asarray(model.predict(test_set)).ravel()
    #Pair ids and predictions by position, so the index of df_test can't misalign the rows
    df_pred = pd.DataFrame({'Id': df_test['Id'].to_numpy(), 'SalePrice': pred})
    df_pred.to_csv(path, index=False)

class CustomVoteRegressor():
    '''
    A basic Vote Regressor where the final prediction is the mean of
    the prediction of all estimators.
    Supports combining Keras models and Sklearn models: any object with a
    predict(X) method returning one value per sample can be used.
    '''
    def __init__(self, estimators):
        '''
        estimators: sequence of already fitted models
        '''
        self.estimators = estimators
        self.n_estimators = len(estimators)

    def predict(self, X, Y=None):
        '''
        Return the mean prediction of all estimators as a float64 array of shape (n_samples, 1).

        X: features, passed unchanged to every estimator
        Y: ignored, kept only for interface compatibility

        Raises ValueError when the ensemble has no estimators.
        '''
        if len(self.estimators) == 0:
            raise ValueError('CustomVoteRegressor needs at least one estimator')
        #Converting to float64 copies integer and float32 outputs, so the arrays
        #returned by the estimators are never modified in place and integer
        #predictions can be averaged
        predictions = [np.asarray(estimator.predict(X), dtype=np.float64).reshape(-1, 1)
                       for estimator in self.estimators]
        return np.mean(predictions, axis=0)

In [None]:
#Read the training and test sets from the Kaggle competition input directory
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
#Show the first 5 rows of the training set
df_train.head()

In [None]:
#Display the dtype and non-null count of every column of the training set
df_train.info()

In [None]:
#Show the first 5 rows of the test set
df_test.head()

In [None]:
#Display the dtype and non-null count of every column of the test set
df_test.info()

As we can see, some features have a lot of NaN values, so it is better to remove them from the dataset.

In [None]:
#Keep only the columns that have more than MIN_NON_NULL non-null values in the training set
MIN_NON_NULL = 1000
kept_columns = df_train.columns[df_train.count() > MIN_NON_NULL]
df_train = df_train[kept_columns]
#Apply the selection made on the training set to the test set, so both sets keep
#the same features (columns missing from the test set, like the target, are skipped)
df_test = df_test[[column for column in kept_columns if column in df_test.columns]]

To simplify the analysis, we will split the features into categorical and numerical ones.

# Numerical features

In [None]:
#Keep only the integer and float columns
numeric_dtypes = ['int64', 'float64']
df_train_num = df_train.select_dtypes(include=numeric_dtypes)
df_test_num = df_test.select_dtypes(include=numeric_dtypes)

In [None]:
#Show the first 5 rows of the numerical training features
df_train_num.head()

In [None]:
#Show summary statistics of the numerical training features
df_train_num.describe()

In [None]:
#Correlation of every numerical feature with SalePrice, computed only once.
#The target is dropped by label, so it can never be selected as a feature
saleprice_corr = df_train_num.corr()['SalePrice'].drop('SalePrice')
#Keep the features whose absolute correlation with SalePrice is greater than 0.2
important_num = saleprice_corr[saleprice_corr.abs() > 0.2]
important_num

In [None]:
#Scatter plot with a fitted regression line of every selected feature against SalePrice
for column in important_num.index:
    plt.figure()
    sns.regplot(x=df_train_num[column], y=df_train['SalePrice'], marker='+')
    plt.title(f'{column} vs SalePrice')

In [None]:
#Show the distribution of the values of every selected numerical feature
df_train_num[important_num.index].hist(figsize=(16,20), bins=50)

In [None]:
#Display the dtype and non-null count of the selected numerical features
df_train_num[important_num.index].info()

Most Machine Learning algorithms cannot process null values, so we need to replace them.

In [None]:
#Fill the missing values with a KNNImputer (default settings).
#The imputer is fitted on the training set and then applied to the test set
knn_imputer = KNNImputer()
num_columns = important_num.index
train_imputed = knn_imputer.fit_transform(df_train_num[num_columns])
test_imputed = knn_imputer.transform(df_test_num[num_columns])
df_train_num = pd.DataFrame(train_imputed, columns=num_columns)
df_test_num = pd.DataFrame(test_imputed, columns=num_columns)
df_train_num.info()

In [None]:
#Scale the numerical values with the minimum and maximum of the training set,
#so the training features end up between 0 and 1
scaler = MinMaxScaler()
scaled_columns = important_num.index
train_scaled = scaler.fit_transform(df_train_num[scaled_columns])
test_scaled = scaler.transform(df_test_num[scaled_columns])
df_train_num_scal = pd.DataFrame(train_scaled, columns=scaled_columns)
df_test_num_scal = pd.DataFrame(test_scaled, columns=scaled_columns)

In [None]:
#Show the first 5 rows of the scaled numerical training features
df_train_num_scal.head()

# Categorical features

In [None]:
#Select the columns with object dtype (text), which in this dataset hold the categories
df_train_cat = df_train.select_dtypes(include=['object'])
df_test_cat = df_test.select_dtypes(include=['object'])

In [None]:
#Display the dtype and non-null count of the categorical training features
df_train_cat.info()

There are many strategies for encoding categorical values as numbers; in this case we will use one-hot encoding through the `get_dummies` function of the Pandas library.

In [None]:
# Encode the train and test sets together so both get exactly the same dummy columns,
# then split them back by position (the test rows are the last len_test rows)
len_test = len(df_test)
df_cat = pd.get_dummies(pd.concat([df_train_cat, df_test_cat], axis=0))
df_train_cat_prep = df_cat.iloc[:-len_test]
df_test_cat_prep = df_cat.iloc[-len_test:]

As we can see, there are too many features and not all of them are equally useful, so we will use the same strategy as with the numerical features to find the ones with the highest correlation with SalePrice.

In [None]:
#Find the encoded features whose absolute correlation with SalePrice is greater than 0.2
df_train_cat_prep_ = df_train_cat_prep.copy()
df_train_cat_prep_['SalePrice'] = df_train['SalePrice']
#The correlation matrix is computed only once and the target is dropped by label,
#so it can never be selected as a feature
saleprice_corr_cat = df_train_cat_prep_.corr()['SalePrice'].drop('SalePrice')
important_cat = saleprice_corr_cat[saleprice_corr_cat.abs() > 0.2]
important_cat

In [None]:
#2D histogram of every selected encoded feature against SalePrice
for column in important_cat.index:
    plt.figure()
    sns.histplot(x=df_train_cat_prep_[column], y=df_train['SalePrice'])
    plt.title(f'{column} vs SalePrice')

In [None]:
#Keep only the encoded features selected above, in both the training and test sets
df_train_cat_prep = df_train_cat_prep[important_cat.index].copy()
df_test_cat_prep = df_test_cat_prep[important_cat.index].copy()
df_train_cat_prep

In [None]:
#Display the dtype and non-null count of the selected encoded features
df_train_cat_prep.info()

# Divide Dataset

Now, concatenate the DataFrames 

In [None]:
#Join the encoded categorical features and the scaled numerical features column-wise
X = pd.concat([df_train_cat_prep, df_train_num_scal],axis=1)
#Target of the regression
Y = df_train['SalePrice'].copy()
#Same feature layout for the test set
X_test = pd.concat([df_test_cat_prep, df_test_num_scal],axis=1)
X

In [None]:
#Hold out 20% of the training data as a validation set to score the models and detect overfitting
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=42)

# Model Selection

Now, it's time to try different models to compare them and choose the best one

First, we start with some basic Sklearn algorithms.
I've applied hyperparameter tuning to get the most out of each model and improve the results.

## Linear Models

In [None]:
#Linear Regression with default settings
reg_linear = LinearRegression().fit(X_train, Y_train)
scores_linear = score_model(reg_linear, X_train, X_val, Y_train, Y_val, model_name='Linear')

In [None]:
#Ridge regression; the regularization strength is chosen among the given alphas
#NOTE(review): fit_intercept=False fits the model without an intercept term - confirm this is intended
reg_ridge_cv = RidgeCV(alphas = (0.5, 1, 10), 
                       fit_intercept=False, 
                       alpha_per_target=True)
reg_ridge_cv.fit(X_train, Y_train)
scores_ridge = score_model(reg_ridge_cv, X_train, X_val, Y_train, Y_val, 'Ridge CV')

In [None]:
#Linear model trained with stochastic gradient descent:
#huber loss, constant learning rate (eta0) and no regularization penalty
reg_sgd = SGDRegressor(epsilon=5, 
                       eta0=5, 
                       learning_rate='constant', 
                       loss='huber', 
                       penalty=None, 
                       random_state=42)
reg_sgd.fit(X_train, Y_train)
scores_sgd = score_model(reg_sgd, X_train, X_val, Y_train, Y_val, 'SGD')

## Ensemble Models

Ensemble models train a number of individual models and base the final prediction on the predictions made by each of them. Therefore they tend to perform better, but they are also more computationally expensive and can overfit easily.

In [None]:
#Random Forest Regressor with the tuned hyperparameters
rndf_params = {
    'n_estimators': 75,
    'criterion': 'friedman_mse',
    'max_depth': 16,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'random_state': 42,
}
reg_rndf = RandomForestRegressor(**rndf_params)
reg_rndf.fit(X_train, Y_train)
scores_rndf = score_model(reg_rndf, X_train, X_val, Y_train, Y_val, model_name='Rnd Forest')

### VotingRegressor

In [None]:
#Combine the four regressors above; the ensemble outputs the mean of their predictions
reg_vote = VotingRegressor(estimators=[('Linear Reg', reg_linear),
                                       ('Ridge Reg', reg_ridge_cv), 
                                       ('SGD Reg', reg_sgd), 
                                       ('Rnd Forest Reg', reg_rndf)])
reg_vote.fit(X_train, Y_train)
scores_vote = score_model(reg_vote, X_train, X_val, Y_train, Y_val, 'Voter')

## Deep Neural Networks

Now it's time for the DNNs: several basic architectures, which differ in the number of neurons and layers, are going to be trained.

Also, the 'batch_size' and the number of 'epochs' will change in each model in order to see how they affect the training process.

### 1.0

In the first network we can see how the loss and the error decrease quickly; this is mainly because of the small batch_size.

A batch_size = 1 means that the network will update its parameters after each sample in the dataset; therefore each epoch takes longer to complete, but the network can reach a decent result in fewer epochs.

In [None]:
#Build Model: 6 hidden ReLU layers of decreasing width and a single linear output for the price
network1 = Sequential()
#The input has one value per feature column of X_train
network1.add(layers.Dense(300, activation='relu', input_shape=[len(X_train.keys())]))
network1.add(layers.Dense(250, activation='relu'))
network1.add(layers.Dense(200, activation='relu'))
network1.add(layers.Dense(150, activation='relu'))
network1.add(layers.Dense(100, activation='relu'))
network1.add(layers.Dense(50, activation='relu'))
network1.add(layers.Dense(1))

#Train on MSE and track MAE and MAPE
network1.compile( 
    optimizer=optimizers.Adam(), 
    loss='mean_squared_error', 
    metrics=['mean_absolute_error', 'mean_absolute_percentage_error'], 
)
#The features are converted to float32 arrays before being passed to Keras
history = network1.fit(x=np.asarray(X_train).astype('float32'), y=Y_train,
         batch_size = 1, 
         epochs = 15, 
         validation_data = (np.asarray(X_val).astype('float32'), Y_val), 
         shuffle=False) 

In [None]:
#Score the first network on the training and validation sets
scores_nn1 = score_model(network1, np.asarray(X_train).astype('float32'), 
                         np.asarray(X_val).astype('float32'), Y_train, Y_val, 'DNN 1')

In [None]:
#Training vs validation MAE per epoch of the first network
pd.DataFrame(history.history)[['mean_absolute_error', 'val_mean_absolute_error']].plot(figsize=(20,10), ylabel='MAE', xlabel='Epochs', grid=True)

In [None]:
#Training vs validation MAPE per epoch of the first network
pd.DataFrame(history.history)[['mean_absolute_percentage_error', 'val_mean_absolute_percentage_error']].plot(figsize=(20,10), ylabel='MAPE', xlabel='Epochs', grid=True)

### 2.0

Increasing the batch_size speeds up the training process but decreases the performance of each epoch. 

It's also important to notice that in the previous graph the model was still not close to overfitting, which starts to happen in this case, even if only slightly.

In [None]:
#Build Model: 7 hidden ReLU layers whose width is roughly halved at each step, and a single linear output

network2 = Sequential()
#The input has one value per feature column of X_train
network2.add(layers.Dense(400, activation='relu', input_shape=[len(X_train.keys())]))
network2.add(layers.Dense(200, activation='relu'))
network2.add(layers.Dense(100, activation='relu'))
network2.add(layers.Dense(50, activation='relu'))
network2.add(layers.Dense(25, activation='relu'))
network2.add(layers.Dense(10, activation='relu'))
network2.add(layers.Dense(5, activation='relu'))
network2.add(layers.Dense(1))

#Train on MSE and track MAE and MAPE
network2.compile( 
    optimizer=optimizers.Adam(), 
    loss='mean_squared_error', 
    metrics=['mean_absolute_error', 'mean_absolute_percentage_error'], 
)
#Larger batch and more epochs than the first network
history = network2.fit(x=np.asarray(X_train).astype('float32'), y=Y_train,
         batch_size = 5, 
         epochs = 20, 
         validation_data = (np.asarray(X_val).astype('float32'), Y_val), 
         shuffle=False) 

In [None]:
#Score the second network on the training and validation sets
scores_nn2 = score_model(network2, np.asarray(X_train).astype('float32'), 
                         np.asarray(X_val).astype('float32'), Y_train, Y_val, 'DNN 2')

In [None]:
#Training vs validation MAE per epoch of the second network
pd.DataFrame(history.history)[['mean_absolute_error', 'val_mean_absolute_error']].plot(figsize=(20,10), ylabel='MAE', xlabel='Epochs', grid=True)

In [None]:
#Training vs validation MAPE per epoch of the second network
pd.DataFrame(history.history)[['mean_absolute_percentage_error', 'val_mean_absolute_percentage_error']].plot(figsize=(20,10), ylabel='MAPE', xlabel='Epochs', grid=True)

### 3.0

The behavior continues: we need to increase the number of epochs in order to get results similar to those of the first network, and we also see a slightly more aggressive overfitting that starts earlier in the training.

In [None]:
#Build Model: 5 hidden ReLU layers of 400 units each and a single linear output

network3 = Sequential()
#The input has one value per feature column of X_train
network3.add(layers.Dense(400, activation='relu', input_shape=[len(X_train.keys())]))
network3.add(layers.Dense(400, activation='relu'))
network3.add(layers.Dense(400, activation='relu'))
network3.add(layers.Dense(400, activation='relu'))
network3.add(layers.Dense(400, activation='relu'))
network3.add(layers.Dense(1))

#Train on MSE and track MAE and MAPE
network3.compile( 
    optimizer=optimizers.Adam(), 
    loss='mean_squared_error', 
    metrics=['mean_absolute_error', 'mean_absolute_percentage_error'], 
)
#Larger batch and more epochs than the second network
history = network3.fit(x=np.asarray(X_train).astype('float32'), y=Y_train,
         batch_size = 10, 
         epochs = 22, 
         validation_data = (np.asarray(X_val).astype('float32'), Y_val), 
         shuffle=False) 

In [None]:
#Score the third network on the training and validation sets
scores_nn3 = score_model(network3, np.asarray(X_train).astype('float32'), 
                         np.asarray(X_val).astype('float32'), 
                         Y_train, Y_val, 'DNN 3')

In [None]:
#Training vs validation MAE per epoch of the third network
pd.DataFrame(history.history)[['mean_absolute_error', 'val_mean_absolute_error']].plot(figsize=(20,10), ylabel='MAE', xlabel='Epochs', grid=True)

In [None]:
#Training vs validation MAPE per epoch of the third network
pd.DataFrame(history.history)[['mean_absolute_percentage_error', 'val_mean_absolute_percentage_error']].plot(figsize=(20,10), ylabel='MAPE', xlabel='Epochs', grid=True)

### 4.0

In [None]:
#Build Model: 5 hidden ReLU layers of 200 units each and a single linear output

network4 = Sequential()
#The input has one value per feature column of X_train
network4.add(layers.Dense(200, activation='relu', input_shape=[len(X_train.keys())]))
network4.add(layers.Dense(200, activation='relu'))
network4.add(layers.Dense(200, activation='relu'))
network4.add(layers.Dense(200, activation='relu'))
network4.add(layers.Dense(200, activation='relu'))
network4.add(layers.Dense(1))

#Train on MSE and track MAE and MAPE
network4.compile( 
    optimizer=optimizers.Adam(), 
    loss='mean_squared_error', 
    metrics=['mean_absolute_error', 'mean_absolute_percentage_error'], 
)
#Largest batch size and number of epochs of the four networks
history = network4.fit(x=np.asarray(X_train).astype('float32'), y=Y_train,
         batch_size = 30, 
         epochs = 40, 
         validation_data = (np.asarray(X_val).astype('float32'), Y_val), 
         shuffle=False) 

In [None]:
#Score the fourth network on the training and validation sets
scores_nn4 = score_model(network4, np.asarray(X_train).astype('float32'), 
                         np.asarray(X_val).astype('float32'), 
                         Y_train, Y_val, 'DNN 4')

In [None]:
#Training vs validation MAE per epoch of the fourth network
pd.DataFrame(history.history)[['mean_absolute_error', 'val_mean_absolute_error']].plot(figsize=(20,10), ylabel='MAE', xlabel='Epochs', grid=True)

In [None]:
#Training vs validation MAPE per epoch of the fourth network
pd.DataFrame(history.history)[['mean_absolute_percentage_error', 'val_mean_absolute_percentage_error']].plot(figsize=(20,10), ylabel='MAPE', xlabel='Epochs', grid=True)

### Voting Regressor with DNN

The VotingRegressor algorithm from sklearn comes in very handy with ML models; however, it has some trouble with the Keras Sequential models.

This is where the CustomVoteRegressor class is helpful: a really basic class that returns the mean of the predictions of its models.

In [None]:
#Average the predictions of the four networks and score the ensemble
nn_vote = CustomVoteRegressor([network1, network2, network3, network4])
scores_vote_nn = score_model(nn_vote, np.asarray(X_train).astype('float32'), 
                             np.asarray(X_val).astype('float32'), 
                             Y_train, Y_val, 'Vote DNN')

## Combining Machine and Deep Learning

Because of the simplicity of the previous class, we can use it to combine both sklearn ML and Keras DNN models.

In [None]:
#Average the sklearn voting ensemble and the DNN voting ensemble with equal weight
final_vote = CustomVoteRegressor([reg_vote, nn_vote])
scores_final = score_model(final_vote, np.asarray(X_train).astype('float32'), 
                           np.asarray(X_val).astype('float32'), 
                           Y_train, Y_val, 'Final Vote')

# Model comparison 

Saving the scores in a DataFrame makes it easy to plot each metric and compare the performance of the models, looking for differences and areas of improvement.

In [None]:
#Stack the score tables of all the models into one long DataFrame, read by plot_scores
scores = [scores_linear, scores_ridge, scores_sgd, scores_rndf, scores_vote, scores_nn1, scores_nn2, scores_nn3, scores_nn4, scores_vote_nn, scores_final]
df_scores = pd.concat(scores)

In [None]:
#Mean squared error of every model, training vs validation
plot_scores('mse')

In [None]:
#Mean absolute error of every model, training vs validation
plot_scores('mae')

In [None]:
#Mean absolute percentage error of every model, training vs validation
plot_scores('mape')

In [None]:
#R2 score of every model, training vs validation
plot_scores('r2')

# Submission

In [None]:
#Write the submission file with the predictions of the sklearn voting ensemble
make_submission(reg_vote, X_test)

Thanks for reading!

If you find this notebook helpful, please upvote and comment your thoughts.

Any feedback is welcome