the notebook to explore different normalization method to the final learning result

## Data Analysis

I use the data from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data) to explore the potential problems I have mentioned


In [13]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

# diaplay all the columns
pd.pandas.set_option("display.max_columns", None)

data = pd.read_csv("train.csv")

data.drop("Id", axis=1, inplace=True)

In [11]:
# get the prediction target
data['SalePrice']

# get the categorical and numerical featuers
car_feas = [fea for fea in data.columns if data[fea].dtype == 'O']
## add MSSubClass to cate features
car_feas = car_feas + ['MSSubClass']
## cast all variables as categorical
data[car_feas] = data[car_feas].astype('O')

num_feas = [fea for fea in data.columns if fea not in car_feas and fea not in ["SalePrice"]]


## Feature Engineering


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
import joblib


X_train,X_test,y_train, y_test = train_test_split(
    data.drop([ 'SalePrice'],axis=1),
    data['SalePrice'],
    test_size=0.1,
    random_state=0 
)

# handle skewed distribution in prediction matrix
y_train = np.log(y_train)
y_test = np.log(y_test)

### processing categorical variables

replace missing value with the string 'missing' or the most frequent category

In [15]:
# find features with missing values
car_feas_with_na = [
    fea for fea in car_feas
    if X_train[fea].isnull().sum()>0
]

In [16]:
# find the features with high ratio with missing values -- replace with missing
fea_str_missing = [
    fea for fea in car_feas_with_na if X_train[fea].isnull().mean() > 0.1
]

# find the features with low ratio with missing values --- replace with most frequent category
fea_str_category = [
    fea for fea in car_feas_with_na if X_train[fea].isnull().mean() < 0.1
]

In [17]:
# replace missing values with string "Missing"
X_train[fea_str_missing] = X_train[fea_str_missing].fillna("Missing")
X_test[fea_str_missing] = X_test[fea_str_missing].fillna("Missing")

In [18]:
for fea in fea_str_category:
    # The mode of a set of values is the value that appears most often
    mode = X_train[fea].mode()[0]
    X_train[fea].fillna(mode, inplace=True)
    X_test[fea].fillna(mode, inplace=True)

### processing numeric variable

replace missing values with mean

In [19]:
for fea in num_feas:
    mean_val = data[fea].mean()
    # add binary missing indicator
    X_train[fea+'_na'] = np.where(X_train[fea].isnull(), 1, 0)
    X_test[fea+'_na'] = np.where(X_test[fea].isnull(), 1, 0)

    # replace missing values by the mean
    X_train[fea].fillna(mean_val, inplace=True)
    X_test[fea].fillna(mean_val, inplace=True)
    

numerical variable transformation

In [20]:
# logarithmic transformation ---- work only on positive numerical
for var in ["LotFrontage", "1stFlrSF", "GrLivArea"]:
    X_train[var] = np.log(X_train[var])
    X_test[var] = np.log(X_test[var])

# Yeo-Johnson transformation
X_train['LotArea'], param = stats.yeojohnson(X_train['LotArea'])
X_test['LotArea'] = stats.yeojohnson(X_test['LotArea'], lmbda=param)

# for very skewed variables, using Binary transformation
skewed = [
    'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'MiscVal'
]

for var in skewed:
    # map the variable values into 0 and 1
    X_train[var] = np.where(X_train[var]==0, 0, 1)
    X_test[var] = np.where(X_test[var]==0, 0, 1)

  loglike = -n_samples / 2 * np.log(trans.var(axis=0))
  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)


### Categorical Features Mappings

the values in some features have an assigned order, the mapping is used here to convert them into numeric data 

In [54]:
qual_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0, np.NaN: 0}

qual_feas = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
             'HeatingQC', 'KitchenQual', 'FireplaceQu',
             'GarageQual', 'GarageCond',
            ]

for fea in qual_feas:
    X_train[fea] = X_train[fea].map(qual_mappings)
    X_test[fea] = X_test[fea].map(qual_mappings)

exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, np.NaN: 0}

X_train['BsmtExposure'] = X_train['BsmtExposure'].map(exposure_mappings)
X_test['BsmtExposure'] = X_test['BsmtExposure'].map(exposure_mappings)

finish_mappings = {'Missing': 0, np.NaN: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

finish_vars = ['BsmtFinType1', 'BsmtFinType2']

for var in finish_vars:
    X_train[var] = X_train[var].map(finish_mappings)
    X_test[var] = X_test[var].map(finish_mappings)

garage_mappings = {'Missing': 0, np.NaN: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

X_train['GarageFinish'] = X_train['GarageFinish'].map(garage_mappings)
X_test['GarageFinish'] = X_test['GarageFinish'].map(garage_mappings)

fence_mappings = {'Missing': 0, np.NaN: 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}

X_train['Fence'] = X_train['Fence'].map(fence_mappings)
X_test['Fence'] = X_test['Fence'].map(fence_mappings)


### Encoding of categorical variables



In [55]:
qual_vars  = qual_feas + finish_vars + ['BsmtExposure','GarageFinish','Fence']

# capture the remaining categorical variables
# (those that we did not re-map)

cat_others = [
    var for var in car_feas if var not in qual_feas
]

def replace_categories(train, test, y_train, var, target):
    
    tmp = pd.concat([X_train, y_train], axis=1)
    
    # order the categories in a variable from that with the lowest
    # house sale price, to that with the highest
    ordered_labels = tmp.groupby([var])[target].mean().sort_values().index

    # create a dictionary of ordered categories to integer values
    ordinal_label = {k: i for i, k in enumerate(ordered_labels, 0)}
    
    # use the dictionary to replace the categorical strings by integers
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

for var in cat_others:
    replace_categories(X_train, X_test, y_train, var, 'SalePrice')

In [60]:
right_num_feas = [var for var in X_train.columns if X_train[var].isnull().sum() == 0]

### Feature Scaling

*converting in the range [0,1] works particularly 
well if you are dealing with a sparse matrix and most of your values are zero* ---- Large Scale Machine Learning with Python

I will compare four different methods here to explain the difference of them.

[Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer)     ---- normalize samples individually to unit norm, without inverse_transform

[MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler)   ---- transform features by scaling each feature to a given range, with inverse_transform

[RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)   ---- Scale features using statistics that are robust to outliers, with inverse_transform

[StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler) ---- Standardize features by removing the mean and scaling to unit variance, with inverse_transform


In [90]:
X_train = X_train[right_num_feas]
X_test = X_test[right_num_feas]


# test MinMaxScaler
# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = RobustScaler()
scaler = Normalizer()
#  fit  the scaler to the train set
scaler.fit(X_train) 

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [91]:
X_train.to_csv('xtrain.csv', index=False)
X_test.to_csv('xtest.csv', index=False)

y_train.to_csv('ytrain.csv', index=False)
y_test.to_csv('ytest.csv', index=False)

## Feature Selection


In [92]:
# from sklearn.linear_model import Lasso
# from sklearn.feature_selection import SelectFromModel

# X_train


In [93]:
# X_train = pd.read_csv('xtrain.csv')
# X_test = pd.read_csv('xtest.csv')

# y_train = pd.read_csv('ytrain.csv')
# y_test = pd.read_csv('ytest.csv')

# sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))
# sel_.fit(X_train, y_train)

## Model training

In [94]:
# to build the model
import pandas as pd
# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

X_train = pd.read_csv('xtrain.csv')
X_test = pd.read_csv('xtest.csv')

y_train = pd.read_csv('ytrain.csv')
y_test = pd.read_csv('ytest.csv')


In [95]:
lin_model = Lasso(alpha=0.001, random_state=0)

# train the model

lin_model.fit(X_train, y_train)

Lasso(alpha=0.001, random_state=0)

In [96]:
# make predictions for train set
pred = lin_model.predict(X_train)

# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred)))))
print('train rmse: {}'.format(int(
    mean_squared_error(np.exp(y_train), np.exp(pred), squared=False))))
print('train r2: {}'.format(
    r2_score(np.exp(y_train), np.exp(pred))))
print()

# make predictions for test set
pred = lin_model.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred)))))
print('test rmse: {}'.format(int(
    mean_squared_error(np.exp(y_test), np.exp(pred), squared=False))))
print('test r2: {}'.format(
    r2_score(np.exp(y_test), np.exp(pred))))
print()

print('Average house price: ', int(np.exp(y_train).median()))

train mse: 1448379939
train rmse: 38057
train r2: 0.768030278886929

test mse: 2890698512
test rmse: 53765
test r2: 0.5793582628440888

Average house price:  163000


MinMaxScaler:
    
    - train mse: 844795172
    - train rmse: 29065
    - train r2: 0.8646992440387936

    - test mse: 1116674539
    - test rmse: 33416
    - test r2: 0.8375064309369368

    - Average house price:  163000

StandardScaler:

    - train mse: 692046354
    - train rmse: 26306
    - train r2: 0.8891631985314724

    - test mse: 1257957480
    - test rmse: 35467
    - test r2: 0.8169475586394037

    - Average house price:  16300

RobustScaler:

    - train mse: 697763515
    - train rmse: 26415
    - train r2: 0.8882475491126485

    - test mse: 1290265505
    - test rmse: 35920
    - test r2: 0.8122462368902384

    - Average house price:  163000

Normalizer:

    - train mse: 1448379939
    - train rmse: 38057
    - train r2: 0.768030278886929

    - test mse: 2890698512
    - test rmse: 53765
    - test r2: 0.5793582628440888

    - Average house price:  163000