# Machine Learning Shootout

In [1]:
import pandas as pd
import numpy as np
import os
#import seaborn as sns # heatmaps yay

from datetime import datetime

#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from scipy.stats import skew
#from scipy.stats import norm
from scipy.stats import boxcox

from scipy.special import inv_boxcox

import math

#import matplotlib.pyplot as plt
#%matplotlib inline
from tpot import TPOTRegressor
from dask.distributed import Client

In [12]:
client = Client(n_workers=6, threads_per_worker=2)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [13]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:34017  Dashboard: http://127.0.0.1:36983/status,Cluster  Workers: 6  Cores: 12  Memory: 33.74 GB


In [14]:
lmbda_opts = {}

In [15]:
def drop_high_missing_features(df):
    tot_rec = len(df.index)
    for col in df.columns.values:
        if df[col].isnull().sum() / tot_rec > 0.15:
            del df[col]

def impute_missing_data(df):
    fill_with = {'None': ['PoolQC', 'MiscFeature', 'Alley', 'Fence',
                          'FireplaceQu', 'GarageType', 'GarageFinish',
                          'GarageQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                          'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSSubClass'],
                 0: ['GarageYrBlt', 'GarageArea', 'GarageCars',
                   'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                   'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
                   'MasVnrArea'],
                'Typ': ['Functional']}
    for fw in fill_with:
        for f in fill_with[fw]:
            df[f] = df[f].fillna(fw)
    
    
    #df["Functional"] = df["Functional"].fillna("Typ")
    
    # Fill with the mode
    mode_list = ['SaleType', 'Exterior2nd', 'Exterior1st', 'KitchenQual', 
                 'Electrical', 'MSZoning']
    for f in mode_list:
        df[f] = df[f].fillna(df[f].mode()[0])
    
    
    # Fill the missing lot frontage with the median value for the neighborhood
    df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    
    return df

def calculate_nulls(df):
    df_nulls = df.copy().isnull().sum()
    df_nulls = df_nulls.to_frame().rename(columns={0:'num_nulls'})
    df_nulls['total_records'] = len(df.index)
    df_nulls['pct_null'] = df_nulls['num_nulls'] / df_nulls['total_records'] * 100
    df_nulls = df_nulls[df_nulls['num_nulls'] > 0].sort_values(by='pct_null', ascending=False)
    return df_nulls

def process_df(df, is_test=False):
    df.set_index('Id')

    df = impute_missing_data(df)
    
    df = df.drop(['Utilities'], axis=1)

    # Convert to strings as they're categories:
    str_cols = ['MSSubClass', 'OverallCond', 'MoSold', 'YrSold', 'YearRemodAdd', 'YearBuilt', 'GarageYrBlt']
    for f in str_cols:
        df[f] = df[f].astype(str)

    ohe_cols = df.dtypes[df.dtypes == "object"].index
    df_dummies = pd.get_dummies(df[ohe_cols])
    df = df.drop(ohe_cols, axis='columns')
    df = pd.concat([df, df_dummies], sort=False, axis='columns')

    # Use a label encoder for the categorical fields
    le_cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
           'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
           'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
           'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
           'MoSold', 'YrSold', 'YearRemodAdd', 'YearBuilt', 'GarageYrBlt']
    for c in le_cols:
        if c in df.columns.values:
            le = LabelEncoder() 
            le.fit(list(df[c].values)) 
            df[c] = le.transform(list(df[c].values))
        
    numeric_feats = df.dtypes[df.dtypes != "object"].index.values        
    df = df.loc[:, numeric_feats]
        
    # Sale Price is in our training data, but not testing data
    shift = 1.0
    # Train data
    if 'SalePrice' in df.columns.values:
        filter = (df['GrLivArea'] > 4000) & (df['SalePrice'] < 200000)
        df = df.loc[~filter, :]

        # Check the skew of all numerical features
        skewed_feats = df[numeric_feats[1:]].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
        skewness = pd.DataFrame({'Skew' :skewed_feats})
        
        for f in skewness[abs(skewness) > 0.75].index:
            if f not in le_cols and '_' not in f: #ignore label encoded fields and OHE columns
                df[f], lmbda_opts[f] = boxcox(df[f] + shift)

    else:  # Test data
        for f in lmbda_opts:
            if f in df.columns:
                df[f] = boxcox(df[f] + shift, lmbda=lmbda_opts[f])
            elif f != 'SalePrice':
                df[f] = 0

    return df

In [16]:
df = pd.read_csv(os.path.join('data','train.csv'))
df = process_df(df)

In [17]:
from sklearn.model_selection import train_test_split

y = df.pop('SalePrice')
X = df

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=8675309)

In [48]:
tp = TPOTRegressor(generations=50, population_size=50, use_dask=True, n_jobs=-1, verbosity=3, warm_start=False)

In [49]:
tp.fit(X_train, y_train)

29 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=2550, style=ProgressStyle(descrip…

_pre_test decorator: _random_mutation_operator: num_test=0 '(slice(None, None, None), 0)' is an invalid key.
_pre_test decorator: _random_mutation_operator: num_test=0 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 60.
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l2' and loss='epsilon_insensitive' are not supported when dual=False, Parameters: penalty='l2', loss='epsilon_insensitive', dual=False.
Generation 1 - Current Pareto front scores:
-1	-0.002092170384332432	RidgeCV(input_matrix)
-2	-0.0020540081245978875	RidgeCV(SelectPercentile(input_matrix, SelectPercentile__percentile=34))

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
_pre_test decorator: _random_mutation_operator: num_test=1 '(slice(None, None, None), 0)' is an invalid key.
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupp



Generation 4 - Current Pareto front scores:
-1	-0.002092170384332432	RidgeCV(input_matrix)
-2	-0.0020224082303179837	RidgeCV(SelectPercentile(input_matrix, SelectPercentile__percentile=63))

_pre_test decorator: _random_mutation_operator: num_test=0 Automatic alpha grid generation is not supported for l1_ratio=0. Please supply a grid by providing your estimator with the appropriate `alphas=` argument..
_pre_test decorator: _random_mutation_operator: num_test=0 The condensed distance matrix must contain only finite values..
_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch

Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.






TPOTRegressor(config_dict=None, crossover_rate=0.1, cv=5,
       disable_update_check=False, early_stop=None, generations=50,
       max_eval_time_mins=5, max_time_mins=None, memory=None,
       mutation_rate=0.9, n_jobs=-1, offspring_size=None,
       periodic_checkpoint_folder=None, population_size=50,
       random_state=None, scoring=None, subsample=1.0, template=None,
       use_dask=True, verbosity=3, warm_start=False)



In [24]:
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import MinMaxScaler

In [50]:
tp.export('tpot_pipeline.py')

In [27]:
from sklearn.pipeline import make_pipeline

In [28]:
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    ElasticNetCV(l1_ratio=0.45, tol=1e-05)
)

In [29]:
exported_pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('elasticnetcv', ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=0.45, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=1e-05, verbose=0))])

In [32]:
results = exported_pipeline.predict(X_test)

In [33]:
mse = mean_squared_error(y_test, results)

NameError: name 'mean_squared_error' is not defined

In [38]:
now = datetime.now()
df_sub.to_csv(os.path.join('submissions', f'ElasticNetCV_{now:%Y%m%d%H%M%S}.csv'))

In [42]:
df_pred = pd.read_csv(os.path.join('data','test.csv'))
df_pred = process_df(df_pred, is_test=True)

c_in_train = [c for c in df_pred.columns.values if c in X_train]
c_nin_pred = [c for c in X_train.columns.values if c not in df_pred]

df_pred = df_pred.loc[:, c_in_train]

for c in c_nin_pred:
    df_pred[c] = 0
    
df_pred = df_pred[X_train.columns]

In [43]:
results = exported_pipeline.predict(df_pred)

In [45]:
df_sub = pd.DataFrame(results, index=df_pred['Id'], columns=['SalePrice'])

In [46]:
df_sub['SalePrice'] = inv_boxcox(df_sub['SalePrice'], lmbda_opts['SalePrice'])

In [47]:
now = datetime.now()
df_sub.to_csv(os.path.join('submissions', f'ElasticNetCV_{now:%Y%m%d%H%M%S}.csv'))