In [22]:
#import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy import stats
from scipy.stats import zscore, norm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
bmw = pd.read_csv('regression/bmw.csv')

In [4]:
bmw.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


In [9]:
bmw['split'] = np.random.randn(bmw.shape[0], 1)

msk = np.random.rand(len(bmw)) <= 0.7

train_bmw = bmw[msk]
test_bmw = bmw[~msk]

In [10]:
train_bmw.to_csv('train_bmw.csv')

In [11]:
test_bmw.to_csv('test_bmw.csv')

### AutoML

In [1]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "15.0.2" 2021-01-19; Java(TM) SE Runtime Environment (build 15.0.2+7-27); Java HotSpot(TM) 64-Bit Server VM (build 15.0.2+7-27, mixed mode, sharing)
  Starting server from /Users/siyushen/opt/anaconda3/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/sf/r8yh9s3s27j3b7l1wk0_fb2m0000gn/T/tmp1pfywxdr
  JVM stdout: /var/folders/sf/r8yh9s3s27j3b7l1wk0_fb2m0000gn/T/tmp1pfywxdr/h2o_siyushen_started_from_python.out
  JVM stderr: /var/folders/sf/r8yh9s3s27j3b7l1wk0_fb2m0000gn/T/tmp1pfywxdr/h2o_siyushen_started_from_python.err
  Server is running at http://127.0.0.1:54325
Connecting to H2O server at http://127.0.0.1:54325 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,"7 days, 20 hours and 51 minutes"
H2O_cluster_name:,H2O_from_python_siyushen_3f80us
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [2]:
bmw_train = h2o.import_file("https://raw.githubusercontent.com/rebeccaisnotabug/autoML/master/train_bmw.csv")
bmw_test = h2o.import_file("https://raw.githubusercontent.com/rebeccaisnotabug/autoML/master/test_bmw.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
bmw_train.head()

model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
5 Series,2016,14900,Automatic,35309,Diesel,125,60.1,2.0
5 Series,2017,16000,Automatic,38538,Diesel,125,60.1,2.0
2 Series,2018,16250,Manual,10401,Petrol,145,52.3,1.5
4 Series,2017,14250,Manual,42668,Diesel,30,62.8,2.0
5 Series,2016,14250,Automatic,36099,Diesel,20,68.9,2.0




In [4]:
bmw_test.head()

model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
X3,2017,15500,Manual,74907,Diesel,145,52.3,2.0
3 Series,2017,16000,Automatic,45456,Diesel,30,64.2,2.0
1 Series,2016,14300,Automatic,22461,Diesel,20,67.3,2.0
3 Series,2019,17800,Automatic,22310,Diesel,145,64.2,2.0
3 Series,2016,14400,Automatic,51994,Diesel,30,62.8,2.0
X4,2017,23000,Automatic,34960,Diesel,150,54.3,2.0
2 Series,2017,13000,Automatic,61818,Other,0,141.2,1.5
1 Series,2018,14600,Automatic,6522,Petrol,145,37.2,1.5
4 Series,2016,15500,Automatic,45856,Diesel,30,65.7,2.0
3 Series,2017,18500,Automatic,27139,Diesel,160,51.4,3.0




In [5]:
x = bmw_train.columns
y = "price"
x.remove(y)

In [6]:
aml = H2OAutoML(max_models = 10, seed = 1)
aml.train(x=x, y=y, training_frame = bmw_train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [7]:
lb = aml.leaderboard
lb

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20210527_145242,7313060.0,2704.27,7313060.0,1531.14,0.102971
StackedEnsemble_BestOfFamily_AutoML_20210527_145242,7374060.0,2715.52,7374060.0,1541.43,0.103063
GBM_3_AutoML_20210527_145242,7469550.0,2733.05,7469550.0,1564.92,0.104602
GBM_4_AutoML_20210527_145242,7504310.0,2739.4,7504310.0,1541.65,0.104358
GBM_2_AutoML_20210527_145242,7603990.0,2757.53,7603990.0,1575.02,0.104993
GBM_1_AutoML_20210527_145242,7902530.0,2811.14,7902530.0,1570.38,0.107202
DRF_1_AutoML_20210527_145242,8318870.0,2884.24,8318870.0,1616.69,0.111408
XGBoost_1_AutoML_20210527_145242,8386310.0,2895.91,8386310.0,1680.77,0.116134
GBM_5_AutoML_20210527_145242,8522350.0,2919.31,8522350.0,1648.35,0.111958
XGBoost_3_AutoML_20210527_145242,8544750.0,2923.14,8544750.0,1689.22,0.116488




In [9]:
pred = aml.predict(bmw_test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [11]:
perf = aml.leader.model_performance(bmw_test)
perf


ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 4584367.829032802
RMSE: 2141.11368895554
MAE: 1437.529151887799
RMSLE: 0.09086622789563475
R^2: 0.9657410110848172
Mean Residual Deviance: 4584367.829032802
Null degrees of freedom: 3230
Residual degrees of freedom: 3224
Null deviance: 432367001154.14514
Residual deviance: 14812092455.604984
AIC: 58742.78473112541


