Use the command below to install EvalML:

pip install evalml

# Titanic Dataset

In [2]:
import evalml

In [4]:
import numpy as np 
import pandas as pd

In [5]:
# Read the Titanic passenger records from the local CSV file.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

# Preview the first five rows to sanity-check the columns.
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Target: the "Survived" column; features: every other column.
y = data["Survived"]
X = data.drop(columns="Survived")

In [7]:
# Split features and target into training and hold-out test sets for a
# binary classification problem.
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='binary',
)

In [8]:
# Preview the first rows of the training features returned by split_data.
X_train.head()

Data Column,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Physical Type,Int64,Int64,string,category,float64,Int64,Int64,category,float64,category,category
Logical Type,Integer,Integer,NaturalLanguage,Categorical,Double,Integer,Integer,Categorical,Double,Categorical,Categorical
Semantic Tag(s),['numeric'],['numeric'],[],['category'],['numeric'],['numeric'],['numeric'],['category'],['numeric'],['category'],['category']
502,503,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q
464,465,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S
198,199,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q
765,766,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51.0,1,0,13502,77.9583,D11,S
421,422,3,"Charters, Mr. David",male,21.0,0,0,A/5. 13032,7.7333,,Q


## Running the AutoML to select the best algorithm

In [9]:
# List every problem type EvalML can search over. `evalml` is already
# imported in the first code cell, so it is not re-imported here.
evalml.problem_types.ProblemTypes.all_problem_types

[<ProblemTypes.BINARY: 'binary'>,
 <ProblemTypes.MULTICLASS: 'multiclass'>,
 <ProblemTypes.REGRESSION: 'regression'>,
 <ProblemTypes.TIME_SERIES_REGRESSION: 'time series regression'>,
 <ProblemTypes.TIME_SERIES_BINARY: 'time series binary'>,
 <ProblemTypes.TIME_SERIES_MULTICLASS: 'time series multiclass'>]

In [10]:
from evalml.automl import AutoMLSearch

# Configure a search for a binary classification problem on the training
# split, leaving every other option at its default.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
)

# Train and cross-validate the candidate pipelines.
automl.search()

Using default limit of max_batches=1.

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Binary. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: random_forest, xgboost, extra_trees, linear_model, catboost, decision_tree, lightgbm



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 13.243
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 1.887
Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:07
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.601
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:00:13
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.446
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:00:18
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.666
Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:00:23
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.611
Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:00:28
	Star

In [11]:
# Show the table of evaluated pipelines ranked by the search objective.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,7,Random Forest Classifier w/ Imputer + Text Fea...,0.437574,0.45378,96.695844,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,3,Extra Trees Classifier w/ Imputer + Text Featu...,0.44585,0.457656,96.633355,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,8,Logistic Regression Classifier w/ Imputer + Te...,0.490002,0.492738,96.299958,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,6,XGBoost Classifier w/ Imputer + Text Featuriza...,0.544524,0.503671,95.888259,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,2,LightGBM Classifier w/ Imputer + Text Featuriz...,0.601131,0.585976,95.460816,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,CatBoost Classifier w/ Imputer + Text Featuriz...,0.611254,0.613893,95.384379,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,4,Elastic Net Classifier w/ Imputer + Text Featu...,0.665875,0.665509,94.97193,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,1,Decision Tree Classifier w/ Imputer + Text Fea...,1.887477,2.10923,85.747521,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mode Baseline Binary Classification Pipeline,13.24315,13.206003,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [12]:
# Display the pipeline the search selected as best, with its parameters.
automl.best_pipeline

GeneratedPipeline(parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, 'One Hot Encoder':{'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}, 'Random Forest Classifier':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1},})

In [13]:
# Keep a handle on the best pipeline from the first search; later cells
# score it on the hold-out data and save it to disk.
best_pipeline = automl.best_pipeline

## Let's check the detailed description

In [14]:
# Take the "id" of the first row of the rankings table (the top-ranked
# pipeline) and print that pipeline's detailed description.
automl.describe_pipeline(automl.rankings.iloc[0]["id"])

****************************************************************************************
* Random Forest Classifier w/ Imputer + Text Featurization Component + One Hot Encoder *
****************************************************************************************

Problem Type: binary
Model Family: Random Forest

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. Text Featurization Component
3. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
4. Random Forest Classifier
	 * n_estimators : 100
	 * max_depth : 6
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 5.7 seconds

Cross Validation
----------------
             Log Loss Binary  MCC Binary   AUC  Precision    F1  Balanced Accuracy Binary  Accuracy Binary 

In [15]:

# Evaluate on hold-out data: score the best pipeline from the first search
# on the test split with several classification metrics.
best_pipeline.score(
    X_test,
    y_test,
    objectives=["auc","f1","Precision","Recall"],
)

OrderedDict([('AUC', 0.8213438735177866),
             ('F1', 0.736842105263158),
             ('Precision', 0.765625),
             ('Recall', 0.7101449275362319)])

## We can also optimize for a problem-specific objective

In [16]:

# Second search on the same training split, this time optimizing for AUC
# and additionally tracking F1 and precision for each pipeline.
automl_auc = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='binary',
    objective='auc',
    additional_objectives=['f1', 'precision'],
    max_batches=1,
    optimize_thresholds=True,
)

# Train and cross-validate the candidate pipelines.
automl_auc.search()

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for AUC. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: random_forest, xgboost, extra_trees, linear_model, catboost, decision_tree, lightgbm



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean AUC: 0.500
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean AUC: 0.799
Batch 1: (3/9) LightGBM Classifier w/ Imputer + Text... Elapsed:00:05
	Starting cross validation
	Finished cross validation - mean AUC: 0.833
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + T... Elapsed:00:11
	Starting cross validation
	Finished cross validation - mean AUC: 0.868
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + T... Elapsed:00:17
	Starting cross validation
	Finished cross validation - mean AUC: 0.500
Batch 1: (6/9) CatBoost Classifier w/ Imputer + Text... Elapsed:00:22
	Starting cross validation
	Finished cross validation - mean AUC: 0.862
Batch 1: (7/9) XGBoost Classifier w/ Imputer + Text ... Elapsed:00:27
	Starting cross validation
	Finished cross validation - mean AUC: 0.830
Batch 

In [17]:
# Show the pipelines from the AUC-optimized search, ranked by AUC.
automl_auc.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,3,Extra Trees Classifier w/ Imputer + Text Featu...,0.867829,0.852545,36.782875,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,7,Random Forest Classifier w/ Imputer + Text Fea...,0.866402,0.856694,36.640175,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,5,CatBoost Classifier w/ Imputer + Text Featuriz...,0.861776,0.844173,36.177631,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,8,Logistic Regression Classifier w/ Imputer + Te...,0.860761,0.8549,36.07608,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,2,LightGBM Classifier w/ Imputer + Text Featuriz...,0.83343,0.847948,33.342959,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,6,XGBoost Classifier w/ Imputer + Text Featuriza...,0.829639,0.854489,32.963875,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,1,Decision Tree Classifier w/ Imputer + Text Fea...,0.799286,0.802833,29.928602,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,0,Mode Baseline Binary Classification Pipeline,0.5,0.5,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}
8,4,Elastic Net Classifier w/ Imputer + Text Featu...,0.5,0.5,0.0,False,{'Imputer': {'categorical_impute_strategy': 'm...


In [18]:
# Take the "id" of the first row of the AUC rankings table (the top-ranked
# pipeline) and print that pipeline's detailed description.
automl_auc.describe_pipeline(automl_auc.rankings.iloc[0]["id"])

**************************************************************************************
* Extra Trees Classifier w/ Imputer + Text Featurization Component + One Hot Encoder *
**************************************************************************************

Problem Type: binary
Model Family: Extra Trees

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. Text Featurization Component
3. One Hot Encoder
	 * top_n : 10
	 * features_to_encode : None
	 * categories : None
	 * drop : if_binary
	 * handle_unknown : ignore
	 * handle_missing : error
4. Extra Trees Classifier
	 * n_estimators : 100
	 * max_features : auto
	 * max_depth : 6
	 * min_samples_split : 2
	 * min_weight_fraction_leaf : 0.0
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 5.8 seconds

Cross Validation
----------------
              AUC    F1  Precisi

In [19]:
# Keep a handle on the best pipeline from the AUC-optimized search so it
# can be scored on the hold-out data.
best_pipeline_auc = automl_auc.best_pipeline

In [20]:

# Score the AUC-optimized pipeline on the hold-out test split.
best_pipeline_auc.score(
    X_test,
    y_test,
    objectives=["auc"],
)

OrderedDict([('AUC', 0.8338603425559948)])

In [21]:
# Persist the best pipeline from the first (default-objective) search to
# disk. Note this saves `best_pipeline`, not `best_pipeline_auc`.
best_pipeline.save("model.pkl")

## Loading the Model

In [22]:
# Reload the pipeline that was saved to model.pkl. The file holds a pipeline,
# so use the pipeline loader (PipelineBase.load) rather than the loader on
# the AutoMLSearch object, which is meant for saved AutoMLSearch instances.
# NOTE: the file is a pickle; only load model files from a trusted source.
check_model = evalml.pipelines.PipelineBase.load('model.pkl')

In [23]:
# Predict class probabilities for the hold-out rows with the reloaded
# pipeline and convert the result to a DataFrame for display
# (one column per class label, 0 and 1).
check_model.predict_proba(X_test).to_dataframe()

Unnamed: 0,0,1
0,0.877739,0.122261
1,0.919006,0.080994
2,0.608192,0.391808
3,0.866045,0.133955
4,0.053808,0.946192
...,...,...
174,0.714361,0.285639
175,0.179811,0.820189
176,0.830416,0.169584
177,0.897032,0.102968
