## Importing External Models into Elasticsearch using Eland

In [1]:
import os

import eland as ed
import numpy as np
from elasticsearch import Elasticsearch

In [25]:
# Connection settings for the Elasticsearch cluster.
# Each value is read from an environment variable of the same name so that real
# credentials never have to be written into (and committed with) the notebook.
# When a variable is not set, the original placeholder text is used instead,
# so you can still replace the placeholders by hand if you prefer.
ES_URL = os.environ.get("ES_URL", "insert your own ES URL here")
ES_USERNAME = os.environ.get("ES_USERNAME", "insert your username")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "insert your password")

In [28]:
# create the Elasticsearch client, authenticating with the username/password pair
es_client = Elasticsearch(
    ES_URL,
    http_auth=(ES_USERNAME, ES_PASSWORD),
)

## Training a Decision Tree using Scikit-Learn

In [4]:
# import a variation of the breast cancer dataset we have been using in earlier chapters
from sklearn.datasets import load_breast_cancer

# import the function that trains a DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# import a helper function to generate the test/train split
from sklearn.model_selection import train_test_split

In [5]:
# let's load the dataset: with return_X_y=True the call hands back two values,
# the datapoints (stored in the variable X) and the class labels (stored in the variable y)
X, y = load_breast_cancer(return_X_y=True)

In [6]:
# in contrast with Elasticsearch, where features and labels are stored in the same document,
# here the features and the labels are stored in separate variables

# a sample entry (the first row) of the matrix represented by the variable X

X[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [7]:
# the matching sample entry (the first element) of the label array represented by the variable y

y[0]

0

In [8]:
# Elasticsearch performs the train/test split for us during the training process;
# in scikit-learn, we have to perform this step manually using the train_test_split function.
# The fixed random_state makes the split reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12345,
)

In [9]:
# now, let's create the decision tree classifier (seeded so that training is reproducible) ...
dec_tree = DecisionTreeClassifier(random_state=12345)

# ... and train it on the training split
dec_tree = dec_tree.fit(X_train, y_train)


In [10]:
# the trained model can now predict which class the datapoints in our X_test set belong to;
# for example, here is the prediction for the first test datapoint
# (predict is given a one-row, two-dimensional input, hence the slice)

dec_tree.predict(X_test[:1])

array([1])

In [11]:
# let's check whether the prediction matches the actual class label of that test datapoint

y_test[0]

1

## Importing a model into Elasticsearch

In [12]:
# import the required eland class
from eland.ml import MLModel

### Get the feature names

In [18]:
# load the dataset again, this time without return_X_y, so that we get the full
# dataset object and can read the feature names from it
data = load_breast_cancer()
feature_names = data.feature_names

In [20]:
# display the feature names we just extracted
feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [21]:
# the identifier that is passed as model_id when the model is imported into Elasticsearch
model_id = "external-model_breast-cancer-decision-tree"

In [33]:
# import the trained scikit-learn decision tree into Elasticsearch through eland
es_model = MLModel.import_model(
    es_client,  # the Elasticsearch client created at the top of the notebook
    model_id=model_id,
    model=dec_tree,  # the DecisionTreeClassifier fitted on the training split
    feature_names=list(feature_names),  # the numpy array of names converted to a plain list
    # NOTE(review): 'replace' presumably overwrites an existing model with the same id,
    # which would make this cell safe to re-run — confirm against the eland documentation
    es_if_exists='replace'
)

In [34]:
# ask the imported model for a prediction on the same test datapoint that we passed to dec_tree earlier
es_model.predict(X_test[0])

array([1])