# Elasticsearch Test

In [1]:
from elasticsearch import Elasticsearch
import pandas as pd
import datetime
from statsmodels.tsa.ar_model import AR

Create the Elasticsearch `queues-prediction` index

In [2]:
# Connect to Elasticsearch with the client's default arguments.
es = Elasticsearch()

# Alternative setup (HTTPS with basic auth), kept for reference:
# es = Elasticsearch(
#       ['localhost'],
#       http_auth=(username, 'password'),
#       verify_certs=False,
#       scheme="https",
#       port=443,
# )

# Create the index that will receive the predictions. HTTP 400 is ignored so
# the call does not fail when the index already exists
# (IndexAlreadyExistsException), e.g. when this cell is re-run.
es.indices.create(index="queues-prediction", ignore=400)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'queues-prediction'}

### Load from Queue Index

Match the `products` queue

In [3]:
# Query the "queues" index for documents whose "name" field matches
# "products"; `size` limits how many hits the search returns.
res = es.search(
    index="queues",
    body={"query": {"match": {"name": "products"}}},
    size=1000,
)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [queues]', queues, index_or_alias)

Get _source Data

In [None]:
# Keep only the document body (`_source`) of every search hit.
d = [hit["_source"] for hit in res["hits"]["hits"]]

In [None]:
# Strip the "items" and "querytime" fields from every document, in place.
for record in d:
    for unused_field in ("items", "querytime"):
        del record[unused_field]

Build Dataframe

In [None]:
# Build a table from the list of document dicts: one row per document.
df = pd.DataFrame(data=d)

In [None]:
# Use the "timestamp" column as the row index; the column itself is kept.
df.index = pd.Index(df["timestamp"])

In [None]:
# Parse the timestamp strings in the index into datetimes (offset-aware via %z).
df.index = pd.to_datetime(df.index, format='%Y-%m-%dT%H:%M:%S.%f%z')
# Sort the rows together with their timestamps. Sorting only the index values
# (Index.sort_values) and assigning them back would attach chronologically
# ordered timestamps to rows that are still in their original, unsorted order.
df = df.sort_index()

In [None]:
# Discard the "timestamp", "name" and "tier" columns from the frame.
df = df.drop(columns=["timestamp", "name", "tier"])

Resample Data to 1-Minute Intervals

In [None]:
# Average the samples that fall into each 1-minute bin. The 'min' alias is
# used because the equivalent 'T' alias is deprecated in recent pandas
# releases; both denote minutely frequency.
df = df.resample('1min').mean()

In [None]:
# Show the current frame as the cell output.
df

In [None]:
# Replace missing values (NaN) with 0.
# NOTE(review): this treats a minute without samples as an empty queue —
# confirm that 0 is the intended fill value.
df = df.fillna(value=0)

In [None]:
# Preview the first rows of the frame as the cell output.
df.head()

## ML Model

#### Train/Test Split

In [None]:
# Number of future steps to forecast, expressed in minutes.
pred_horizon = 20  # minutes

In [None]:
# Series to model: the "size" column. Bracket access is required because
# `df.size` is a DataFrame attribute (element count), not this column.
data = df['size']

#### AR

In [None]:
# Fit an autoregressive model to the series: up to 10 lags with the order
# chosen by BIC, trend='nc', method='cmle'.
# NOTE(review): newer statsmodels releases deprecate/remove `AR` in favour of
# `AutoReg` — confirm against the installed version before upgrading.
model_ar = AR(data)
model_ar_fit = model_ar.fit(
    maxlag=10,
    ic="bic",
    trend="nc",
    method="cmle",
    maxiter=20,
)

In [None]:
# Forecast the `pred_horizon` positions that directly follow the observed
# series, i.e. positions len(data) .. len(data) + pred_horizon - 1.
pred_test = model_ar_fit.predict(
    start=len(data),
    end=len(data) + pred_horizon - 1,
    dynamic=True,
)

Build Prediction Dataframe

In [None]:
# One timestamp per forecast step, continuing at 1-minute spacing after the
# last observation. The number of periods follows `pred_horizon` so that it
# always matches the length of the forecast; 'min' replaces the deprecated
# 'T' frequency alias.
time_stamps = pd.date_range(
    start=data.index[-1] + datetime.timedelta(minutes=1),
    periods=pred_horizon,
    freq='min',
)

In [None]:
# Combine the forecast timestamps and forecast values into one frame.
d = dict(timestamp=time_stamps, size=pred_test)
pred_df = pd.DataFrame(d)

In [None]:
# Render every forecast timestamp as a string in the same layout that was
# used to parse the source timestamps.
pred_df["timestamp"] = pred_df["timestamp"].map(
    lambda ts: ts.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
)

In [None]:
# Show the prediction frame as the cell output.
pred_df

In [None]:
# Index every forecast row as its own document in 'queues-prediction' and
# print a progress line after every fifth upload.
count = 0
for count, (_, row) in enumerate(pred_df.iterrows(), start=1):
    # 'querytime' and 'items' are left out of the uploaded document.
    doc_data = dict(
        timestamp=row['timestamp'],
        tier='pic',
        name='products',
        size=row['size'],
    )
    es.index('queues-prediction', body=doc_data)
    if not count % 5:
        print(f"{count} Elemente hochgeladen")