# Python HANA ML API

<div class="alert alert-block alert-info">
<b>Time series forecasting of monthly ozone rate.</b> <br>
</div>

### Create a HANA DataFrame for the actual series

In [13]:
from hana_ml import dataframe as hd

# Open the connection through the HANA secure user store entry 'MLMDA_KEY',
# so no credentials appear in the notebook.
conn = hd.ConnectionContext(userkey='MLMDA_KEY')

# Build a HANA DataFrame over the ozone series, ordered by date.
# Nothing is fetched here; later cells call .collect() to pull the rows.
sql_cmd = 'SELECT * FROM "APL_SAMPLES"."OZONE_RATE_LA" ORDER BY "Date"'
series_data = hd.DataFrame(connection_context=conn, select_statement=sql_cmd)

### Put the series in a pandas DataFrame and show individual values

In [14]:
import pandas as pd
import datetime as dt
import numpy as np

# Bring the series into local memory and make 'Date' a real datetime column.
# actual_df is kept as-is because the outlier cell filters on it later.
actual_df = series_data.collect()
actual_df['Date'] = pd.to_datetime(actual_df['Date'])

# Derive a display frame: numeric target plus Year/Month parts, without 'Date'.
# assign() works on a copy, so actual_df is left untouched.
df = (
    actual_df
    .assign(
        OzoneRateLA=lambda frame: pd.to_numeric(frame['OzoneRateLA']),
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
    )
    .drop(columns='Date')
)

# One row per year, one column per month.
pd.pivot_table(df, index='Month', columns=['Year']).T

Unnamed: 0_level_0,Month,1,2,3,4,5,6,7,8,9,10,11,12
Unnamed: 0_level_1,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
OzoneRateLA,1955,2.63,1.94,3.38,4.92,6.29,5.58,5.5,4.71,6.04,7.13,7.79,3.83
OzoneRateLA,1956,3.83,4.25,5.29,3.75,4.67,5.42,6.04,5.71,8.13,4.88,5.42,5.5
OzoneRateLA,1957,3.0,3.42,4.5,4.25,4.0,5.33,5.79,6.58,7.29,5.04,5.04,4.48
OzoneRateLA,1958,3.33,2.88,2.5,3.83,4.17,4.42,4.25,4.08,4.88,4.54,4.25,4.21
OzoneRateLA,1959,2.75,2.42,4.5,5.21,4.0,7.54,7.38,5.96,5.08,5.46,4.79,2.67
OzoneRateLA,1960,1.71,1.92,3.38,3.98,4.63,4.88,5.17,4.83,5.29,3.71,2.46,2.17
OzoneRateLA,1961,2.15,2.44,2.54,3.25,2.81,4.21,4.13,4.17,3.75,3.83,2.42,2.17
OzoneRateLA,1962,2.33,2.0,2.13,4.46,3.17,3.25,4.08,5.42,4.5,4.88,2.83,2.75
OzoneRateLA,1963,1.63,3.04,2.58,2.92,3.29,3.71,4.88,4.63,4.83,3.42,2.38,2.33
OzoneRateLA,1964,1.5,2.25,2.63,2.96,3.46,4.33,5.42,4.79,4.38,4.54,2.04,1.33


### Forecast with APL

#### Running the forecast

In [15]:
from hana_ml.algorithms.apl.time_series import AutoTimeSeries

# Settings for the APL time series model: which column carries the time axis,
# which column is forecast, and how many steps ahead to predict.
# NOTE(review): the series is monthly, so horizon=12 is presumably one year ahead — confirm.
forecast_settings = {
    'conn_context': conn,
    'time_column_name': 'Date',
    'target': 'OzoneRateLA',
    'horizon': 12,
}
model = AutoTimeSeries(**forecast_settings)

# Train on the HANA series and get the forecast back as a HANA DataFrame
# (collected and plotted in a later cell).
output_data = model.fit_predict(series_data)

##### Show Model Summary

In [16]:
# Fetch the model summary and keep only the handful of properties worth showing.
summary_df = model.get_summary().collect()
summary_keys = [
    'ModelVariableCount', 'ModelRecordCount', 'ModelBuildDate',
    'ModelTimeSeriesFirstDate', 'ModelTimeSeriesLastDate', 'ModelTimeSeriesHorizon',
]
df = summary_df.loc[summary_df['KEY'].isin(summary_keys), ['KEY', 'VALUE']].copy()

# Turn technical keys into readable labels, e.g. 'ModelTimeSeriesFirstDate' -> 'First Date'.
# regex=False: these are literal substrings, independent of the pandas default.
for old_text, new_text in [('Model', ''), ('TimeSeries', ''), ('Count', ' Count'), ('Date', ' Date')]:
    df['KEY'] = df['KEY'].str.replace(old_text, new_text, regex=False)
df.columns = ['Property', 'Value']

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it
# (available from pandas 1.4). Fall back to the old call on older pandas.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Property,Value
Variable Count,2
Record Count,204
Build Date,2020-01-03 16:25:58
First Date,1955-01-28
Last Date,1971-12-28
Horizon,12


##### Plotting the predicted values

In [17]:
# Importing hvplot.pandas makes the .hvplot accessor available on pandas DataFrames.
import hvplot.pandas

# Pull the forecast result into a local pandas DataFrame.
forecast_df = output_data.collect()

# Line chart: actual values against predicted values over time.
actual_vs_predicted = forecast_df.hvplot.line(
    x='Date',
    y=['ACTUAL', 'PREDICTED'],
    value_label='Ozone Rate',
    title='Monthly Ozone Rate in LA',
    fontsize={'title': 10, 'labels': 10},
    legend='bottom',
    height=350,
    width=900,
)

# Shaded band between the LOWER_INT_95PCT and UPPER_INT_95PCT columns.
interval_band = forecast_df.hvplot.area(
    x='Date',
    y='LOWER_INT_95PCT',
    y2='UPPER_INT_95PCT',
    line_color='white',
    color='orange',
    alpha=0.2,
)

# Overlay the band on the line chart; the last expression is what the cell displays.
actual_vs_predicted * interval_band

##### Components found

In [18]:
# The model exposes its components as a mapping; show it as a two-column table.
d = model.get_model_components()
components_df = pd.DataFrame(list(d.items()), columns=["Component", "Value"])

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it
# (available from pandas 1.4). Fall back to the old call on older pandas.
styler = components_df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Component,Value
Trend,Polynom( Date)
Cycles,12
Fluctuations,


##### Performance Indicators

In [19]:
# Performance metrics come back as a mapping of indicator name -> value(s).
d = model.get_performance_metrics()

# Average each indicator across the horizon time window, rounded to 4 decimals.
apm = [(indicator, round(np.mean(values), 4)) for indicator, values in d.items()]

# Put the results in a dataframe and keep only the headline indicators.
accuracy_df = pd.DataFrame(apm, columns=["Indicator", "Value"])
shown_indicators = ['MAPE', 'MeanAbsoluteError', 'SMAPE', 'RootMeanSquareError', 'R2']
df = accuracy_df[accuracy_df['Indicator'].isin(shown_indicators)].copy()

# Shorten the long indicator names (literal replacement, hence regex=False).
df['Indicator'] = (
    df['Indicator']
    .str.replace('MeanAbsoluteError', 'MAE', regex=False)
    .str.replace('RootMeanSquareError', 'RMSE', regex=False)
)

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it
# (available from pandas 1.4). Fall back to the old call on older pandas.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Indicator,Value
MAPE,0.1827
MAE,0.4671
R2,0.5564
RMSE,0.5948
SMAPE,0.1669


##### Descriptive Statistics

In [20]:
# Collect the model indicators and keep the target's statistics,
# leaving out the 'CategoryFrequency' rows.
indicators_df = model.get_indicators().collect()
is_target_statistic = (
    (indicators_df['KEY'] != 'CategoryFrequency')
    & (indicators_df['VARIABLE'] == 'OzoneRateLA')
)
df = indicators_df.loc[is_target_statistic, ['VARIABLE', 'KEY', 'VALUE']].copy()

# Convert VALUE to float so it can be rounded for display.
df['VALUE'] = df['VALUE'].astype(float).round(4)
df.columns = ['Target', 'Statistic', 'Value']

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it
# (available from pandas 1.4). Fall back to the old call on older pandas.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Target,Statistic,Value
OzoneRateLA,Min,1.33
OzoneRateLA,Max,8.13
OzoneRateLA,Mean,3.9839
OzoneRateLA,StandardDeviation,1.4301


##### Outliers

In [21]:
# Outliers are only reported as text in the fit operation log, so find the
# relevant messages and parse the time point out of each one.
log_df = model.get_fit_operation_log().collect()

# Literal match (regex=False); na=False keeps the mask boolean if a MESSAGE is null.
is_outlier_message = log_df['MESSAGE'].str.contains(
    "outlier has been detected at time point", regex=False, na=False
)
df = log_df[is_outlier_message].copy()

# The time point is the text between parentheses in the message.
# Raw string avoids invalid-escape warnings; expand=False returns a Series
# so a single column is assigned from a single column.
df['Outlier'] = df['MESSAGE'].str.extract(r"\((.*)\)", expand=False)

# Convert the extracted text to datetimes explicitly instead of relying on
# isin() to coerce strings; unparseable values become NaT and are dropped.
outlier_dates = pd.to_datetime(df['Outlier'], errors='coerce').dropna()

# Look up the actual observations at those dates and show plain dates.
df = actual_df[actual_df['Date'].isin(outlier_dates)].copy()
df['Date'] = df['Date'].dt.date

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') replaces it
# (available from pandas 1.4). Fall back to the old call on older pandas.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Date,OzoneRateLA
1955-11-28,7.79
1956-09-28,8.13
1967-10-28,5.46
1968-09-28,4.73
1969-08-28,5.25
