### Replace

In [1]:
from hana_ml import dataframe
from hana_ml.algorithms.pal.utility import DataSets, Settings
# Read host, port and credentials from a local ini file so no secrets are hard-coded in the notebook.
url, port, user, pwd = Settings.load_config("../../config/e2edata.ini")

# Single connection object that every cell below passes to hana_ml.
connection_context = dataframe.ConnectionContext(url, port, user, pwd)

In [2]:
import pandas as pd
# Mixed numeric/string frame used by the value-based replace examples.
pf = pd.DataFrame({'Aa': [0, 10, 2, 3, 4],
                   'Bb': [5, 0, 7, 8, 9],
                   'Cb': ['a', 'b', 'c', 'd', 'e']})
# String-only frame used by the regex-based replace examples.
pf2 = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                    'B': ['abc', 'bar', 'xyz']})
# Upload both frames; the "#" table names are presumably local temporary tables and
# force=True presumably recreates them on re-run -- confirm against the hana_ml docs.
df = dataframe.create_dataframe_from_pandas(connection_context, pandas_df=pf, table_name="#replacedev", force=True)
df2 = dataframe.create_dataframe_from_pandas(connection_context, pandas_df=pf2, table_name="#replacedev2", force=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.44it/s]


In [None]:
# Display the uploaded table before any replacement, as a baseline for the examples below.
df.collect()


In [None]:
# Scalar form: look for the value 0 and substitute 5.
df.replace(to_replace=0, value=5).collect()


In [None]:
# Mapping form (pandas-style, presumably): each key is a value to find, each dict value its replacement (0 -> 10, 2 -> 100).
df.replace(to_replace={0: 10, 2: 100}).collect()


In [None]:
# Per-column targets with one shared replacement: 0 in 'Aa' and 5 in 'Bb' are both given the value 100
# (assuming pandas-style semantics where dict keys name columns -- confirm).
df.replace(to_replace={'Aa': 0, 'Bb': 5}, value=100).collect()


In [None]:
# Per-column targets AND per-column replacements: 'Aa' 0 -> 100, 'Bb' 5 -> 50
# (assuming pandas-style semantics where dict keys name columns -- confirm).
df.replace(to_replace={'Aa': 0, 'Bb': 5}, value={'Aa': 100, 'Bb': 50}).collect()


In [None]:
# Nested mapping: the outer key selects column 'Aa', the inner dict maps old -> new values (0 -> 100, 4 -> 400).
df.replace(to_replace={'Aa': {0: 100, 4: 400}}).collect()


In [None]:
# Display the string table before the regex-based replacements below.
df2.collect()

In [None]:
# Regex form: '^ba.$' matches exactly three characters starting with "ba" (e.g. 'bat', 'bar'); matches become 'new'.
df2.replace(to_replace=r'^ba.$', value='new', regex=True).collect()


In [None]:
# Same pattern, but keyed by column name so that only column 'A' is targeted (presumably pandas-style -- confirm).
df2.replace(to_replace={'A': r'^ba.$'}, value={'A': 'new'}, regex=True).collect()


In [None]:
# Alternative spelling: the pattern is supplied through the regex= keyword instead of to_replace=.
df2.replace(regex=r'^ba.$', value='new').collect()


In [None]:
# Mapping of pattern -> replacement passed via regex=: '^ba.$' -> 'new' and 'foo' -> 'xyz'.
df2.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}).collect()


### MLflow integration for AutoML

In [None]:
# Load the Boston housing sample data; the loader's four return values are unpacked as full/training/validation/test sets.
full_set, training_set, validation_set, test_set = DataSets.load_boston_housing_data(connection_context)

In [None]:
# Predictor columns and regression target for the Boston housing model.
features = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'BLACK', 'LSTAT']
label = 'MEDV'

# Cast to correct types so PAL can consume it, then move the key column "ID" to the front.
dfts = (
    training_set
    .cast(['CRIM', "ZN", "INDUS", "NOX", "RM", "AGE", "DIS", "PTRATIO", "BLACK", "LSTAT", "MEDV"], "DOUBLE")
    .cast(["CHAS", "RAD", "TAX"], "INTEGER")
    .to_head("ID")
)
dfts.head(5).collect()

In [None]:
# Wrapping fit() in an explicit MLflow run is useful when several runs are compared
# (only one run is performed in this sample notebook).
import mlflow
from hana_ml.algorithms.pal.auto_ml import AutomaticRegression
model = AutomaticRegression()
model.disable_workload_class_check()
# Bound up front so the name exists even if the run below fails before assigning it.
runid = None
with mlflow.start_run() as run:
    # Autologging is switched on inside the active run, before fitting.
    model.enable_mlflow_autologging()
    model.fit(dfts, key="ID", features=features, label=label)
    # Keep the run id; the model-loading cell builds its 'runs:/<id>/model' URI from it.
    runid = run.info.run_id


In [None]:
from hana_ml.model_storage import ModelStorage
# Reload the model logged by the MLflow run above, addressing it by run id.
mymodel = ModelStorage.load_mlflow_model(
    connection_context=connection_context,
    model_uri=f'runs:/{runid}/model',
)
mymodel

### Pipeline module enhancement with PAL_PIPELINE_FIT and PAL_PIPELINE_PREDICT.

In [None]:
from hana_ml.algorithms.pal.pipeline import Pipeline
from hana_ml.algorithms.pal.decomposition import PCA
from hana_ml.algorithms.pal.preprocessing import Imputer
from hana_ml.algorithms.pal.trees import HybridGradientBoostingClassifier
from hana_ml.algorithms.pal.utility import DataSets, Settings
# Two-step pipeline: PCA (with scaling and scores enabled) feeding a
# hybrid gradient boosting classifier. Each step is a (name, estimator) tuple.
my_pipeline = Pipeline([
                    ('PCA', PCA(scaling=True, scores=True)),
                    ('HGBT_Classifier', HybridGradientBoostingClassifier(
                                            n_estimators=4, split_threshold=0,
                                            learning_rate=0.5, fold_num=5,
                                            max_depth=6))])


In [None]:
# Load the diabetes sample data. The third return value is bound as the test split and the fourth is discarded.
# NOTE(review): the Boston loader above is unpacked as (full, training, validation, test) -- confirm that
# load_diabetes_data really returns the test split in third position.
diabetes_full, diabetes_train, diabetes_test, _ = DataSets.load_diabetes_data(connection_context)

In [None]:
# Fit the whole pipeline on the training split, with "ID" as key and "CLASS" as label.
my_pipeline.fit(diabetes_train, key="ID", label="CLASS")


In [None]:
# Print the last statement sent over this connection -- presumably the PAL_PIPELINE_FIT call issued by fit() above.
print(connection_context.last_execute_statement)

In [None]:
# Predict on the test split with the label column "CLASS" removed.
result = my_pipeline.predict(diabetes_test.deselect("CLASS"), key="ID")

In [None]:
# Display the pipeline predictions.
result.collect()

### Added AutoML Time Series and make_future_dataframe.

In [None]:
# Load the shampoo sample series; it is reused by the AutoML and outlier-detection sections below.
shampoo_data = DataSets.load_shampoo_data(connection_context)

In [None]:
# Peek at the last two rows of the series.
shampoo_data.tail(2).collect()

In [None]:
from hana_ml.algorithms.pal.auto_ml import AutomaticTimeSeries
from hana_ml.visualizers.automl_progress import PipelineProgressStatusMonitor

# NOTE: this rebinds `model`, which previously held the AutomaticRegression instance.
# early_stop=2 presumably ends the search after 2 rounds without improvement -- confirm against the API docs.
model = AutomaticTimeSeries(early_stop=2)
progress_status_monitor = PipelineProgressStatusMonitor(connection_context, automatic_obj=model)

# The monitor is started before fit() so that it is already running while the search executes.
progress_status_monitor.start()

model.disable_workload_class_check()
model.fit(shampoo_data, key="ID")


In [None]:
# Display the best pipeline found by the AutoML search.
model.best_pipeline_.collect()

In [None]:
# Ask the fitted model for a frame covering 10 future periods to forecast on.
predict_frame = model.make_future_dataframe(periods=10)

In [None]:
# Forecast on the generated future frame and display the result (rebinds `result`).
result = model.predict(predict_frame, key="ID")
result.collect()

### Outlier Detection for Time Series

In [None]:
from hana_ml.algorithms.pal.tsa.outlier_detection import OutlierDetectionTS

# Outlier detector for time series, configured with threshold=2.
op = OutlierDetectionTS(threshold=2)

# Fit and score the shampoo series in one call (rebinds `result`).
result = op.fit_predict(shampoo_data, key="ID")



In [None]:
# Display the outlier detection output.
result.collect()

In [None]:
from hana_ml.visualizers.eda import plot_time_series_outlier
# Plot the series with its outliers, using the same threshold (2) as the OutlierDetectionTS example above.
plot_time_series_outlier(shampoo_data, key="ID", threshold=2)


### Missing value handling for time series

In [None]:
# Read the employment-rate CSV with pandas and upload it for the imputation example.
emp_data = dataframe.create_dataframe_from_pandas(
    connection_context,
    pandas_df=pd.read_csv("../datasets/usa_edu_employment_rate.csv"),
    table_name="#usa_edu_emp_rate",
    force=True,
)

In [None]:
# Show the first 10 rows of the uploaded employment data.
emp_data.head(10).collect()

In [None]:
from hana_ml.algorithms.pal.preprocessing import ImputeTS

# Overall strategy 'most_frequent-mean' (presumably most-frequent for categorical, mean for numeric columns -- confirm).
impute_ts = ImputeTS(thread_ratio=-1, imputation_type='most_frequent-mean')
# Per-column settings for two columns: 'mean' for PSC__TRE and 'linterp' for MAT__WLA; "YEAR" is the key.
result = impute_ts.fit_transform(emp_data, key="YEAR", col_imputation_type={"PSC__TRE": 'mean',
                                                                            "MAT__WLA": 'linterp'})

In [None]:
# Display the imputed series.
result.collect()

### Force plot

In [None]:
from hana_ml.algorithms.pal.model_selection import GridSearchCV
from hana_ml.algorithms.pal.unified_classification import UnifiedClassification
# Unified classifier configured with the 'HybridGradientBoostingTree' algorithm.
uc_hgbdt = UnifiedClassification('HybridGradientBoostingTree')

# Grid search over 4 x 4 x 4 = 64 parameter combinations, evaluated with
# 5-fold cross-validation; candidates are scored by error rate.
gscv = GridSearchCV(estimator=uc_hgbdt,
                    param_grid={'learning_rate': [0.1, 0.4, 0.7, 1],
                                'n_estimators': [4, 6, 8, 10],
                                'split_threshold': [0.1, 0.4, 0.7, 1]},
                    train_control=dict(fold_num=5,
                                       resampling_method='cv',
                                       random_state=1,
                                       ref_metric=['auc']),
                    scoring='error_rate')
# build_report=True is requested here; the model-storage section later saves this estimator with its report.
gscv.fit(data=diabetes_train, key='ID',
         label='CLASS',
         partition_method='stratified',
         partition_random_state=1,
         stratified_column='CLASS',
         build_report=True)

# Feature columns are everything except the label and the key. A new list is
# built instead of calling .remove() on the object returned by `.columns`, so
# the DataFrame's own column list can never be mutated by this cell and the
# cell stays safe to re-run.
# NOTE: this rebinds `features`, which earlier held the Boston housing columns.
features = [col for col in diabetes_train.columns if col not in ('CLASS', 'ID')]

pred_res = gscv.predict(diabetes_test, key='ID', features=features)

In [None]:
from hana_ml.visualizers.shap import ShapleyExplainer
# Pair the test-set feature values with the REASON_CODE column of the predictions, then render a force plot.
shapley_explainer = ShapleyExplainer(feature_data=diabetes_test.select(features), reason_code_data=pred_res.select('REASON_CODE'))
shapley_explainer.force_plot()

### Store model report in model storage

In [None]:
from hana_ml.model_storage import ModelStorage

ms = ModelStorage(connection_context)
# The estimator's name is set before saving; the display call in the next cell looks the model up by this same string.
gscv.estimator.name = "HGBT with Report"
# save_report=True asks model storage to keep the model report along with the model.
ms.save_model(gscv.estimator, save_report=True)

In [None]:
# Show the stored report for the model saved above (version 1 is assumed to be that save -- confirm if the name already existed).
ms.display_model_report("HGBT with Report", version=1)

### Time Series Report

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.linalg import cholesky
import numpy as np
from numpy.random import rand

# --- Synthetic daily series used by the time-series report demo ---
num_samples = 600  # number of daily observations to generate
S1 = 12            # period (in samples) of the short seasonal component
S2 = 100           # period (in samples) of the long seasonal component

# Seed NumPy's global RNG so the draws below are reproducible.
# The order of the random draws is significant: do not reorder them.
np.random.seed(seed=2334)

# Four independent standard-normal series; they are correlated further down.
x1 = norm.rvs(loc=0, scale=1, size=(1, num_samples))[0]
x2 = norm.rvs(loc=0, scale=1, size=(1, num_samples))[0]
x3 = norm.rvs(loc=0, scale=1, size=(1, num_samples))[0]
x4 = norm.rvs(loc=0, scale=1, size=(1, num_samples))[0]

# Desired standard deviations of X1..X4, on the diagonal.
std_m = np.array([
    [6.8, 0, 0, 0],
    [0, 1.4, 0, 0],
    [0, 0, 1.4, 0],
    [0, 0, 0, 2.9]
])

# specify desired correlation
corr_m = np.array([
    [1, .35, 0.33, 0.78],
    [.35, 1, 0.90, 0.28],
    [.33, 0.90, 1, 0.27],
    [.78, 0.28, 0.27, 1]
])

# calc desired covariance (vc matrix): cov = std * corr * std
cov_m = np.dot(std_m, np.dot(corr_m, std_m))
# Multiplying independent normals by the lower Cholesky factor yields data with covariance cov_m.
L = cholesky(cov_m, lower=True)
corr_data = np.dot(L, [x1, x2, x3, x4]).T  # one row per sample, columns X1..X4

# Coefficients applied to X1..X4 for the exogenous part of Y.
beta = np.array([-3.49, 13, 13, 0.0056])
omega1 = 2*np.pi/S1
omega2 = 2*np.pi/S2
timestamp = np.arange(num_samples)

# Seasonal part: randomly scaled cosines/sine at the S1 period, its third harmonic, and the S2 period.
y1 = np.multiply(50*rand(num_samples), 20*rand(1)*np.cos(omega1*timestamp)) \
+ np.multiply(32*rand(num_samples), 30*rand(1)*np.cos(3*omega1*timestamp)) \
+ np.multiply(rand(num_samples), rand(1)*np.sin(omega2*timestamp))

# Trend part: the time index scaled by uniform noise.
y2 = np.multiply(rand(num_samples), timestamp)
# Exogenous part: linear combination of the correlated regressors (beta is 1-D, so no transpose is needed).
y3 = corr_data.dot(beta)
y = y1 + y2 + y3

# The date index length is derived from num_samples (previously a hard-coded end
# date that only happened to span 600 days), so all columns always have equal length.
demo_data = dataframe.create_dataframe_from_pandas(
    connection_context,
    pandas_df=pd.DataFrame({
        'ID': pd.date_range('2018-01-01', periods=num_samples, freq='D'),
        'Y': y,
        'X1': corr_data[:, 0],
        'X2': corr_data[:, 1],
        'X3': corr_data[:, 2],
        'X4': corr_data[:, 3],
    }),
    table_name='#PAL_TIMESERIES_REPORT',
    force=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.17it/s]


In [4]:
from hana_ml.visualizers.time_series_report import TimeSeriesReport, DatasetAnalysis
from hana_ml.visualizers.report_builder import Page

In [6]:
# Report container, identified by its title.
report = TimeSeriesReport('Time Series Report 001')
# Analysis helper bound to the synthetic series: "Y" is the endogenous column, "ID" the key.
dataset_analysis = DatasetAnalysis(data=demo_data, endog="Y", key="ID")

In [7]:
# Build the first three report pages, one analysis topic per page.
page0 = Page('Stationarity')
page0.addItem(dataset_analysis.stationarity_item())

page1 = Page('Partial Autocorrelation')
page1.addItem(dataset_analysis.pacf_item())

page2 = Page('Rolling Mean and Standard Deviation')
page2.addItems([dataset_analysis.moving_average_item(-3), dataset_analysis.rolling_stddev_item(10)])

# Collect the pages in display order and register them with the report.
pages = [page0, page1, page2]
report.addPages(pages)

In [8]:
# Build the report from the pages added so far and embed it in the notebook as an iframe.
report.build()
report.generate_notebook_iframe()

In [9]:
# Build four more pages for the same report.
page3 = Page('Seasonal')
page3.addItem(dataset_analysis.seasonal_item())
page3.addItems(dataset_analysis.seasonal_decompose_items())

page4 = Page('Box')
# One box-plot item per calendar cycle.
for cycle in ['YEAR', 'MONTH', 'QUARTER']:
    page4.addItem(dataset_analysis.timeseries_box_item(cycle))

page5 = Page('Quarter')
page5.addItem(dataset_analysis.quarter_item())

page6 = Page('Outlier')
page6.addItem(dataset_analysis.outlier_item())

# Register ONLY the new pages. `pages` already holds page0..page2, which were
# added to the report earlier; passing the cumulative list again would add those
# three pages a second time (assuming addPages appends -- confirm).
new_pages = [page3, page4, page5, page6]
pages.extend(new_pages)  # keep `pages` as the cumulative list, as before
report.addPages(new_pages)

In [10]:
# Rebuild the report so that it includes the additional pages, then embed it again.
report.build()
report.generate_notebook_iframe()