# Examples of DataFrame usage

In [1]:
from hana_ml import dataframe
import numpy as np
import logging

## Setup connection and data sets
Let us load some data into HANA tables.  The data is loaded into 4 tables - the full set, the test set, the training set, and the validation set:
<ul>
<li>DBM2_RFULL_TBL</li>
<li>DBM2_RTEST_TBL</li>
<li>DBM2_RTRAINING_TBL</li>
<li>DBM2_RVALIDATION_TBL</li>
</ul>

To do that, a connection is created and passed to the loader.

There is a config file, <b>config/e2edata.ini</b>, that controls the connection parameters and whether or not to reload the data from scratch.  If the data is already loaded, there is no need to load it again.  A sample section is below.  If the config parameter <b>reload_data</b> is true, then the tables for test, training, and validation are (re-)created and data is inserted into them.

#########################<br>
[hana]<br>
url=host.sjc.sap.corp<br>
user=username<br>
passwd=userpassword<br>
port=3xx15<br>
<br>

#########################<br>

In [2]:
# Project-local helpers: Settings reads the connection config, DataSets loads the demo tables.
from data_load_utils import DataSets, Settings
# Connection parameters come from the ini file rather than being hardcoded in the notebook.
url, port, user, pwd = Settings.load_config("../../config/e2edata.ini")
# This ConnectionContext is the handle the later cells use to build DataFrames.
connection_context = dataframe.ConnectionContext(url, port, user, pwd)
# The loader hands back four table identifiers (training_tbl is used as a table name below);
# presumably it only (re)loads data when the config asks for it — confirm in data_load_utils.
full_tbl, training_tbl, validation_tbl, test_tbl = DataSets.load_bank_data(connection_context)

Table DBM2_RFULL_TBL exists and data exists


### Simple DataFrame

In [3]:
# Build a DataFrame on top of the training table. Any SELECT statement can
# serve as the source as well, for example:
#   connection_context.sql('SELECT * FROM "{0}"'.format(training_tbl))
dataset1 = connection_context.table(training_tbl)

# Show the SQL text that backs this DataFrame.
training_sql = dataset1.select_statement
print(training_sql)

SELECT * FROM "DBM2_RTRAINING_TBL"


In [4]:
# Printing the DataFrame shows only the Python object repr, not table rows;
# rows are brought to the client with collect() in later cells.
print(dataset1)

<hana_ml.dataframe.DataFrame object at 0x000001F789BCE670>


### Drop duplicates

In [5]:
# drop_duplicates() yields a new DataFrame whose SQL wraps dataset1's query
# in a SELECT DISTINCT, as the printed statement shows.
dataset2 = dataset1.drop_duplicates()
distinct_sql = dataset2.select_statement
print(distinct_sql)

SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_3"


In [6]:
# List the column names of the de-duplicated DataFrame.
print(dataset2.columns)

['ID', 'AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DBM_DEFAULT', 'HOUSING', 'LOAN', 'CONTACT', 'DBM_MONTH', 'DAY_OF_WEEK', 'DURATION', 'CAMPAIGN', 'PDAYS', 'PREVIOUS', 'POUTCOME', 'EMP_VAR_RATE', 'CONS_PRICE_IDX', 'CONS_CONF_IDX', 'EURIBOR3M', 'NREMPLOYED', 'LABEL']


### Drop a column

In [7]:
# Remove the target column; the generated SQL enumerates every remaining column.
columns_to_drop = ["LABEL"]
dataset3 = dataset2.drop(columns_to_drop)
projection_sql = dataset3.select_statement
print(projection_sql)

SELECT "ID", "AGE", "JOB", "MARITAL", "EDUCATION", "DBM_DEFAULT", "HOUSING", "LOAN", "CONTACT", "DBM_MONTH", "DAY_OF_WEEK", "DURATION", "CAMPAIGN", "PDAYS", "PREVIOUS", "POUTCOME", "EMP_VAR_RATE", "CONS_PRICE_IDX", "CONS_CONF_IDX", "EURIBOR3M", "NREMPLOYED" FROM (SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_3") AS "DT_5"


### Take null values and substitute with a specific value

In [8]:
# Substitute NULL values in AGE with a fixed replacement; in the generated SQL
# this appears as COALESCE("AGE", 25).
age_fill_value = 25
dataset4 = dataset2.fillna(age_fill_value, ["AGE"])
fillna_sql = dataset4.select_statement
print(fillna_sql)

SELECT "ID", COALESCE("AGE", 25) AS "AGE", "JOB", "MARITAL", "EDUCATION", "DBM_DEFAULT", "HOUSING", "LOAN", "CONTACT", "DBM_MONTH", "DAY_OF_WEEK", "DURATION", "CAMPAIGN", "PDAYS", "PREVIOUS", "POUTCOME", "EMP_VAR_RATE", "CONS_PRICE_IDX", "CONS_CONF_IDX", "EURIBOR3M", "NREMPLOYED", "LABEL" FROM (SELECT DISTINCT * FROM (SELECT * FROM "DBM2_RTRAINING_TBL") AS "DT_3") dt


### Fetch 5 rows into client

In [9]:
# Restrict the query to five rows, then pull just those rows to the client.
first_five = dataset4.head(5)
print(first_five.collect())

      ID  AGE         JOB   MARITAL          EDUCATION DBM_DEFAULT HOUSING  \
0   2647   27  technician    single        high.school     unknown      no   
1  31297   34  technician    single  university.degree          no     yes   
2  17777   36      admin.   married  university.degree     unknown      no   
3  29285   55  management   married  university.degree          no     yes   
4  38811   80     retired  divorced            unknown          no     yes   

  LOAN    CONTACT DBM_MONTH  ... CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0  yes  telephone       may  ...        2    999         0  nonexistent   
1   no  telephone       may  ...        3    999         0  nonexistent   
2   no   cellular       jul  ...        1    999         0  nonexistent   
3   no   cellular       apr  ...        2    999         0  nonexistent   
4  yes   cellular       nov  ...        2      3         1      success   

   EMP_VAR_RATE CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREMPLOYED  LABEL  
0 

### Fetch columns in a DataFrame

In [10]:
# The column list after fillna() — LABEL is still present because dataset4
# was derived from dataset2, not from dataset3.
print(dataset4.columns)

['ID', 'AGE', 'JOB', 'MARITAL', 'EDUCATION', 'DBM_DEFAULT', 'HOUSING', 'LOAN', 'CONTACT', 'DBM_MONTH', 'DAY_OF_WEEK', 'DURATION', 'CAMPAIGN', 'PDAYS', 'PREVIOUS', 'POUTCOME', 'EMP_VAR_RATE', 'CONS_PRICE_IDX', 'CONS_CONF_IDX', 'EURIBOR3M', 'NREMPLOYED', 'LABEL']


In [11]:
# Fetch the first ten rows into a client-side result and print it.
first_ten_rows = dataset4.head(10).collect()
print(first_ten_rows)

      ID  AGE          JOB   MARITAL            EDUCATION DBM_DEFAULT HOUSING  \
0   2647   27   technician    single          high.school     unknown      no   
1  31297   34   technician    single    university.degree          no     yes   
2  17777   36       admin.   married    university.degree     unknown      no   
3  29285   55   management   married    university.degree          no     yes   
4  38811   80      retired  divorced              unknown          no     yes   
5   7814   43   unemployed   married    university.degree     unknown      no   
6  31733   33       admin.    single    university.degree          no     yes   
7  37085   51   management  divorced    university.degree          no      no   
8  18363   46   technician  divorced  professional.course          no     yes   
9  21398   44  blue-collar   married             basic.6y          no      no   

  LOAN    CONTACT DBM_MONTH  ... CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0  yes  telephone       may  ... 

In [12]:
# Apply the row filter first, then fetch only ten of the matching rows.
over_sixty = dataset4.filter('AGE > 60')
print(over_sixty.head(10).collect())

      ID  AGE        JOB   MARITAL            EDUCATION DBM_DEFAULT HOUSING  \
0  29830   69    retired  divorced             basic.4y          no      no   
1  36021   61    unknown    single             basic.4y          no     yes   
2  30030   64    retired   married    university.degree          no     yes   
3  28514   61    retired   married    university.degree          no     yes   
4  28726   69    retired   married              unknown          no      no   
5  30134   79    retired   married             basic.9y          no     yes   
6  30391   71    retired  divorced             basic.4y          no     yes   
7  35862   66  housemaid   married          high.school          no     yes   
8  30242   81    retired   married  professional.course          no      no   
9  35962   61    retired   married             basic.9y          no      no   

  LOAN    CONTACT DBM_MONTH  ... CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0   no   cellular       apr  ...        1    999       

In [13]:
# Same filter as before, but keep the collected result in a variable
# so its type can be inspected in the next cell.
over_sixty_top10 = dataset4.filter('AGE>60').head(10)
pd1 = over_sixty_top10.collect()

In [14]:
# Confirm what collect() returned: the recorded output is a pandas DataFrame.
print(type(pd1))

<class 'pandas.core.frame.DataFrame'>


In [15]:
# filter() followed by sort() produces another hana_ml DataFrame; collect() is
# not called here, so the cell output is just the object repr.
dataset4.filter('AGE>60').sort(['AGE'])

<hana_ml.dataframe.DataFrame at 0x1f789c2a430>

In [16]:
# Order the filtered rows by AGE and fetch only the first one.
over_sixty_by_age = dataset4.filter('AGE>60').sort(['AGE'])
print(over_sixty_by_age.head(1).collect())

      ID  AGE        JOB  MARITAL EDUCATION DBM_DEFAULT HOUSING LOAN  \
0  41000   61  housemaid  married  basic.4y          no     yes   no   

     CONTACT DBM_MONTH  ... CAMPAIGN  PDAYS  PREVIOUS  POUTCOME  EMP_VAR_RATE  \
0  telephone       oct  ...        2    999         2   failure          -1.1   

  CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREMPLOYED  LABEL  
0         94.601          -49.5      1.016        4963     no  

[1 rows x 22 columns]


In [17]:
# Build the join predicate from each DataFrame's quoted name so that the two
# "ID" columns are qualified unambiguously.
left_name = dataset4.quoted_name
right_name = dataset2.quoted_name
condition = '{}."ID"={}."ID"'.format(left_name, right_name)

# Join dataset4 with dataset2 on matching ID values.
dataset5 = dataset4.join(dataset2, condition)

In [18]:
# Fetch one joined row; it carries the columns of both inputs
# (22 + 22 = 44 columns in the recorded output).
joined_sample = dataset5.head(1)
print(joined_sample.collect())

     ID  AGE     JOB  MARITAL    EDUCATION DBM_DEFAULT HOUSING LOAN  \
0  8746   54  admin.  married  high.school          no     yes   no   

     CONTACT DBM_MONTH  ... CAMPAIGN  PDAYS  PREVIOUS     POUTCOME  \
0  telephone       jun  ...        1    999         0  nonexistent   

   EMP_VAR_RATE CONS_PRICE_IDX  CONS_CONF_IDX  EURIBOR3M  NREMPLOYED  LABEL  
0           1.4         94.465          -41.8      4.866        5228     no  

[1 rows x 44 columns]


In [19]:
# Project the DataFrame down to three named columns.
projected_columns = ["ID", "AGE", "JOB"]
dataset6 = dataset4.select(*projected_columns)

In [20]:
# head() is called without a row count; the recorded output contains one row.
projected_preview = dataset6.head()
print(projected_preview.collect())

     ID  AGE     JOB
0  8746   54  admin.


In [21]:
# A (SQL expression, alias) tuple adds a computed column next to the plain
# ones; here it appears in the result as TWICE_AGE.
twice_age_column = ('"AGE"*2', "TWICE_AGE")
dataset7 = dataset4.select("ID", "AGE", "JOB", twice_age_column)

In [22]:
# Preview the computed column: TWICE_AGE is double the AGE value in the same row.
computed_preview = dataset7.head()
print(computed_preview.collect())

     ID  AGE     JOB  TWICE_AGE
0  8746   54  admin.        108


In [23]:
# Persist dataset7's query result under the name "#MYTEST"; save() hands back a
# DataFrame, which is what the cell output shows. The leading "#" presumably
# makes this a session-local temporary table — confirm against the hana_ml docs.
dataset7.save("#MYTEST")

<hana_ml.dataframe.DataFrame at 0x1f789c2af70>

In [24]:
# Look up the table written by the save() cell by name and wrap it in a new DataFrame.
saved_table_name = "#MYTEST"
dataset8 = connection_context.table(saved_table_name)

In [25]:
# NOTE(review): the recorded run of this cell fails with ProgrammingError 259
# ("Could not find table/view #MYTEST"), so the notebook does not survive a
# clean Run All. Presumably the "#"-prefixed table is session-local and was not
# visible to the session that executed this query — TODO confirm. One option to
# try is reusing the DataFrame returned by dataset7.save("#MYTEST") instead of
# looking the table up again by name.
print(dataset8.head().collect())

ProgrammingError: (259, 'invalid table name:  Could not find table/view #MYTEST in schema PAL_TEST: line 1 col 36 (at pos 35)')