In [1]:
# Show the installed hana_ml version.
# The inner literal uses single quotes: reusing the enclosing double quotes inside an
# f-string is only accepted from Python 3.12 on and is a SyntaxError in earlier versions.
print(f"SAP HANA Client for Python: {__import__('hana_ml').__version__}")

SAP HANA Client for Python: 2.23.25030800


The latest version and its documentation can be checked at https://pypi.org/project/hana-ml/#history

## Load the user connection data and connect to the SAP HANA database instance

👉🏻 **Before running the next cell** make sure:

1. the content of the [./temp_user.ini](./temp_user.ini) file has been copied to a file [./user.ini](./user.ini), and
2. the user and the password in the `user.ini` file are set to a database user that has privileges to create database objects.

In [2]:
import os, sys

file_path = './user.ini'  # Replace this with the path to your file

# Guard clause: without the configuration file there is nothing to load, so stop this cell.
if not os.path.exists(file_path):
    sys.exit(f"The file '{file_path}' does not exist! Create and configure it first.")

print(f"The file '{file_path}' exists.")

# Load the four connection settings from the ini file with the hana_ml helper.
from hana_ml.algorithms.pal.utility import Settings
myhost, myport, myuser, mypwd = Settings.load_config(file_path)

# Report whether the settings look usable; the password itself is never printed.
any_param_empty = any((myhost == '', myport == 0, myuser == '', mypwd == ''))
if any_param_empty:
    print("At least one required connection parameter is empty.")
else:
    print(f"Connection to SAP HANA db at {myhost}:{myport} \nwill be attempted for the user {myuser}")
    


The file './user.ini' exists.
Connection to SAP HANA db at c5889dd5-e0f6-4930-8408-94d53ca61dbf.hna0.prod-us10.hanacloud.ondemand.com:443 
will be attempted for the user SANDBOX


In [None]:
from hana_ml import dataframe as hdf

# Open the database session with the settings loaded from user.ini.
# `myconn` is reused by the following cells of this notebook.
myconn = hdf.ConnectionContext(address=myhost, port=myport, user=myuser, password=mypwd)

# Querying the server version doubles as a check that the connection works.
print(
    f"Connected to SAP HANA db version {myconn.hana_version()} \nat {myhost}:{myport} as user {myuser}"
)

Connected to SAP HANA db version 4.00.000.00.1740475414 (fa/CE2024.40) 
at c5889dd5-e0f6-4930-8408-94d53ca61dbf.hna0.prod-us10.hanacloud.ondemand.com:443 as user SANDBOX


## Create the schemas for HANA-ML CodeJam

In [4]:
# Names of the database schemas this notebook makes sure exist.
schemas = ("TITANIC", "VECTORS")

In [5]:
# Create every schema that is missing; schemas that already exist are left untouched,
# so this cell can be re-run safely.
for schema in schemas:
    if myconn.has_schema(schema):
        print (f"Schema {schema} already exists")
        continue
    myconn.create_schema(schema)
    print(f"Schema {schema} created")

Schema TITANIC already exists
Schema VECTORS already exists


# Load Titanic dataset - Train part

In [8]:
import pandas as pd

In [10]:
# Labeled ("train") part of the Titanic data, read directly from the SAP-samples GitHub repository.
url_titanic_train = 'https://raw.githubusercontent.com/SAP-samples/sap-tech-bytes/2021-07-07-predictive-scenarios/data/original/train.csv'

# convert_dtypes() switches the columns to pandas' nullable extension dtypes.
df_titanic_train = pd.read_csv(url_titanic_train).convert_dtypes()

In [11]:
# Inspect the original column labels of the CSV before they are renamed in the next cell.
df_titanic_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [12]:
# Harmonize the column names used in the rest of this notebook.
# Renaming by name (instead of assigning a positional list to `.columns`) cannot silently
# mislabel data if the column order of the CSV ever changes, and it makes explicit that
# only three labels differ from the source file: Pclass, Sex and Parch.
# Labels not listed in the mapping are kept as they are, so re-running the cell is harmless.
df_titanic_train = df_titanic_train.rename(
    columns={'Pclass': 'PClass', 'Sex': 'Gender', 'Parch': 'ParCh'}
)

In [13]:
# Check the pandas dtype of every column after convert_dtypes() and the renaming.
df_titanic_train.dtypes

PassengerId             Int64
Survived                Int64
PClass                  Int64
Name           string[python]
Gender         string[python]
Age                   Float64
SibSp                   Int64
ParCh                   Int64
Ticket         string[python]
Fare                  Float64
Cabin          string[python]
Embarked       string[python]
dtype: object

In [14]:
# Summary statistics for all columns; include='all' also covers the non-numeric ones.
df_titanic_train.describe(include='all')

Unnamed: 0,PassengerId,Survived,PClass,Name,Gender,Age,SibSp,ParCh,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [15]:
# Upload the labeled pandas frame into the table TITANIC.DATA_LABELED and keep a
# hana_ml DataFrame handle to it. PassengerId is declared as the primary key.
hdf_titanic_train = hdf.create_dataframe_from_pandas(
    connection_context=myconn,
    pandas_df=df_titanic_train,
    schema='TITANIC',
    table_name='DATA_LABELED',
    primary_key='PassengerId',
    force=True,
    # Optional explicit column types, left disabled so that the types are inferred.
    # The 'PassengerId' and 'Survived' entries had been disabled separately by the author.
    # table_structure={
    #     # 'PassengerId': 'SMALLINT',
    #     # 'Survived': 'TINYINT',
    #     'PClass': 'TINYINT',
    #     'Name': 'VARCHAR(100)',
    #     'Gender': 'VARCHAR(10)',
    #     'Age': 'REAL',
    #     'SibSp': 'TINYINT',
    #     'ParCh': 'TINYINT',
    #     'Ticket': 'VARCHAR(30)',
    #     'Fare': 'REAL',
    #     'Cabin': 'VARCHAR(30)',
    #     'Embarked': 'VARCHAR(1)',
    # },
)

100%|██████████| 1/1 [00:03<00:00,  3.67s/it]


In [16]:
# Describe the uploaded table through hana_ml; collect() brings the result to the client for display.
hdf_titanic_train.describe().collect()

Unnamed: 0,column,count,unique,nulls,mean,std,min,max,median,25_percent_cont,25_percent_disc,50_percent_cont,50_percent_disc,75_percent_cont,75_percent_disc
0,PassengerId,891,891,0,446.0,257.353842,1.0,891.0,446.0,223.5,223.0,446.0,446.0,668.5,669.0
1,Survived,891,2,0,0.383838,0.486592,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,PClass,891,3,0,2.308642,0.836071,1.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0
3,Age,714,88,177,29.699118,14.526497,0.42,80.0,28.0,20.125,20.0,28.0,28.0,38.0,38.0
4,SibSp,891,7,0,0.523008,1.102743,0.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
5,ParCh,891,7,0,0.381594,0.806057,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Fare,891,248,0,32.204208,49.693429,0.0,512.3292,14.4542,7.9104,7.8958,14.4542,14.4542,31.0,31.0
7,Name,891,891,0,,,,,,,,,,,
8,Gender,891,2,0,,,,,,,,,,,
9,Ticket,891,681,0,,,,,,,,,,,


# Load Titanic dataset - Test part

In [17]:
# "Test" part of the Titanic data, read directly from the SAP-samples GitHub repository.
url_titanic_test = 'https://raw.githubusercontent.com/SAP-samples/sap-tech-bytes/2021-07-07-predictive-scenarios/data/original/test.csv'

# convert_dtypes() switches the columns to pandas' nullable extension dtypes.
df_titanic_test = pd.read_csv(url_titanic_test).convert_dtypes()

In [18]:
# Replace all eleven column labels positionally. The names match those of the labeled
# frame, except that there is no 'Survived' column here.
df_titanic_test.columns = [
    'PassengerId', 'PClass', 'Name', 'Gender', 'Age', 'SibSp',
    'ParCh', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]

In [19]:
# Summary statistics for all columns; include='all' also covers the non-numeric ones.
df_titanic_test.describe(include='all')

Unnamed: 0,PassengerId,PClass,Name,Gender,Age,SibSp,ParCh,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


In [20]:
# Check the pandas dtype of every column of the test frame.
df_titanic_test.dtypes

PassengerId             Int64
PClass                  Int64
Name           string[python]
Gender         string[python]
Age                   Float64
SibSp                   Int64
ParCh                   Int64
Ticket         string[python]
Fare                  Float64
Cabin          string[python]
Embarked       string[python]
dtype: object

In [21]:
# Upload the test pandas frame into the table TITANIC.DATA_TO_PREDICT and keep a
# hana_ml DataFrame handle to it. PassengerId is declared as the primary key.
hdf_titanic_test = hdf.create_dataframe_from_pandas(
    connection_context=myconn,
    pandas_df=df_titanic_test,
    schema='TITANIC',
    table_name='DATA_TO_PREDICT',
    primary_key='PassengerId',
    force=True,
    # Optional explicit column types, left disabled so that the types are inferred.
    # table_structure={
    #     'PassengerId': 'SMALLINT',
    #     'PClass': 'TINYINT',
    #     'Name': 'VARCHAR(100)',
    #     'Gender': 'VARCHAR(10)',
    #     'Age': 'REAL',
    #     'SibSp': 'TINYINT',
    #     'ParCh': 'TINYINT',
    #     'Ticket': 'VARCHAR(30)',
    #     'Fare': 'REAL',
    #     'Cabin': 'VARCHAR(30)',
    #     'Embarked': 'VARCHAR(1)',
    # },
)

100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


In [22]:
# Describe the uploaded table through hana_ml; collect() brings the result to the client for display.
hdf_titanic_test.describe().collect()

Unnamed: 0,column,count,unique,nulls,mean,std,min,max,median,25_percent_cont,25_percent_disc,50_percent_cont,50_percent_disc,75_percent_cont,75_percent_disc
0,PassengerId,418,418,0,1100.5,120.810457,892.0,1309.0,1101.0,996.25,996.0,1100.5,1100.0,1204.75,1205.0
1,PClass,418,3,0,2.26555,0.841837,1.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,3.0
2,Age,332,79,86,30.27259,14.181209,0.17,76.0,27.0,21.0,21.0,27.0,27.0,39.0,39.0
3,SibSp,418,7,0,0.447368,0.896759,0.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,ParCh,418,8,0,0.392344,0.981428,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Fare,417,169,1,35.627188,55.907576,0.0,512.3292,14.4542,7.8958,7.8958,14.4542,14.4542,31.5,31.5
6,Name,418,418,0,,,,,,,,,,,
7,Gender,418,2,0,,,,,,,,,,,
8,Ticket,418,363,0,,,,,,,,,,,
9,Cabin,91,76,327,,,,,,,,,,,


# Load Titanic dataset - Complete

In [23]:
# Complete Titanic data set (titanic3.csv), read directly from the SAP-samples GitHub repository.
url_titanic_complete = 'https://raw.githubusercontent.com/SAP-samples/sap-tech-bytes/2021-07-07-predictive-scenarios/data/original/titanic3.csv'

# convert_dtypes() switches the columns to pandas' nullable extension dtypes.
df_titanic_complete = pd.read_csv(url_titanic_complete).convert_dtypes()

In [24]:
# Replace all fourteen column labels positionally with lower-case names.
df_titanic_complete.columns = [
    'pclass', 'survived', 'name', 'gender', 'age', 'sibsp', 'parch',
    'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home_dest',
]

In [25]:
# Summarize only the eleven columns that are uploaded to the database further below;
# 'boat', 'body' and 'home_dest' are left out.
df_titanic_complete[
    ['pclass', 'survived', 'name', 'gender', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
].describe(include='all')

Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
count,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307
unique,,,1307,2,,,,929,,186,3
top,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,2,843,,,,11,,6,914
mean,2.294882,0.381971,,,29.881138,0.498854,0.385027,,33.295479,,
std,0.837836,0.486055,,,14.413493,1.041658,0.86556,,51.758668,,
min,1.0,0.0,,,0.17,0.0,0.0,,0.0,,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.275,,


In [26]:
# Row counts of the two tables uploaded earlier; their sum is displayed so it can be
# compared with the row count of the complete data set.
n_rows_to_predict = hdf_titanic_test.count()
n_rows_labeled = hdf_titanic_train.count()
n_rows_to_predict + n_rows_labeled

1309

In [27]:
# Check the pandas dtype of every column of the complete frame.
df_titanic_complete.dtypes

pclass                Int64
survived              Int64
name         string[python]
gender       string[python]
age                 Float64
sibsp                 Int64
parch                 Int64
ticket       string[python]
fare                Float64
cabin        string[python]
embarked     string[python]
boat         string[python]
body                  Int64
home_dest    string[python]
dtype: object

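Check whether passenger names are unique, so that `name` could serve as a key in the database. The rows listed below are the ones whose name occurs more than once; because such rows differ in their ticket number, the table is created with the composite key (`name`, `ticket`).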

In [28]:
# keep=False flags every occurrence of a repeated name, not only the later ones.
df_titanic_complete.loc[
    df_titanic_complete.duplicated(subset='name', keep=False)
]

Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,,Q,13.0,,Ireland
726,3,0,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,,,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,,70.0,
925,3,0,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S,,,


In [29]:
# Upload eleven columns of the complete frame into the table TITANIC.DATA_COMPLETE and
# keep a hana_ml DataFrame handle to it. The primary key is the pair (name, ticket).
hdf_titanic_complete = hdf.create_dataframe_from_pandas(
    connection_context=myconn,
    pandas_df=df_titanic_complete[
        ['pclass', 'survived', 'name', 'gender', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']
    ],
    schema='TITANIC',
    table_name='DATA_COMPLETE',
    primary_key=['name', 'ticket'],
    force=True,
    # Optional explicit column types, left disabled so that the types are inferred.
    # table_structure={
    #     'pclass': 'TINYINT',
    #     'survived': 'TINYINT',
    #     'name': 'VARCHAR(100)',
    #     'gender': 'VARCHAR(10)',
    #     'age': 'REAL',
    #     'sibsp': 'TINYINT',
    #     'parch': 'TINYINT',
    #     'ticket': 'VARCHAR(30)',
    #     'fare': 'REAL',
    #     'cabin': 'VARCHAR(30)',
    #     'embarked': 'VARCHAR(1)',
    # },
)

100%|██████████| 1/1 [00:01<00:00,  1.86s/it]


In [30]:
# Describe the uploaded table through hana_ml; collect() brings the result to the client for display.
hdf_titanic_complete.describe().collect()

Unnamed: 0,column,count,unique,nulls,mean,std,min,max,median,25_percent_cont,25_percent_disc,50_percent_cont,50_percent_disc,75_percent_cont,75_percent_disc
0,pclass,1309,3,0,2.294882,0.837836,1.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0
1,survived,1309,2,0,0.381971,0.486055,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,age,1046,98,263,29.881138,14.413493,0.17,80.0,28.0,21.0,21.0,28.0,28.0,39.0,39.0
3,sibsp,1309,7,0,0.498854,1.041658,0.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,parch,1309,8,0,0.385027,0.86556,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,fare,1308,281,1,33.295479,51.758668,0.0,512.3292,14.4542,7.8958,7.8958,14.4542,14.4542,31.275,31.275
6,name,1309,1307,0,,,,,,,,,,,
7,gender,1309,2,0,,,,,,,,,,,
8,ticket,1309,929,0,,,,,,,,,,,
9,cabin,295,186,1014,,,,,,,,,,,


In [31]:
# Show the SQL column types of the uploaded table, keyed by column name.
hdf_titanic_complete.get_table_structure()

{'pclass': 'INT',
 'survived': 'INT',
 'name': 'NVARCHAR(5000)',
 'gender': 'NVARCHAR(5000)',
 'age': 'DOUBLE',
 'sibsp': 'INT',
 'parch': 'INT',
 'ticket': 'NVARCHAR(5000)',
 'fare': 'DOUBLE',
 'cabin': 'NVARCHAR(5000)',
 'embarked': 'NVARCHAR(5000)'}