## Read source data from CSV files into pandas DataFrames

In [1]:
# Import pandas and display the installed version as the cell output.
import pandas
pandas.__version__

'1.1.4'

--> Data courtesy of Kelvin Lawrence from https://github.com/krlawrence/graph/raw/master/sample-data/

In [2]:
# Both CSV files are published under the same sample-data folder.
air_routes_base_url = 'https://github.com/krlawrence/graph/raw/master/sample-data/'

# Download the node and edge lists straight into pandas DataFrames.
dfp_nodes = pandas.read_csv(air_routes_base_url + 'air-routes-latest-nodes.csv')
dfp_edges = pandas.read_csv(air_routes_base_url + 'air-routes-latest-edges.csv')

In [3]:
# Report the (rows, columns) shape of each raw frame.
print(f'Size of nodes dataframe: {dfp_nodes.shape}')
print(f'Size of edges dataframe: {dfp_edges.shape}')

Size of nodes dataframe: (3742, 16)
Size of edges dataframe: (57574, 5)


#### Create a pandas DataFrame (DFP) with airports only

In [4]:
# Show the column names and the dtypes pandas inferred for the nodes frame.
dfp_nodes.dtypes

~id                 int64
~label             object
type:string        object
code:string        object
icao:string        object
desc:string        object
region:string      object
runways:int       float64
longest:int       float64
elev:int          float64
country:string     object
city:string        object
lat:double        float64
lon:double        float64
author:string      object
date:string        object
dtype: object

In [5]:
# Count the rows for each value of the '~label' column.
dfp_nodes.groupby('~label').size()

~label
airport      3497
continent       7
country       237
version         1
dtype: int64

In [6]:
# Airports only: keep rows labelled 'airport', drop the label/type/author/date
# columns, then let pandas choose its best-fitting (nullable) dtypes.
dfp_ports = (
    dfp_nodes.loc[dfp_nodes['~label'].isin(['airport'])]
    .drop(columns=['~label', 'type:string', 'author:string', 'date:string'])
    .convert_dtypes()
)

In [7]:
# Normalise the column names: remove the '~' prefix, strip the ':<type>' suffix
# and upper-case what is left (e.g. '~id' -> 'ID', 'code:string' -> 'CODE').
#
# `regex` is passed explicitly on purpose: ':.*' is a regular expression, and
# pandas 2.0 changed the default of str.replace() to regex=False. Relying on the
# default would leave the ':<type>' suffix in place on newer pandas without
# raising any error.
dfp_ports.columns = (
    dfp_ports.columns
    .str.replace('~', '', regex=False)   # literal character, no pattern needed
    .str.replace(':.*', '', regex=True)  # pattern: everything from the first ':'
    .str.upper()
)

In [8]:
# Check the renamed columns and their dtypes after convert_dtypes().
dfp_ports.dtypes

ID           Int64
CODE        string
ICAO        string
DESC        string
REGION      string
RUNWAYS      Int64
LONGEST      Int64
ELEV         Int64
COUNTRY     string
CITY        string
LAT        float64
LON        float64
dtype: object

#### Create a pandas DataFrame (DFP) with routes only

In [9]:
# Show the column names and the dtypes pandas inferred for the edges frame.
dfp_edges.dtypes

~id           int64
~from         int64
~to           int64
~label       object
dist:int    float64
dtype: object

In [10]:
# Count the rows for each value of the '~label' column.
dfp_edges.groupby('~label').size()

~label
contains     6994
route       50580
dtype: int64

In [11]:
# Routes only: keep rows labelled 'route', drop the now-constant label column,
# and take an independent copy so later edits do not touch dfp_edges.
dfp_routes = (
    dfp_edges.loc[dfp_edges['~label'].isin(['route'])]
    .drop(columns=['~label'])
    .copy()
)

In [12]:
# Normalise the column names: remove the '~' prefix, strip the ':<type>' suffix
# and upper-case what is left (e.g. '~from' -> 'FROM', 'dist:int' -> 'DIST').
#
# `regex` is passed explicitly on purpose: ':.*' is a regular expression, and
# pandas 2.0 changed the default of str.replace() to regex=False. Relying on the
# default would leave the ':<type>' suffix in place on newer pandas without
# raising any error.
dfp_routes.columns = (
    dfp_routes.columns
    .str.replace('~', '', regex=False)   # literal character, no pattern needed
    .str.replace(':.*', '', regex=True)  # pattern: everything from the first ':'
    .str.upper()
)

In [13]:
# Check the renamed columns and their dtypes in the routes frame.
dfp_routes.dtypes

ID        int64
FROM      int64
TO        int64
DIST    float64
dtype: object

# Persist in SAP HANA Cloud

In [14]:
# Import the SAP HANA Python client library and display its installed version.
import hana_ml
hana_ml.__version__

'2.6.20110600'

In [15]:
import os

# "<host>:<port>" of the SAP HANA Cloud instance to connect to.
# Set the HANA_CLOUD_ENDPOINT environment variable to point the notebook at a
# different instance without editing this cell; the literal below is the
# fallback used when the variable is not set.
hana_cloud_endpoint = os.environ.get(
    "HANA_CLOUD_ENDPOINT",
    "8e1a286a-21d7-404d-8d7a-8c77d2a77050.hana.trial-eu10.hanacloud.ondemand.com:443",
)

In [16]:
import getpass
import os

# Split on the last ':' only, so the host part is always taken as a whole.
hana_cloud_host, hana_cloud_port = hana_cloud_endpoint.rsplit(":", 1)

# Credentials are never stored in the notebook. They are read from the
# environment (HANA_CLOUD_USER / HANA_CLOUD_PASSWORD); if no password is set
# there, the user is prompted for it without echoing the input.
# NOTE(review): a password literal used to live in this cell; if it was ever
# a real one, it should be rotated.
hana_cloud_user = os.environ.get("HANA_CLOUD_USER", "HANAML")
hana_cloud_password = os.environ.get("HANA_CLOUD_PASSWORD") or getpass.getpass(
    "Password for {} at {}: ".format(hana_cloud_user, hana_cloud_host)
)

# Open an encrypted connection to the SAP HANA Cloud instance.
cchc = hana_ml.dataframe.ConnectionContext(
    address=hana_cloud_host,
    port=hana_cloud_port,
    user=hana_cloud_user,
    password=hana_cloud_password,
    encrypt=True,
)

In [17]:
# List the tables found in the connection's current schema.
current_schema = cchc.get_current_schema()
tables_query = f"SELECT SCHEMA_NAME, TABLE_NAME FROM TABLES WHERE SCHEMA_NAME='{current_schema}'"
print(cchc.sql(tables_query).collect())

Empty DataFrame
Columns: [SCHEMA_NAME, TABLE_NAME]
Index: []


In [18]:
# Upload the airports frame into the HANA table "PORTS";
# force=True is passed so the call can be re-run.
dfh_ports = hana_ml.dataframe.create_dataframe_from_pandas(
    connection_context=cchc,
    pandas_df=dfp_ports,
    table_name="PORTS",
    force=True,
)

100%|██████████| 1/1 [00:00<00:00,  2.75it/s]


In [19]:
# Upload the routes frame into the HANA table "ROUTES";
# force=True is passed so the call can be re-run.
dfh_routes = hana_ml.dataframe.create_dataframe_from_pandas(
    connection_context=cchc,
    pandas_df=dfp_routes,
    table_name='ROUTES',
    force=True,
)

100%|██████████| 2/2 [00:01<00:00,  1.26it/s]


In [20]:
# List the tables in the current schema again, now that the uploads have run.
current_schema = cchc.get_current_schema()
tables_query = f"SELECT SCHEMA_NAME, TABLE_NAME FROM TABLES WHERE SCHEMA_NAME='{current_schema}'"
print(cchc.sql(tables_query).collect())

  SCHEMA_NAME TABLE_NAME
0      HANAML      PORTS
1      HANAML     ROUTES


In [21]:
# collect() is called on the whole PORTS frame with no head()/filter, so every
# row is fetched; print() then shows it as plain, line-wrapped text.
# NOTE(review): prefer head(n).collect() for anything larger than a sample table.
print(dfh_ports.collect())

        ID CODE  ICAO                                               DESC  \
0        1  ATL  KATL  Hartsfield - Jackson Atlanta International Air...   
1        2  ANC  PANC                              Anchorage Ted Stevens   
2        3  AUS  KAUS             Austin Bergstrom International Airport   
3        4  BNA  KBNA                    Nashville International Airport   
4        5  BOS  KBOS                                       Boston Logan   
...    ...  ...   ...                                                ...   
3492  3493  KHT  OAKS                                      Khost Airport   
3493  3494  SYS  UERS                                  Saskylakh Airport   
3494  3495  AAA  NTGA                                       Anaa Airport   
3495  3496  GBI  VOGB                   Kalaburagi International Airport   
3496  3497  KVO  LYKV                                     Morava Airport   

      REGION  RUNWAYS  LONGEST  ELEV COUNTRY        CITY        LAT  \
0      US-GA    

In [22]:
# Fetch only the first five rows and render them with the notebook's rich
# table display instead of plain text.
display(dfh_ports.head(5).collect())

Unnamed: 0,ID,CODE,ICAO,DESC,REGION,RUNWAYS,LONGEST,ELEV,COUNTRY,CITY,LAT,LON
0,1,ATL,KATL,Hartsfield - Jackson Atlanta International Air...,US-GA,5,12390,1026,US,Atlanta,33.6367,-84.428101
1,2,ANC,PANC,Anchorage Ted Stevens,US-AK,3,12400,151,US,Anchorage,61.1744,-149.996002
2,3,AUS,KAUS,Austin Bergstrom International Airport,US-TX,2,12250,542,US,Austin,30.1945,-97.669899
3,4,BNA,KBNA,Nashville International Airport,US-TN,4,11030,599,US,Nashville,36.1245,-86.6782
4,5,BOS,KBOS,Boston Logan,US-MA,6,10083,19,US,Boston,42.3643,-71.005203


In [23]:
# Evaluating the hana_ml DataFrame itself shows only the object, not its rows.
dfh_ports

<hana_ml.dataframe.DataFrame at 0x7f36e9384250>

In [24]:
# The SQL SELECT statement that backs this hana_ml DataFrame.
dfh_ports.select_statement

'SELECT * FROM "PORTS"'

In [25]:
# head(5) without collect() yields another hana_ml DataFrame object rather than rows.
dfh_ports.head(5)

<hana_ml.dataframe.DataFrame at 0x7f36e8e46050>

In [26]:
# Show the SQL generated for head(5): the original SELECT wrapped in a TOP 5 query.
dfh_ports.head(5).select_statement

'SELECT TOP 5 * FROM (SELECT * FROM "PORTS") dt'

### Data exploration

In [27]:
# List the column names of the PORTS table as seen through the hana_ml DataFrame.
print(dfh_ports.columns)

['ID', 'CODE', 'ICAO', 'DESC', 'REGION', 'RUNWAYS', 'LONGEST', 'ELEV', 'COUNTRY', 'CITY', 'LAT', 'LON']


In [28]:
# Three airports with the longest runway: project the columns of interest,
# order by LONGEST descending, keep the top three, then fetch the result.
longest_runways = (
    dfh_ports
    .select("CODE", "DESC", "LONGEST", "COUNTRY", "CITY")
    .sort("LONGEST", desc=True)
    .head(3)
)
longest_runways.collect()

Unnamed: 0,CODE,DESC,LONGEST,COUNTRY,CITY
0,BPX,Qamdo Bangda Airport,18045,CN,Bangda
1,ULY,Ulyanovsk East Airport,16404,RU,Ulyanovsk
2,RKZ,Shigatse Peace Airport,16404,CN,Xigaze


In [29]:
# Same chain as the longest-runway query, but instead of collecting rows,
# show the nested SQL statement that select/sort/head have built up.
(
    dfh_ports
    .select("CODE", "DESC", "LONGEST", "COUNTRY", "CITY")
    .sort("LONGEST", desc=True)
    .head(3)
    .select_statement
)

'SELECT TOP 3 * FROM (SELECT * FROM (SELECT "CODE", "DESC", "LONGEST", "COUNTRY", "CITY" FROM (SELECT * FROM "PORTS") AS "DT_4") AS "DT_18" ORDER BY "LONGEST" DESC) dt'

In [30]:
# Three airports furthest from the equator: add a computed column ABSLAT
# (the SQL expression ABS("LAT")) and order by it, descending.
furthest_from_equator = (
    dfh_ports
    .select("CODE", "DESC", "LONGEST", "COUNTRY", "CITY", "LAT", ('ABS("LAT")', "ABSLAT"))
    .sort("ABSLAT", desc=True)
    .head(3)
)
furthest_from_equator.collect()

Unnamed: 0,CODE,DESC,LONGEST,COUNTRY,CITY,LAT,ABSLAT
0,LYR,"Svalbard Airport, Longyear",7608,NO,Longyearbyen,78.246101,78.246101
1,NAQ,Qaanaaq Airport,2953,GL,Qaanaaq,77.488602,77.488602
2,THU,Thule Air Base,9997,GL,Pituffik,76.531197,76.531197


In [31]:
# Per country, the largest runway count of any single airport;
# show the seven countries with the highest value.
max_runways_per_country = (
    dfh_ports
    .agg(agg_list=[("max", "RUNWAYS", "MAXRUNWAYS")], group_by="COUNTRY")
    .sort("MAXRUNWAYS", desc=True)
    .head(7)
)
max_runways_per_country.collect()

Unnamed: 0,COUNTRY,MAXRUNWAYS
0,US,7
1,NL,6
2,DK,5
3,RU,5
4,IE,5
5,NZ,5
6,CA,5
