# Import Necessary Libraries

In [1]:
import pandas as pd
from wks_utils import TopioWKSIngest

# Read the OSM POIs to ingest

In [2]:
# Load the OSM POI CSV that will be ingested; the file is pipe-delimited,
# so the separator is set explicitly instead of relying on the comma default.
osm_pois_df = pd.read_csv('osm20_pois_corfu.csv', sep='|')

In [3]:
# Display the loaded frame to inspect the source column names before ingestion.
osm_pois_df

Unnamed: 0,osm_id,name,category,type,lon,lat,alt_name,int_name,street,housenumber,...,fax,email,website,opening_hours,description,image,last_update,operator,wikipedia,wkt
0,N1029715504,Pool Bar,FOOD,BAR,19.829637,39.651711,,,,,...,,,,,,,,,,POINT(19.8296369 39.6517114)
1,N1029715509,Convenience Strore Marina Gouvia,SHOP,CONVENIENCE,19.854266,39.647386,,,,,...,,,,,,,,,,POINT(19.8542664 39.6473859)
2,N1029715541,Pool Bar,FOOD,BAR,19.855186,39.647694,,,,,...,,,,,,,,,,POINT(19.8551864 39.6476937)
3,N1085861173,Athena Supermarket,SHOP,CONVENIENCE,19.736595,39.671256,,,,,...,,,,,,,,,,POINT(19.736595 39.6712563)
4,N1088014445,Sunrise Bar,FOOD,BAR,20.096480,39.407453,,,,,...,,,,,,,,,,POINT(20.0964796 39.4074531)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1809,W94157483,Ναός Ευαγγελίστριας,POW,CHRISTIAN,20.077798,39.421260,,,,,...,,,,,,,,,,"MULTIPOLYGON(((20.0776678 39.4213156,20.077940..."
1810,W94157484,Ναός Υ. Θ. Οδηγητρίας,POW,CHRISTIAN,20.069492,39.423205,,,,,...,,,,,,,,,,"MULTIPOLYGON(((20.0693893 39.4232501,20.069602..."
1811,W94157485,Άγιος Αρσένιος,POW,CHRISTIAN,20.064120,39.423536,,,,,...,,,,,,,,,,"MULTIPOLYGON(((20.064032 39.4236303,20.0642696..."
1812,W94157489,Ναός Αγίων Θεοδώρων,POW,CHRISTIAN,20.073204,39.423109,,,,,...,,,,,,,,,,"MULTIPOLYGON(((20.0731079 39.4231635,20.073317..."


# Start Topio Ingest and Collect Schema Similarities

In [4]:
# Create the ingest helper with threshold=0.6.
# NOTE(review): the threshold is presumably a minimum similarity cut-off used
# when matching columns/schemas -- confirm against wks_utils.
topio_ingest = TopioWKSIngest(threshold=0.6)
# Compute the schema similarities for the OSM frame; the result is kept on the
# helper object and read back with get_schema_similarities().
topio_ingest.calculate_schema_similarities(osm_pois_df)

In [5]:
# Show the similarity scores computed above: a dict keyed by schema file name
# (e.g. 'osm_pois.yml') with one score per schema.
topio_ingest.get_schema_similarities()

{'osm_pois.yml': 0.88,
 'tomtom_premium.yml': 0.17,
 'herold_yellow_pages.yml': 0.04,
 'tomtom_road.yml': 0.04,
 'tomtom_sm.yml': 0.04,
 'nuts.yml': 0.0}

# Automatically Ingest Based on Maximum Similarity

In [6]:
# Run the automatic ingestion; the call returns two values, the ingested frame
# and the column mappings that were applied (inspected in the next section).
# Presumably the schema with the highest similarity score is selected -- confirm
# in wks_utils.
# NOTE(review): in the displayed output the ID and SUBCATEGORY columns are empty
# although the input has osm_id and type columns -- verify whether these should
# have been mapped.
ingested_df, mappings = topio_ingest.ingest_with_max_similarity()
ingested_df

Unnamed: 0,ID,NAME,CATEGORY,SUBCATEGORY,LON,LAT,SRID,INTERNATIONAL_NAME,STREET,WIKIPEDIA,...,DESCRIPTION,WEBSITE,LAST_UPDATE,OPERATOR,POSTCODE,COUNTRY,FAX,IMAGE,HOUSENUMBER,OTHER_TAGS
0,,Pool Bar,FOOD,,19.829637,39.651711,,,,,...,,,,,,,,,,
1,,Convenience Strore Marina Gouvia,SHOP,,19.854266,39.647386,,,,,...,,,,,,,,,,
2,,Pool Bar,FOOD,,19.855186,39.647694,,,,,...,,,,,,,,,,
3,,Athena Supermarket,SHOP,,19.736595,39.671256,,,,,...,,,,,,,,,,
4,,Sunrise Bar,FOOD,,20.096480,39.407453,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1809,,Ναός Ευαγγελίστριας,POW,,20.077798,39.421260,,,,,...,,,,,,,,,,
1810,,Ναός Υ. Θ. Οδηγητρίας,POW,,20.069492,39.423205,,,,,...,,,,,,,,,,
1811,,Άγιος Αρσένιος,POW,,20.064120,39.423536,,,,,...,,,,,,,,,,
1812,,Ναός Αγίων Θεοδώρων,POW,,20.073204,39.423109,,,,,...,,,,,,,,,,


## Column Mappings

In [7]:
# Show the mapping table returned by the ingestion above: one row per ingested
# column ("Ingested") paired with the source column it came from ("Original").
mappings

Unnamed: 0,Ingested,Original
0,OPENING_HOURS,opening_hours
1,DESCRIPTION,description
2,LAST_UPDATE,last_update
3,HOUSENUMBER,housenumber
4,WIKIPEDIA,wikipedia
5,CATEGORY,category
6,OPERATOR,operator
7,POSTCODE,postcode
8,WEBSITE,website
9,COUNTRY,country


# Semi-Automatic Ingestion
## Read DKV_Berlin and Compute Schema Similarities

In [8]:
# Load the second dataset; this file is semicolon-delimited.
dkv_berlin_df = pd.read_csv('DKV_Berlin.csv', sep=';')
# Rebinds `topio_ingest` to a fresh helper with a lower threshold (0.3 instead
# of 0.6); the instance used for the OSM data is no longer reachable after this.
topio_ingest = TopioWKSIngest(threshold=0.3)
# Compute the schema similarities for the DKV frame, then display them so a
# schema can be picked by hand in the next cell.
topio_ingest.calculate_schema_similarities(dkv_berlin_df)
topio_ingest.get_schema_similarities()

{'herold_yellow_pages.yml': 1.0,
 'osm_pois.yml': 0.78,
 'tomtom_sm.yml': 0.67,
 'tomtom_premium.yml': 0.56,
 'tomtom_road.yml': 0.33,
 'nuts.yml': 0.22}

In [9]:
# Ingest against an explicitly chosen schema file. 'osm_pois.yml' is not the
# top-scoring schema in the similarities shown above (0.78 vs. 1.0 for
# 'herold_yellow_pages.yml'), which is the point of the manual selection.
# Note: this overwrites `ingested_df` and `mappings` from the automatic OSM run.
ingested_df, mappings = topio_ingest.ingest_with_user_selection('osm_pois.yml')
ingested_df

Unnamed: 0,ID,NAME,CATEGORY,SUBCATEGORY,LON,LAT,SRID,INTERNATIONAL_NAME,STREET,WIKIPEDIA,...,DESCRIPTION,WEBSITE,LAST_UPDATE,OPERATOR,POSTCODE,COUNTRY,FAX,IMAGE,HOUSENUMBER,OTHER_TAGS
0,1,TOTAL,,,13.37,52.53,,,,,...,,,,,,,,,,DKV - TOTAL Station 10115 Berlin Chausseestr...
1,2,ARAL,,,13.42,52.51,,,,,...,,,,,,,,,,DKV - ARAL Station 10179 Berlin Holzmarktstr...
2,3,CleanCar,,,13.42,52.51,,,,,...,,,,,,,,,,DKV - CleanCar Station 10179 Berlin Holzmark...
3,4,HEM,,,13.41,52.51,,,,,...,,,,,,,,,,DKV - HEM Station 10179 Berlin Holzmarktstr. 4
4,5,TOTAL,,,13.42,52.51,,,,,...,,,,,,,,,,DKV - TOTAL Station 10243 Berlin Holzmarktst...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,241,TOTAL,,,13.26,52.43,,,,,...,,,,,,,,,,DKV - TOTAL Station 14169 Berlin Berliner St...
241,242,ESSO,,,13.28,52.49,,,,,...,,,,,,,,,,DKV - ESSO Station 14193 Berlin Hubertusallee 1
242,243,ESSO,,,13.27,52.46,,,,,...,,,,,,,,,,DKV - ESSO Station 14195 Berlin Clayallee 90
243,244,STAR,,,13.29,52.45,,,,,...,,,,,,,,,,DKV - STAR Station 14195 Berlin Königin-Luis...


## Column Mappings

In [10]:
# Show the mappings produced by the user-selected ingestion above.
# NOTE(review): the displayed output maps the source column `zip` to PHONE,
# which looks like a false match -- possibly a consequence of the lower
# threshold (0.3); confirm before relying on this mapping.
mappings

Unnamed: 0,Ingested,Original
0,NAME,name
1,CITY,city
2,LON,lon
3,ID,ID
4,LAT,lat
5,OTHER_TAGS,all_tags
6,PHONE,zip


# Default Ingestion

In [11]:
# Default ingestion on the same helper instance (the one fed with the DKV Berlin
# frame). Unlike the two ingest calls above, the result is bound to a single
# name -- no mappings are unpacked here.
# The displayed output keeps source-style column names (lon, lat, all_tags, ...)
# rather than the upper-case schema names seen in the earlier ingested frames.
ingested_df = topio_ingest.ingest_default()
ingested_df

Unnamed: 0,ID,lon,lat,all_tags,name,zip,address,city,WKT
0,1,13.37,52.53,DKV - TOTAL Station 10115 Berlin Chausseestr...,TOTAL,10115,Chausseestr. 61,Berlin,POINT(13.3752222 52.5370079)
1,2,13.42,52.51,DKV - ARAL Station 10179 Berlin Holzmarktstr...,ARAL,10179,Holzmarktstr. 12 - 14,Berlin,POINT(13.4218919 52.5140381)
2,3,13.42,52.51,DKV - CleanCar Station 10179 Berlin Holzmark...,CleanCar,10179,Holzmarktstr. 5,Berlin,POINT(13.420068 52.51453)
3,4,13.41,52.51,DKV - HEM Station 10179 Berlin Holzmarktstr. 4,HEM,10179,Holzmarktstr. 4,Berlin,POINT(13.419633 52.514823)
4,5,13.42,52.51,DKV - TOTAL Station 10243 Berlin Holzmarktst...,TOTAL,10243,Holzmarktstr. 36 - 42,Berlin,POINT(13.4283684 52.5113173)
...,...,...,...,...,...,...,...,...,...
240,241,13.26,52.43,DKV - TOTAL Station 14169 Berlin Berliner St...,TOTAL,14169,Berliner Str. 15 a,Berlin,POINT(13.2656562 52.4359571)
241,242,13.28,52.49,DKV - ESSO Station 14193 Berlin Hubertusallee 1,ESSO,14193,Hubertusallee 1,Berlin,POINT(13.284873 52.494197)
242,243,13.27,52.46,DKV - ESSO Station 14195 Berlin Clayallee 90,ESSO,14195,Clayallee 90,Berlin,POINT(13.275345 52.460752)
243,244,13.29,52.45,DKV - STAR Station 14195 Berlin Königin-Luis...,STAR,14195,Königin-Luise-Str. 22 a,Berlin,POINT(13.2977727 52.4575506)
