In [9]:
import polars as pl
from rdflib import URIRef, Graph, Namespace
from rdflib.namespace import RDF, SKOS
from rdflib import Literal
from rdflib.namespace import XSD
from skosify import infer
import sentier_data_tools as sdt
from tqdm.notebook import tqdm

path = r"C:\Users\misc\Downloads\allCountries\allCountries.txt"

In [10]:
all_schema = pl.Schema({
    "geonameid":pl.Int64,
    "name":pl.String,
    "asciiname":pl.String,
    "alternatenames":pl.String,
    "latitude":pl.Float32,
    "longitude":pl.Float32,
    "feature_class":pl.String,
    "feature_code":pl.String,
    "country_code":pl.String,
    "cc2":pl.String,
    "admin1_code":pl.String,
    "admin2_code":pl.String,
    "admin3_code":pl.String,
    "admin4_code":pl.String,
    "population":pl.Int64,
    "elevation":pl.Int16,
    "dem":pl.Int64,
    "timezone":pl.String,
    "modification_date":pl.Date
})

In [11]:
world_frame = pl.scan_csv(source=path,has_header=False,separator='\t',schema=all_schema)

In [12]:
hierarchy_schema = pl.Schema({
    "parent":pl.Int64,
    "child":pl.Int64,
    "admin1_code":pl.String
})

hierarchy = pl.scan_csv("../../data/Location-Services/hierarchy.txt",schema=hierarchy_schema,separator='\t')

hierarchy_merged = world_frame.join(other=hierarchy,left_on="geonameid",right_on="parent").sql("select * from self where feature_code in ('PCLI', 'ADM1', 'RGN')").collect()

In [13]:
alt_schema = pl.Schema({
    "alternateNameId":pl.Int32,
    "geonameid":pl.Int64,
    "isolanguage":pl.String,
    "alternate_name":pl.String,
    "isPreferredName":pl.Int8,
    "isShortName":pl.Int8,
    "isColloquial":pl.Int8,
    "isHistoric":pl.Int8,
    "from":pl.String,
    "to":pl.String
})
alternate_names = pl.scan_csv(r"C:/Users/misc/Downloads/alternateNamesV2/alternateNamesV2.txt",schema=alt_schema,separator="\t")

alternate_names.sql("select * from self where geonameid = 10944373").collect()

hierarchy.sql('select * from self where parent = 10944373').collect()

In [14]:
#hierarchy_merged.sql("select name, feature_class, feature_code from self where feature_code in ('ADM1', 'ADM2')")
filtered_world = world_frame.sql("select * from self where feature_code in ('PCLI', 'PCLD', 'RGN', 'ADM1')")

In [15]:
filtered_alt_names = alternate_names.join(filtered_world, on="geonameid",how="outer").select(alternate_names.collect_schema().names()).collect()

  filtered_alt_names = alternate_names.join(filtered_world, on="geonameid",how="outer").select(alternate_names.collect_schema().names()).collect()


filtered_world.sql("select * from self where geonameid = 12060407")
#filtered_alt_names.sql("select * from self where geonameid = 3580718")

In [16]:
GEOSPACES = "https://sws.geonames.org/"
GN = Namespace("http://www.geonames.org/ontology#")

filtered_world = filtered_world.collect()

world = Graph()

for item in tqdm(filtered_world.iter_rows(),total=filtered_world.height):
    uri = URIRef(GEOSPACES + str(item[0]))
    world.add((
        uri,
        SKOS.prefLabel,
        Literal(Literal(item[1]))
    ))
    world.add((
        uri,
        RDF.type,
        SKOS.Concept
    ))
    
    world.add((
        uri,
        GN.countryCode,
        Literal(item[8])
    ))
    children = hierarchy.sql(f"select * from self where parent = {item[0]}").collect()
    if len(children) > 0:
        for child in children.iter_rows():
            if not filtered_world.filter(pl.col('geonameid') == child[1]).is_empty():
                world.add((
                    uri,
                    SKOS.narrower,
                    URIRef(GEOSPACES + str(child[1]))
                ))
    specific_alt_names = filtered_alt_names.sql(f"select * from self where geonameid = {item[0]}")
    if specific_alt_names.height > 0:
        for alt in specific_alt_names.iter_rows():
            if alt[4] == 1:
                world.add((
                    uri,
                    SKOS.prefLabel,
                    Literal(alt[3], lang=alt[2])
                ))
            else:
                world.add((
                    uri,
                    SKOS.altLabel,
                    Literal(alt[3], lang=alt[2])
                ))

infer.skos_hierarchical(world)
world.serialize(destination='../../data/Location-Services/faster_altnames.ttl')

  0%|          | 0/6581 [00:00<?, ?it/s]

TypeError: expected string or bytes-like object, got 'tuple'

from urllib.request import urlretrieve

url = "https://download.geonames.org/export/dump/AD.zip"
filename = "AD.zip"
path, response = urlretrieve(url, filename)

import zipfile
import os
temp_dir = os.path.join(os.curdir,"temp")
if not os.path.exists(temp_dir):
    os.mkdir(temp_dir)

hier_zip = os.path.realpath(os.path.join(temp_dir,"hierarchy.zip"))
path, response = urlretrieve(url, hier_zip)

zip_path = r"C:\Users\misc\Downloads\hierarchy.zip"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(os.path.realpath(temp_dir))

print(zip_path,'\n',os.path.realpath(temp_dir))