In [1]:
import ujson as json
import pandas as pd
import numpy as np
from time import sleep, time
import atexit
import subprocess
from py4j.java_gateway import JavaGateway

In [2]:
# Launch the Corese py4j server in a child JVM, then attach to it.
java_command = [
    'java',
    '-jar',
    '-Dfile.encoding=UTF-8',
    'corese-library-python-4.4.1.jar',
]
java_process = subprocess.Popen(java_command)
# The fixed one-second pause is the only synchronisation with the JVM start-up.
sleep(1)
gateway = JavaGateway()

# Stop the Java gateway at the end of the script
def exit_handler():
    """Shut the py4j gateway down and report it on stdout."""
    gateway.shutdown()
    # Two newline characters (plus print's own) separate the message from earlier output.
    print('\n\n')
    print('Gateway Server Stop!')

atexit.register(exit_handler)

# Python-side handles on the Corese Java classes used in this notebook.
_corese = gateway.jvm.fr.inria.corese
Graph = _corese.core.Graph
Load = _corese.core.load.Load
Transformer = _corese.core.transform.Transformer
QueryProcess = _corese.core.query.QueryProcess
RDF = _corese.core.logic.RDF
RESULTFORMAT = _corese.core.print.ResultFormat
coreseFormat = _corese.sparql.api.ResultFormatDef

def sparqlQuery(graph, query):
    """Execute a SPARQL query against a graph.

    :param graph: graph for which the query processor is created
    :param query: text of the query to run
    :returns: the value returned by the query processor's ``query`` call
    """
    return QueryProcess.create(graph).query(query)

def convert_sparql_to_json(mapping_object):
    """
    Render a query result as JSON text and parse that text into Python data.

    :param mapping_object: query result handed to ``ResultFormat.create``
    :return: the object obtained by parsing the formatter's JSON output
    """
    formatter = RESULTFORMAT.create(mapping_object)
    # Select JSON as the output format before asking for the text form.
    formatter.setSelectFormat(coreseFormat.JSON_FORMAT)
    return json.loads(formatter.toString())


def load(graph, path):
    """Parse a local file or a URL into an existing graph.

    :param graph: graph for which the loader is created
    :param path: local path or a URL
    :returns: the same graph object that was passed in
    """
    Load.create(graph).parse(path)
    return graph

In [3]:
# Namespace declarations prepended to every SPARQL query built in this notebook.
PREFIX = """prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix iawa:<https://www.iawa-website.org/core#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>
prefix pl_tax: <http://www.plant-taxonomy-kg.com/core#>
prefix sosa: <http://www.w3.org/ns/sosa/>
prefix wcvp: <http://www.wcvp-core.com/class#>
prefix txrfp: <http://taxref.mnhn.fr/lod/property/>
"""


def _escape_sparql_literal(text):
    """Escape ``text`` so it can sit between double quotes in a SPARQL query."""
    escaped = str(text)
    # Backslash must be handled first, otherwise the escapes added below get doubled.
    for raw, safe in (("\\", "\\\\"), ('"', '\\"'), ("\n", "\\n"), ("\r", "\\r")):
        escaped = escaped.replace(raw, safe)
    return escaped


def get_sample_id(name):
    """Build the SPARQL query that selects the samples identified as a taxon.

    The query selects ``?sample`` where the sample's identified taxon has
    ``name`` as its ``skos:prefLabel`` and that taxon is the reference name
    of a class having a labelled genus and a labelled family above it.

    :param name: preferred label of the taxon; it is escaped before being
        embedded, so quotes or backslashes in it cannot break the query
    :returns: the query text, starting with the shared ``PREFIX`` block
    """
    label = _escape_sparql_literal(name)
    return f"""{PREFIX}
SELECT ?sample WHERE {{
    ?sample pl_tax:identifiedTaxon ?taxon.
    ?taxon skos:prefLabel "{label}".
    ?classes txrfp:hasReferenceName ?taxon;
        rdfs:subClassOf ?genus.
    ?genus rdfs:label ?genus_name;
	rdfs:subClassOf ?family.
    ?family rdfs:label ?family_name.
}}
"""

In [4]:
# Accumulate the three .ttl exports into a single graph.
graph = Graph()
for ttl_path in (
    "Dataset/result_iawa.ttl",
    "Dataset/result_taxonomy.ttl",
    "Dataset/result_observations.ttl",
):
    graph = load(graph, ttl_path)

In [44]:

def _is_sacha_id(sample_id):
    """Tell whether an identifier contains one of the "BRS"/"OLI" markers routed to df_sacha."""
    return ("BRS" in sample_id) or ("OLI" in sample_id)


def _rows_for_sample(frame, sample_id):
    """Return, as plain lists, the rows of ``frame`` whose ``sample_id`` column equals ``sample_id``."""
    matching_labels = frame.index[frame["sample_id"] == sample_id]
    return [list(frame.loc[label]) for label in matching_labels]


def build_csv_kg(df_train, df_insidewood, df_sacha, graph):
    """Collect the observation rows that belong to the samples of a training split.

    For every row of ``df_train`` the graph is queried for the samples
    identified as the row's ``powo_taxon``:

    * exactly one sample found, or several found and the row's ``id_sample``
      is among them: rows are taken from ``df_sacha`` (matched on the training
      id) when that id contains "BRS" or "OLI", otherwise from
      ``df_insidewood`` (matched on the first sample returned by the graph);
    * several samples found but none equals the training id: the row is
      skipped and is NOT counted as "not lifted";
    * no sample found: ``df_insidewood`` rows whose ``original_data`` contains
      the taxon name are searched; if they designate a single
      ``(sample_id, taxon_id)`` pair its rows are collected, otherwise the
      training row is counted as "not lifted".

    Prints ``<number of training rows> <number of not-lifted rows>``.

    :param df_train: frame with ``powo_taxon`` and ``id_sample`` columns
    :param df_insidewood: frame with ``sample_id``, ``taxon_id`` and
        ``original_data`` columns
    :param df_sacha: frame with a ``sample_id`` column
    :param graph: graph passed to ``sparqlQuery``
    :returns: list of rows, each a list of the source frame's column values
    """
    data = []
    not_lifted = []
    for taxon_name, raw_id in zip(df_train["powo_taxon"], df_train["id_sample"]):
        # Identifiers parsed as numbers would make the substring tests below raise.
        train_id = str(raw_id)
        bindings = convert_sparql_to_json(
            sparqlQuery(graph, get_sample_id(taxon_name))
        )["results"]["bindings"]
        # Keep only the last path segment of each sample URI.
        kg_ids = [binding["sample"]["value"].split("/")[-1] for binding in bindings]

        if kg_ids:
            if len(kg_ids) > 1 and train_id not in kg_ids:
                # Several candidate samples and none is the training one: skipped silently.
                continue
            if _is_sacha_id(train_id):
                data.extend(_rows_for_sample(df_sacha, train_id))
            else:
                data.extend(_rows_for_sample(df_insidewood, kg_ids[0]))
            continue

        # Fallback: look the taxon name up in the raw InsideWood text.
        # na=False treats missing text as "no match" instead of raising.
        hits = df_insidewood["original_data"].str.contains(taxon_name, regex=False, na=False)
        possible = set(zip(df_insidewood.loc[hits, "sample_id"],
                           df_insidewood.loc[hits, "taxon_id"]))
        if len(possible) == 1:
            (candidate_id, _candidate_taxon), = possible
            sample_id = f'{candidate_id}'
            source = df_sacha if _is_sacha_id(sample_id) else df_insidewood
            data.extend(_rows_for_sample(source, sample_id))
        else:
            not_lifted.append(possible)
    print(len(df_train.index), len(not_lifted))
    return data

def build_csv_kg2(df_train, df_insidewood, df_sacha):
    """Collect the observation rows of a training split by sample id only.

    A training id containing "BRS" or "OLI" is looked up as-is in
    ``df_sacha``; any other id is looked up in ``df_insidewood`` with an
    ``IW-`` prefix added.

    :param df_train: frame with an ``id_sample`` column
    :param df_insidewood: frame with a ``sample_id`` column
    :param df_sacha: frame with a ``sample_id`` column
    :returns: list of rows, each a list of the source frame's column values
    """
    data = []
    for raw_id in df_train["id_sample"]:
        # A split with only numeric ids is parsed as integers; the substring
        # tests need text, so convert once up front.
        train_id = str(raw_id)
        if ("BRS" in train_id) or ("OLI" in train_id):
            source, sample_id = df_sacha, train_id
        else:
            source, sample_id = df_insidewood, f'IW-{train_id}'
        for line in source.index[source["sample_id"] == sample_id]:
            data.append(list(source.loc[line]))
    return data

In [45]:
# Reference observation tables; the sacha table is used without its "collected_at" column.
df_insidewood = pd.read_csv("insidewood.csv", header=0, sep=";")
df_sacha = pd.read_csv("sacha.csv", header=0, sep=";").drop(columns=["collected_at"])

taxas = ["family", "genus"]
letters = ["A", "B", "C", "D"]
# Column names given to the rows returned by build_csv_kg.
label = ['sample_id', 'taxon_id', 'collection', 'observed_property', 'feature_value', 'feature_presence', 'FOI', 'original_data']

# One output file per (export letter, taxonomic rank) training split.
for letter in letters:
    for taxa in taxas:
        train_path = f"export_train_test/export_{letter}/export_train_powo_{taxa}.csv"
        print(train_path)
        df_train = pd.read_csv(train_path, header=0, sep=";")
        data = build_csv_kg(df_train, df_insidewood, df_sacha, graph)
        subdata = pd.DataFrame(data, columns=label)
        subdata.to_csv(f"subdata_{taxa}_{letter}.csv", index=False, encoding="utf-8", sep=";")


export_train_test/export_A/export_train_powo_family.csv
1506 78
export_train_test/export_A/export_train_powo_genus.csv
509 16
export_train_test/export_B/export_train_powo_family.csv
992 65
export_train_test/export_B/export_train_powo_genus.csv
282 9
export_train_test/export_C/export_train_powo_family.csv
328 38
export_train_test/export_C/export_train_powo_genus.csv
64 4
export_train_test/export_D/export_train_powo_family.csv
185 19
export_train_test/export_D/export_train_powo_genus.csv
48 4
