In [13]:
import rdflib

# Construction, enrichment and evaluation of the Solarchem Knowledge Graph

## Quick cleaning
Deleting fields that should be empty

In [38]:
!sed 's/"N\/A"//g' data/raw-catalystsdata.csv | sed 's/"-1"//g' | sed 's/"n\/a"//g' | sed 's/"-1.00"//g' | sed 's/"-1.0"//g' | sed 's/"0.00"//g' > data/catalystsdata.csv

## Mapping generation
Running Mapeathor to create the mapping file in YARRRML

In [116]:
!python -m mapeathor -i solarchem-mapping.xlsx -l yarrrml -o solarchem-mapping.yml

Generating mapping file
Your mapping file is in solarchem-mapping.yml


## KG construction
Running Morph-KGC to construct the KG with the mapping generated in the previous step and the data in `./data`.

In [117]:
!python -m morph_kgc config.ini

  fnml_df = fnml_df.applymap(str)
INFO | 2024-03-22 19:10:38,775 | 47 mapping rules retrieved.
  self.rml_df = self.rml_df.replace(r'^\s*$', np.nan, regex=True)
INFO | 2024-03-22 19:10:38,807 | Mapping partition with 37 groups generated.
  f"{self.rml_df['mapping_partition'].value_counts()[0]}.")
INFO | 2024-03-22 19:10:38,808 | Maximum number of rules within mapping group: 3.
INFO | 2024-03-22 19:10:38,808 | Mappings processed in 1.044 seconds.
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.appl

  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
  data = data.applymap(str)
INFO | 2024-03-22 19:10:42,803 | Number of triples generated in total: 88873.
INFO | 2024-03-22 19:10:42,803 | Materialization finished in 3.992 seconds.


## KG validation
Quick validation to check that the KG is being generated with all the needed properties of the ontology.

In [118]:
g = rdflib.Graph()
g.parse("solarchem.nt")
print(len(g))

88873


### Articles

In [44]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>

    SELECT (COUNT (DISTINCT ?article) AS ?count)
    WHERE {
        ?article a bibo:Article .
    }

""")

for row in q_res:
    print(f"Number of articles: {row['count'].value}")

Number of articles: 1268


In [45]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>

    SELECT (COUNT (DISTINCT ?article) AS ?count)
    WHERE {
        ?article a bibo:Article ;
            bibo:doi ?doi ;
            bibo:abstract ?abs ;
            bibo:volume ?volume ;
            dc:title ?title ;
            bibo:issue ?issue ;
            dc:date ?date ;
            bibo:pageStart ?page .
    }

""")

for row in q_res:
    print(f"Number of articles with all properties: {row['count'].value}")

Number of articles with all properties: 421


In [46]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>

    SELECT (COUNT (DISTINCT ?article) AS ?count)
    WHERE {
        ?article a bibo:Article ;
            solar:hasExperimentExecution ?expexec .
    }

""")

for row in q_res:
    print(f"Number of articles with experiments associated: {row['count'].value}")

Number of articles with experiments associated: 1096


### Experiment Executions

In [75]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>

    SELECT (COUNT (DISTINCT ?input) AS ?count)
    WHERE {
        ?expexec a solar:ExperimentExecution ;
            prov:used ?input .
    }

""")

for row in q_res:
    print(f"Number of experiments: {row['count'].value}")

Number of experiments: 9704


### Inputs

In [94]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>
    PREFIX solarpc: <http://w3id.org/solar/o/pc#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX qudt: <http://qudt.org/2.1/schema/qudt>

    SELECT (COUNT (DISTINCT ?input) AS ?count)
    WHERE {
        ?input a solar:Input ;
            rdfs:label ?label ;
            solar:hasRole solar:Catalyst ;
            qudt:numericValue ?value ;
            qudt:unit ?unit
    }

""")

for row in q_res:
    print(f"Number of catalyts with all properties: {row['count'].value}")

Number of catalyts with all properties: 6102


In [107]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>
    PREFIX solarpc: <http://w3id.org/solar/o/pc#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX qudt: <http://qudt.org/2.1/schema/qudt>

    SELECT (COUNT (DISTINCT ?cocatalyst) AS ?count) 
    WHERE {
        ?cocatalyst a solar:Input ;
            rdfs:label ?label ;
            solar:hasRole solar:Co-catalyst ;
            qudt:floatPercentage ?percent .
    }

""")

for row in q_res:
    print(f"Number of co-catalyts with all properties: {row['count'].value}")

Number of co-catalyts with all properties: 3239


In [112]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>
    PREFIX solarpc: <http://w3id.org/solar/o/pc#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX qudt: <http://qudt.org/2.1/schema/qudt>

    SELECT (COUNT (DISTINCT ?support) AS ?count) 
    WHERE {
        ?support a solar:Input ;
            rdfs:label ?label ;
            solar:hasRole solarpc:Support ;
            qudt:floatPercentage ?percent .
    }

""")

for row in q_res:
    print(f"Number of supports with all properties: {row['count'].value}")

Number of supports with all properties: 381


In [113]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>
    PREFIX solarpc: <http://w3id.org/solar/o/pc#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX qudt: <http://qudt.org/2.1/schema/qudt>

    SELECT (COUNT (DISTINCT ?dopant) AS ?count) 
    WHERE {
        ?dopant a solar:Input ;
            rdfs:label ?label ;
            solar:hasRole solar:Dopant ;
            qudt:floatPercentage ?percent .
    }

""")

for row in q_res:
    print(f"Number of dopants with all properties: {row['count'].value}")

Number of dopants with all properties: 766


In [121]:
q_res = g.query("""    
    PREFIX bibo: <http://purl.org/ontology/bibo/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX solar: <http://w3id.org/solar/o/core#>
    PREFIX solarpc: <http://w3id.org/solar/o/pc#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX qudt: <http://qudt.org/2.1/schema/qudt>

    SELECT (COUNT (DISTINCT ?dye) AS ?count) 
    WHERE {
        ?dye a solar:Input ;
            rdfs:label ?label ;
            solar:hasRole solarpc:Dye ;
            qudt:floatPercentage ?percent .
    }

""")

for row in q_res:
    print(f"Number of dyes with all properties: {row['count'].value}")

Number of dyes with all properties: 199
