# Import Libraries

In [8]:
# Data
import pandas as pd
import numpy as np
import time

# Ontology
#!pip install owlready2 
#import owlready2 as owl
from owlready2 import *
import re  # To separate words based on capital letters in onto classes & to split search queries
#!pip install EMMOntoPy #Special EMMO package
# from ontopy import get_ontology
print("Import done")

Import done


## Ontology

In [9]:
"""Import Ontology & Select classes"""

# Location of the ontology; only the first entry of this list is loaded below.
# Note: every ontology file has to be an OWL file.
# NOTE(review): `from owlready2 import *` also exports a list called `onto_path`
# (owlready2's local search path); this assignment rebinds the name in the
# notebook - confirm that this is intended.
onto_path = ["https://raw.githubusercontent.com/hendelhendel/FAIR_Battery/main/Ontology/test2.owl"]
#onto_path = ["https://raw.githubusercontent.com/hendelhendel/FAIR_Battery/main/Ontology/flowtest.owl"]
#onto_path = ["https://github.com/hendelhendel/FAIR_Battery/blob/main/Ontology/flowbatterytest.owl"]

# Import ontology.
# The first attempt may fail; in that case retry exactly once and let the error
# of the second attempt propagate. `except Exception` (instead of a bare
# `except`) keeps KeyboardInterrupt / SystemExit working.
try:
    onto = get_ontology(onto_path[0]).load()
except Exception:
    onto = get_ontology(onto_path[0]).load()

# Collect the classes of the ontology in a list.
# Reuse the already loaded `onto` instead of requesting the ontology a third time.
class_raw = list(onto.classes())

# Select classes by prefix, suffix and name tag
tag = 'ElectrochemicalFlowCell'
prefix = 'electrochemistry.'


def ClassCleaner(x):
    """Turn an ontology class into a readable search term.

    str(x) is stripped of the trailing `tag` and the leading `prefix`, a space
    is inserted in front of every capital letter that follows a word character,
    and underscores are replaced by spaces.
    Returns "!EMPTY CLASS" when `prefix` does not occur in str(x).

    NOTE(review): '_' counts as a word character, so an underscore directly
    followed by a capital ("Flow_Field") ends up as two spaces ("Flow  Field").
    """
    full_name = str(x)
    if prefix not in full_name:
        return "!EMPTY CLASS"
    bare_name = full_name.removesuffix(tag).removeprefix(prefix)
    spaced_name = re.sub(r"(?<=\w)([A-Z])", r" \1", bare_name)
    return spaced_name.replace('_', ' ')
                            

# Keep the classes whose name contains the tag and turn each into a search term
class_select = [ClassCleaner(onto_class) for onto_class in class_raw if tag in str(onto_class)]

print(f"Your imported ontology with {len(class_select)} classes is ready to use")


Your imported ontology with 33 classes is ready to use


## Process Data

In [10]:
"""Import Zotero Data Base from Github"""

# Import raw data from github repository as dataframe
# data_path is the URL of the raw CSV file; pd.read_csv downloads and parses it,
# so this cell needs network access.
data_path = 'https://raw.githubusercontent.com/SanliFaez/FAIR-Battery-knowledgebase/main/Datamanagement/Data_Raw.csv'
df_raw = pd.read_csv(data_path)  # the processing cells below read its "Abstract Note" column


In [11]:
"""Data Process Functions"""

# Function Text Cleaner
def f_CleanText(text):
    """Lower-case a text and split it into sentences.

    The text is split wherever '.', '!' or '?' is followed by a space; the
    punctuation mark and that space are removed. Punctuation that is not
    followed by a space (including a final '.') does not split.

    Input : a string; anything else (e.g. NaN for a missing abstract) is
            treated as "no text".
    Output: list of strings, or ["Not Available"] for non-string input.
    Example: "Sentence1. Sentence2! Sentence3? End"
             --> ["sentence1", "sentence2", "sentence3", "end"]
    """
    if not isinstance(text, str):
        return ["Not Available"]
    # Raw string: the backslashes belong to the regex, not to Python escapes.
    return re.split(r"\. |\! |\? ", text.lower())
    
    
# Function Keyword Search
def f_SearchKeyword(text, keyword):
    """Return `text` unchanged when it contains `keyword`, otherwise None.

    Input : text = string, keyword = string
    Example (applied to one sentence at a time, keyword = "_a"):
        "sentence1"  --> None
        "sentence_a" --> "sentence_a"
    """
    if text.find(keyword) == -1:
        return None
    return text
        
# Function Scan a list of sentences
def f_ScanList(text, keyword):
    """Apply f_SearchKeyword to every element of `text` with the same keyword.

    Input : text = list of strings, keyword = string
    Output: list of the same length as `text`; each entry is whatever
            f_SearchKeyword returns for that element.
    Example:
        text    = ["sentence1", "sentence_a"]
        keyword = "_a"
        Output  = [None, "sentence_a"]
    """
    return [f_SearchKeyword(sentence, keyword) for sentence in text]
                                        
#f_ScanList = lambda text, keyword: f_SearchKeyword(text, 'keyword') #if (type(text) == str) else "appel"
#f_ScanList = lambda text, keyword: len(text) if (type(text) == str) else "appel"

In [12]:
"""Process Data 1"""
start = time.time()  # start of the run-time measurement

# Column names: source of the raw text and destination of the cleaned text
SourceText = "Abstract Note"
ProcessedText = "Clean Abstract Note"

# Work on a copy so that df_raw itself is not modified
df_processed = df_raw.copy()

# Split every abstract into a list of sentences with f_CleanText
df_processed[ProcessedText] = df_raw[SourceText].map(f_CleanText)

# Function to search the data for one ontology class
def f_OntoSearch(onto, df=None):
    """Search every row of a data frame for one ontology class.

    onto : search term (string); it is lower-cased before searching.
    df   : data frame whose ProcessedText column holds one list of sentences
           per row. Defaults to the global df_processed, so the original
           one-argument call keeps working.
    Returns a list with one f_ScanList result per row.
    """
    frame = df_processed if df is None else df
    keyword = onto.lower()
    return [f_ScanList(sentences, keyword) for sentences in frame[ProcessedText]]


# Search data for every ontology class and store the results in df
def f_AddSearchResults(df, ontology):
    """Add one search-result column per ontology class to `df` (in place).

    df       : data frame with a ProcessedText column; it receives the new columns.
    ontology : iterable of search terms (strings); str(term) is the column name.
    """
    for onto in ontology:
        # Search the frame that was passed in, not the global df_processed.
        df[str(onto)] = f_OntoSearch(onto, df)

f_AddSearchResults(df_processed, class_select)

# Elapsed time since `start`
end = time.time()
calc_time = end - start

print(f"Your {len(df_processed)} articles are searched on {len(class_select)} ontology classes in {calc_time} seconds.")
print("type df_processed to view data frame")

# Store processed data as CSV: useful for looking through the data
df_processed.to_csv('ProcessedData_ReadVersion.csv', index=False, encoding='utf-8')
# For input in other notebooks, a pickle file is needed to keep data types
#df_processed.to_pickle('ProcessedData_ReadVersion.pickle')


Your 5644 articles are searched on 33 ontology classes in 0.49408435821533203 seconds.
type df_processed to view data frame


In [13]:
"""Process data 2"""

# Function Keyword Search (yes/no variant)
def f_SearchKeyword2(text, keyword):
    """Return 'y' when `text` contains `keyword`, otherwise 'n'.

    Input : text = string, keyword = string
    Example (applied to one sentence at a time, keyword = "_a"):
        "sentence1"  --> 'n'
        "sentence_a" --> 'y'
    """
    return 'n' if text.find(keyword) == -1 else 'y'
        # Function Scan lists of lists with text
def f_ScanList2(text, keyword):
    """Apply f_SearchKeyword2 to every element of `text` with the same keyword.

    Input : text = list of strings, keyword = string
    Output: list of the same length as `text`; each entry is whatever
            f_SearchKeyword2 returns for that element.
    Example:
        text    = ["sentence1", "sentence_a"]
        keyword = "_a"
        Output  = ['n', 'y']
    """
    return [f_SearchKeyword2(sentence, keyword) for sentence in text]
        
"""Process Data"""
start2 = time.time()  # start of the run-time measurement

# Column names: source of the raw text and destination of the cleaned text
SourceText = "Abstract Note"
ProcessedText = "Clean Abstract Note"

# Work on a copy so that df_raw itself is not modified
df_processed2 = df_raw.copy()

# Split every abstract into a list of sentences with f_CleanText
df_processed2[ProcessedText] = df_raw[SourceText].map(f_CleanText)

# Function to search the data for one ontology class (yes/no variant)
def f_OntoSearch(onto, df=None):
    """Search every row of a data frame for one ontology class.

    onto : search term (string); it is lower-cased before searching.
    df   : data frame whose ProcessedText column holds one list of sentences
           per row. Defaults to the global df_processed2, so the original
           one-argument call keeps working.
    Returns a list with one f_ScanList2 result per row.
    """
    frame = df_processed2 if df is None else df
    keyword = onto.lower()
    return [f_ScanList2(sentences, keyword) for sentences in frame[ProcessedText]]


# Search data for every ontology class and store the results in df
def f_AddSearchResults(df, ontology):
    """Add one search-result column per ontology class to `df` (in place).

    df       : data frame with a ProcessedText column; it receives the new columns.
    ontology : iterable of search terms (strings); str(term) is the column name.
    """
    for onto in ontology:
        # Search the frame that was passed in, not the global df_processed2.
        df[str(onto)] = f_OntoSearch(onto, df)

f_AddSearchResults(df_processed2, class_select)

# Elapsed time since `start2`
end2 = time.time()
calc_time = end2 - start2

print(f"Your {len(df_processed2)} articles are searched on {len(class_select)} ontology classes in {calc_time} seconds.")
# This cell fills df_processed2, so that is the frame the user should inspect.
print("type df_processed2 to view data frame")

# Store processed data for further use.
df_processed2.to_csv('ProcessedData.csv', index=False, encoding='utf-8')
df_processed2.to_pickle('ProcessedData.pickle')


Your 5644 articles are searched on 33 ontology classes in 0.4173307418823242 seconds.
type df_processed to view data frame


In [14]:
# File names are spelled exactly as they are written by the cells above.
print("All data is processed. ProcessedData.pickle will be used in the data search engine. If you want to read what is in this dataset, open ProcessedData.csv. If you want to know what exactly is found in each article open ProcessedData_ReadVersion.csv")

All data is processed. processedData.pickle will be used in the data search egine. If you want to read what is in this dataset, open ProcessedData.csv. If you want to know what exactly is found in each article open ProcessedData_Readversion.csv
