# Extracting 'persname' and (specific context) from RIDGES Herb. 9.0 using graphANNIS
Created on Wed Sept 14 21:30 2022 by Henrik Schönemann <br/>

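Specific context extracted per match: the dipl-tokens of the match (dipl); the dipl-context (15 tokens left/right); the page number (pb_n); the document name (doc); and the dipl node ids (dipl-ids).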

Before running the script, you need to download the corpus data from https://www.laudatio-repository.org/browse/corpus/PySSCnMB7CArCQ9CNKFY/corpora <br/>
graphANNIS: https://github.com/korpling/graphANNIS <br/>
graphANNIS Python documentation: https://graphannis-python.readthedocs.io/en/stable/

## Modules

In [1]:
###Modules for showing progress and elapsed time
from progress.bar import Bar
from timeit import default_timer as timer
from datetime import timedelta
import time

###Modules for dealing with data
import networkx as nx
import pandas as pd
import re #regular expressions

###Workaround so that the (Windows) prompt/terminal interprets ANSI-Escape-Sequences (e.g. from the progress-bar) instead of printing them
import os
os.system("")

###graphANNIS-Modules
from graphannis.cs import CorpusStorageManager
from graphannis.util import node_name_from_match

## Script

In [6]:
###Configuration: everything a reader may need to adapt before running
CORPUS_DB_DIR = r'C:\Users\henri\OneDrive\Desktop\Wikibase-BA\Daten-Pipeline\corpus-data'
OUTPUT_CSV = r'C:\Users\henri\OneDrive\Desktop\Wikibase-BA\Daten-Pipeline\persname3.1_result.csv'
CORPUS_NAME = 'RIDGES_Herbology_Version9.0'
AQL_QUERY = 'persname' ###AQL-Query
#AQL_QUERY = 'persname="Carl_von_Linné"' ###AQL-Query for testing; usually ignored
MATCH_LIMIT = 100000 ###default limit = 10
CONTEXT_TOKENS = 15 ###dipl-tokens to the left/right; the column name 'dipl_ctx_lr15' assumes 15
TABLE_COLUMNS = ['persname', 'dipl', 'dipl_ctx_lr15', 'pb_n', 'doc', 'dipl-ids']


def token_number(node_id):
    """Return the integer formed by all digits in the part of `node_id` after '#'.

    e.g. 'RIDGES_Herbology_Version9.0/Doc#sTok2168_virtualSpan' -> 2168

    Raises IndexError if `node_id` contains no '#', and ValueError if the
    part after '#' contains no digit.
    """
    fragment = node_id.split('#')[1]
    return int(re.sub(r"\D", "", fragment))


def following_token_id(node_id):
    """Build the id of the dipl-token that follows `node_id`.

    The part before '#' is kept and the token number is incremented by one:
    '...Doc#sTok2168_virtualSpan' -> '...Doc#sTok2169_virtualSpan'
    """
    prefix = node_id.split('#')[0]
    return prefix + "#sTok" + str(token_number(node_id) + 1) + "_virtualSpan"


def with_following_token(node_ids):
    """Return a new list: `node_ids` in token order plus the inferred next token-id.

    The ids are sorted by their numeric token number. A plain string sort
    would put '...#sTok99...' after '...#sTok100...' and so pick the wrong
    "last" token as soon as the numbers differ in their count of digits.
    The input list is not modified. Raises IndexError for an empty list.
    """
    ordered = sorted(node_ids, key=lambda node_id: (node_id.split('#')[0], token_number(node_id)))
    ordered.append(following_token_id(ordered[-1]))
    return ordered


def doc_name_from_node_id(node_id):
    """Extract the document name (second path segment before '#') from a node-id.

    e.g. 'RIDGES_Herbology_Version9.0/BeschreibungDerGraeser-c27-31_1769_Schrebers#sTok2168_virtualSpan'
    -> 'BeschreibungDerGraeser-c27-31_1769_Schrebers'
    """
    path = node_id.split('#')[0].split('/')
    return path[1].replace(r'%C3%BC', 'ü') #'ü' only umlaut in string(doc)


def dipl_edge_sources(subgraph):
    """Return the source node-ids of all edges of component 'dipl' in `subgraph`.

    The leading 'salt:/' is stripped from the source names. An empty list is
    returned when the subgraph has no edges at all (the edge list then has no
    'annis::component_name' column to filter on).
    """
    edges = nx.to_pandas_edgelist(subgraph)
    if edges.empty or 'annis::component_name' not in edges.columns:
        return []
    sources = edges['source'].str.replace('salt:/', '', regex=False) #literal replacement, no regex needed
    return sources[edges['annis::component_name'] == 'dipl'].tolist()


def annotation_values(subgraph, key):
    """Return the values of annotation `key` for every node of `subgraph` that carries it."""
    return [attrs[key] for _, attrs in subgraph.nodes(data=True) if key in attrs]


###In Jupyter (and when run as a script) __name__ is "__main__", so this cell still executes;
###the guard only keeps the corpus access from running when the helpers above are imported elsewhere.
if __name__ == "__main__":
    with CorpusStorageManager(db_dir=CORPUS_DB_DIR) as csm:

        ###--- Section 1: Querying the corpus ---

        corp = CORPUS_NAME
        query = AQL_QUERY

        print("Accessing corpus '" + corp + "' via graphANNIS:")
        print("Query: " + query)

        ###Querying corpus
        print('Querying... ')
        start_query = timer() ###Timer-Start for query
        pers = csm.find(corp, query, limit=MATCH_LIMIT)
        end_query = timer() ###Timer-End for query

        ###--- Section 2: Iterating over matched nodes ---

        ###One dict per match; the DataFrame is built once after the loop
        ###instead of being enlarged cell by cell
        records = []

        print("Iterating over matched nodes...")
        start_table = timer() #Timer-Start for iterating and writing

        with Bar('Processing', max=len(pers)) as bar: #Displaying a progress-bar in prompt/terminal
            for match in pers:
                ###--- Section 2.1: Creating subgraph for the matched node ---

                ###Getting node-id from node-name
                ###NOTE(review): assumes each match is a list, so `node` is a list of node-ids - confirm
                node = node_name_from_match(match)
                pers_subg = csm.subgraph(corp, node)

                ###--- Section 2.2: Getting node-ids for all dipl-tokens of the match ---

                ###The 'dipl'-edges only yield all but the last dipl-node as source, so the
                ###last dipl-node-id is inferred from the highest one.
                ###If there are no 'dipl'-edges (only one dipl-node), the matched node itself is the seed.
                dipl_sources = dipl_edge_sources(pers_subg)
                dipl_nodes = with_following_token(dipl_sources if dipl_sources else node)

                ###--- Section 2.3: Getting node-ids for all dipl_ctx-tokens (left and right context) ---

                dipl_ctx_subg = csm.subgraph(corp, dipl_nodes, ctx_left=CONTEXT_TOKENS, ctx_right=CONTEXT_TOKENS)
                ctx_sources = dipl_edge_sources(dipl_ctx_subg)
                if ctx_sources:
                    dipl_ctx_list = with_following_token(ctx_sources)
                else:
                    ###No 'dipl'-edges in the context subgraph: fall back to the match's own nodes
                    dipl_ctx_list = list(dipl_nodes)

                ###--- Section 2.4: Creating subgraphs for all dipl-node-ids and dipl_ctx-node-ids ---

                dipl_subg = csm.subgraph(corp, dipl_nodes)
                dipl_ctx_subg = csm.subgraph(corp, dipl_ctx_list)

                ###Extraction of persname- and pb_n-annotation; if several nodes carry one, the last wins
                persnames = annotation_values(dipl_subg, "default_ns::persname")
                pages = annotation_values(dipl_subg, "default_ns::pb_n")

                records.append({
                    'persname': persnames[-1] if persnames else float('nan'),
                    ###Joining of all individual dipl- and dipl_ctx-tokens for this match
                    'dipl': ' '.join(annotation_values(dipl_subg, "default_ns::dipl")),
                    'dipl_ctx_lr15': ' '.join(annotation_values(dipl_ctx_subg, "default_ns::dipl")),
                    ###Write "NaN" in column 'pb_n' if no key 'pb_n' exists
                    'pb_n': pages[-1] if pages else "NaN",
                    ###Extraction of metadata::doc from the first part of a dipl-node-id
                    'doc': doc_name_from_node_id(dipl_nodes[0]),
                    'dipl-ids': dipl_nodes,
                })

                bar.next() #Next step in progress-bar

        ###Leaving the 'with Bar(...)' block above already ended the progress-bar

        ###Constructing table as pandas DataFrame
        table = pd.DataFrame(records, columns=TABLE_COLUMNS)
        end_table = timer() #Timer-End for iterating and writing

        ###--- Section 3: Printing summary and writing output to file ---

        table.info(verbose=False) #'verbose=False' to not print per column information
        print(table.head()) #Print first 5 rows

        table.to_csv(OUTPUT_CSV)

        print("---")
        print("Elapsed time for querying: "+ str(timedelta(seconds=end_query-start_query)))
        print("Elapsed time for writing: "+ str(timedelta(seconds=end_table-start_table)))

Accessing corpus 'RIDGES_Herbology_Version9.0' via graphANNIS:
Query: persname
Querying... 
Iterating over matched nodes...


KeyboardInterrupt: 