# Formatting gold corpora
The entries annotated with LabelStudio, which that program exports as JSON, are converted in this notebook into two formats: an EUPEG-standard XML file and a Pandas dataframe, which is saved as a CSV.

In [2]:
import pandas as pd, numpy as np

#### Select which corpus to work on. Comment out the other:

In [38]:
# Name of the corpus to process: "fingernews" or "fingertweets".
# It selects the input/output file names and which JSON keys are read below.
corpus = "fingernews"
#corpus = "fingertweets"

## Formatting as a CSV
The data is read in from the JSON and formed into a dataframe that's similar to Finger's output dataframe, which makes evaluation easier. The CSV can either be "exploded", which means that every individual toponym is on its own row, or alternatively each document can be on one row with its toponyms, coordinates etc. stored in lists. Anonymization can be applied, which means that the original texts are excluded and replaced with Tweet IDs or URLs for Fingertweets and Fingernews respectively.

In [56]:
def _parse_lonlat(raw_coords):
    """Turn an annotated "lat, lon" string into a (lon, lat) tuple of floats.

    Returns None when the string is the placeholder 'nan' (case-insensitive,
    surrounding whitespace ignored).
    """
    # Strip before comparing: a padded placeholder such as " NaN " would
    # otherwise be parsed by float() and end up as the bogus tuple (nan,).
    if raw_coords.strip().lower() == 'nan':
        return None
    # latlon -> lonlat; float() tolerates whitespace around each number
    return tuple(float(part) for part in reversed(raw_coords.split(',')))


def create_gold_corpus(df, corpus="fingernews", explode_df=True, anon=False):
    """Build a gold-standard dataframe from a LabelStudio JSON export.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'id', 'data' and 'annotations'. Each 'data' cell is
        indexed with 'url'/'text' (fingernews) or 'id_str'/'full_text' (any
        other corpus); each 'annotations' cell is a sequence whose first item
        has a 'result' list.
    corpus : str
        "fingernews" selects the 'url'/'text' keys; any other value selects
        'id_str'/'full_text'.
    explode_df : bool
        If True, every toponym gets its own row; otherwise each document is
        one row and 'locations', 'loc_spans' and 'coord_points' hold lists.
    anon : bool
        If True, 'input_text' holds the identifier (URL / tweet id) instead
        of the original text and no 'id' column is created.

    Returns
    -------
    pandas.DataFrame
        Columns 'input_text', ('id' unless anon), 'locations', 'loc_spans',
        'coord_points' and 'input_order'. 'input_order' is the document's
        position after sorting by 'id' (repeated for every toponym of the
        same document when exploded). NaN values are replaced with None.
    """
    df = df.sort_values(by='id')

    entry_dicts = []
    for annotations, texts in zip(df['annotations'].tolist(), df['data'].tolist()):
        # only the first annotation of each task is used
        entry = annotations[0]

        if corpus == "fingernews":
            identification = texts['url']
            text = texts['text']
        else:
            identification = texts['id_str']
            text = texts['full_text']

        loc_spans = []
        locations = []
        coord_points = []
        for res in entry['result']:
            value = res['value']
            # the coordinate string is the first item of res['meta']['text']
            coord_points.append(_parse_lonlat(res['meta']['text'][0]))
            loc_spans.append((value['start'], value['end']))
            locations.append(value['text'])

        if anon:
            entry_dict = {'input_text': identification, 'locations': locations,
                          'loc_spans': loc_spans, 'coord_points': coord_points}
        else:
            entry_dict = {'input_text': text, 'id': identification, 'locations': locations,
                          'loc_spans': loc_spans, 'coord_points': coord_points}
        entry_dicts.append(entry_dict)

    df_gold = pd.DataFrame(entry_dicts)

    if explode_df:
        # the three list columns are parallel, so they are exploded together
        df_gold = df_gold.explode(['locations', 'loc_spans', 'coord_points'])

    # the index still identifies the source document after exploding
    df_gold['input_order'] = df_gold.index
    df_gold = df_gold.replace([np.nan], [None])

    return df_gold

In [57]:
# Load the LabelStudio JSON export of the selected corpus into a dataframe
annotated = pd.read_json("./input_data/{}_annotated.json".format(corpus))

In [58]:
# One row per annotated toponym (explode_df=True); original texts are kept (anon=False)
formatted = create_gold_corpus(annotated, corpus=corpus, explode_df=True, anon=False)

In [60]:
# Save the gold dataframe next to the input data; with pandas' default
# settings the dataframe index is written as the first CSV column
formatted.to_csv("./input_data/{}_gold_df.csv".format(corpus))

## Formatting as EUPEG standard XML
The functions below will finally print a correctly formatted XML. Copy + paste that to a text editor and save it as an XML file (or find a way to save it directly to an XML file without breaking anything; I didn't).

In [43]:
from dicttoxml import dicttoxml
from xml.dom.minidom import parseString

In [44]:
def format_eupeg_dict(data, anno, corpus, anon=False):
    """Create a list of EUPEG-style entry dictionaries from LabelStudio data.

    The keys follow the naming format set in the EUPEG geocorpus evaluation
    platform, so the result can be transformed to the XML format defined by
    EUPEG.

    Parameters
    ----------
    data : list
        Input items; each is indexed with 'url'/'text' when corpus is
        "fingernews", otherwise with 'id_str'/'full_text'.
    anno : list
        Annotations parallel to `data`; only the first annotation of each
        item is used, and its 'result' list is iterated.
    corpus : str
        "fingernews" or any other value (treated as the tweet corpus).
    anon : bool
        If True, the 'text' field holds the identifier (URL / tweet id)
        instead of the original text.

    Returns
    -------
    list of dict
        One {'text': ..., 'toponyms': [...]} dictionary per input item. Each
        toponym has 'start', 'end', 'phrase' and 'place': {'footprint': ...},
        where the footprint is the annotated "lat, lon" string rewritten as
        "lon lat".
    """
    entry_dicts = []
    for annotations, texts in zip(anno, data):
        entry = annotations[0]

        if corpus == "fingernews":
            identification = texts['url']
            text = texts['text']
        else:
            identification = texts['id_str']
            text = texts['full_text']

        toponyms = []
        for res in entry['result']:
            value = res['value']
            coords = res['meta']['text']

            # latlon -> lonlat. Every component is stripped so that stray
            # whitespace anywhere in the annotation (not only at the start)
            # cannot leak into the footprint.
            # NOTE(review): a 'nan' placeholder is passed through as the
            # footprint text here, whereas create_gold_corpus maps it to
            # None -- confirm this is intended for the XML output.
            lonlat = [part.strip() for part in reversed(coords[0].split(','))]
            formatted_coords = " ".join(lonlat)

            toponym = {'start': value['start'], 'end': value['end'], 'phrase': value['text'],
                       'place': {'footprint': formatted_coords}}
            toponyms.append(toponym)

        if anon:
            entry_dict = {'text': identification, 'toponyms': toponyms}
        else:
            entry_dict = {'text': text, 'toponyms': toponyms}
        entry_dicts.append(entry_dict)
    return entry_dicts

In [45]:
# Load LabelStudio's JSON export and order the tasks by their 'id'
annotated = pd.read_json("./input_data/{}_annotated.json".format(corpus)).sort_values(by='id')

# The input texts live in the "data" column and the labels in "annotations";
# both are pulled out as plain Python lists
data, anno = (annotated[column].tolist() for column in ('data', 'annotations'))

In [50]:
# Build the EUPEG-style entry dictionaries, keeping the original texts (anon=False)
entry_dicts = format_eupeg_dict(data,anno,corpus, anon=False)

The dictionaries must still be output as XML. I explored a few options and this was the first one that worked.
1. Using the simple dicttoxml library, which does what it says on the tin
    - requires a few tricks, such as the function to ensure that the sub-levels are named correctly
2. That output is kinda ugly, so it is parsed into a DOM object
3. That is then printed as a prettified string: I copy + pasted that printed string to a text editor and saved it as XML (not the optimal solution, but it works)

In [51]:
def my_item_func(x):
    """Return the XML tag name for the items of the list whose parent tag is `x`.

    Passed to dicttoxml as `item_func`: items under 'entries' become
    <entry> elements and items under 'toponyms' become <toponym> elements.
    Any other parent name falls back to 'item', dicttoxml's own default,
    instead of implicitly returning None.
    """
    item_names = {'entries': 'entry', 'toponyms': 'toponym'}
    return item_names.get(x, 'item')

In [52]:
# Serialize the entry dictionaries under an <entries> root element;
# my_item_func supplies the tag names for the list items
xml = dicttoxml(entry_dicts, attr_type=False, custom_root='entries', item_func=my_item_func)

In [None]:
# Parse the generated XML into a DOM so it can be re-serialized with indentation
dom = parseString(xml)

# Print the indented XML; this printed text is what gets copied into an XML file by hand
print(dom.toprettyxml())