# Libraries & Functions

In [6]:
import numpy as np
import pandas as pd
from pandas.errors import EmptyDataError

In [7]:
import re
import os
import glob
from tqdm import tqdm

In [8]:
def get_ann(ann_filename):
    def split_offsets(row):
	    items = row["offsets"].split(" ")
	    row["type"] = items[0]    
	    row["start"] = int(items[1])
	    row["end"] = int(items[-1])
	    return row
	    
    try:
        doc_ann = pd.read_csv(ann_filename, sep = "\t", header=None)
    except EmptyDataError:
        return pd.DataFrame()
    else:
        if not doc_ann.empty:
            doc_ann.columns = ["index", "offsets", "text"]
            doc_ann = doc_ann.set_index("index")
            doc_ann = doc_ann.dropna()
            doc_ann = doc_ann.apply(split_offsets, axis=1)
            doc_ann = doc_ann.drop(["offsets"], axis=1)
            doc_ann = doc_ann[(doc_ann['type'] == "Taxon") | (doc_ann['type'] == "Microorganism") | (doc_ann['type'] == "LIVB") | (doc_ann['type'] == "Species") ]
            doc_ann = doc_ann.replace("Taxon", "LIVB")
            doc_ann = doc_ann.replace("Microorganism", "LIVB")
    return doc_ann

In [52]:
def annotate_txt(data_txt, data_ann):
    def annotate_token(string, index_start, index_end, token):
        return string[:index_start] + '[' + string[index_start:index_end] + "](" + token + ")" + string[index_end:]

    data_ann = data_ann.sort_values(["start"], ascending = False)
    for row in data_ann.values:
        if(data_txt[int(row[2]): int(row[3])] != row[0]):
            print("Something is off")
        data_txt = annotate_token(data_txt, row[2], row[3], row[1])    
    return data_txt

In [10]:
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []
    token_list = []
    entity_list = []
    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
                token_list.append(raw_entity_token)
                entity_list.append(entity_name)
        else:
            tokens_with_entities.append((raw_token, "O"))
            token_list.append(raw_token)
            entity_list.append("O")
            
    filtered_token_list = [i[0] for i in zip(token_list, entity_list) if i[0] != ""]
    filtered_entity_list = [i[1] for i in zip(token_list, entity_list) if i[0] != ""]
    return filtered_token_list, filtered_entity_list #tokens_with_entities

In [158]:
from datasets import Dataset, DatasetDict, ClassLabel, Sequence

def prepare_data(dataset_tokens, dataset_entities):
    data = []
    for i, (token, entity) in enumerate(zip(dataset_tokens, dataset_entities)):
        data.append( 
            {"tokens":token, 
            "ner_tags":entity,
            "idx":i
            })
    df = pd.DataFrame(data)
    return Dataset.from_pandas(df)

# Input Data

## Example of Preprocessing Pipeline

In [48]:
PATH_TO_S800 = './corpora/S800_GSC_brat/s800'

In [49]:
file_list_txt = glob.glob(PATH_TO_S800 + "/train/*.txt")

In [50]:
len(file_list_txt)

560

In [15]:
for file_txt in file_list_txt:
    file_ann = file_txt[:-3] + "ann"
    with open(file_txt, "r") as f:
        data_txt = f.read().replace("\r\n", "\n")
        data_txt = re.sub(r'[^\x00-\x7F]+',' ', data_txt)
    data_ann = get_ann(file_ann)
    break

In [16]:
data_txt

'Methanoregula formicica sp. nov., a methane-producing archaeon isolated from methanogenic sludge. \n\nA novel methane-producing archaeon, strain SMSP(T), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 mum long by 0.5 mum wide; cells were sometimes up to 7 mum long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 mum in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 ^0C (optimum, 30-33 ^0C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha subunit of methyl-coen

In [17]:
data_ann

Unnamed: 0_level_0,text,type,start,end
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T1,Methanoregula formicica,Species,0,23
T2,SMSP(T),Species,143,150
T3,SMSP(T),Species,1076,1083
T4,Methanoregula boonei 6A8,Species,1225,1249
T5,SMSP(T),Species,1423,1430
T6,Methanoregula boonei 6A8,Species,1435,1459
T7,SMSP(T),Species,1770,1777
T8,Methanoregula formicica,Species,1863,1886
T9,SMSP(T),Species,1918,1925


In [18]:
for row in data_ann.values:
    print(row[0], row[2], row[3], "_______", data_txt[int(row[2]): int(row[3])])

Methanoregula formicica 0 23 _______ Methanoregula formicica
SMSP(T) 143 150 _______ SMSP(T)
SMSP(T) 1076 1083 _______ SMSP(T)
Methanoregula boonei 6A8 1225 1249 _______ Methanoregula boonei 6A8
SMSP(T) 1423 1430 _______ SMSP(T)
Methanoregula boonei 6A8 1435 1459 _______ Methanoregula boonei 6A8
SMSP(T) 1770 1777 _______ SMSP(T)
Methanoregula formicica 1863 1886 _______ Methanoregula formicica
SMSP(T) 1918 1925 _______ SMSP(T)


In [19]:
data_txt_annotated = annotate_txt(data_txt, data_ann)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]


In [20]:
data_txt

'Methanoregula formicica sp. nov., a methane-producing archaeon isolated from methanogenic sludge. \n\nA novel methane-producing archaeon, strain SMSP(T), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 mum long by 0.5 mum wide; cells were sometimes up to 7 mum long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 mum in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 ^0C (optimum, 30-33 ^0C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha subunit of methyl-coen

In [21]:
data_txt_annotated

'[Methanoregula formicica](Species) sp. nov., a methane-producing archaeon isolated from methanogenic sludge. \n\nA novel methane-producing archaeon, strain [SMSP(T)](Species), was isolated from an anaerobic, propionate-degrading enrichment culture that was originally obtained from granular sludge in a mesophilic upflow anaerobic sludge blanket (UASB) reactor used to treat a beer brewery effluent. Cells were non-motile, blunt-ended, straight rods, 1.0-2.6 mum long by 0.5 mum wide; cells were sometimes up to 7 mum long. Asymmetrical cell division was observed in rod-shaped cells. Coccoid cells (0.5-1.0 mum in diameter) were also observed in mid- to late-exponential phase cultures. Growth was observed between 10 and 40 ^0C (optimum, 30-33 ^0C) and pH 7.0 and 7.6 (optimum, pH 7.4). The G+C content of the genomic DNA was 56.2 mol%. The strain utilized formate and hydrogen for growth and methane production. Based on comparative sequence analyses of the 16S rRNA and mcrA (encoding the alpha 

In [22]:
token_list, entity_list = get_tokens_with_entities(data_txt_annotated)

In [23]:
len(token_list), len(entity_list)

(289, 289)

## Preprocessing

In [159]:
dataset_S800 = DatasetDict()
PATH_TO_S800 = './corpora/S800_GSC_brat/s800'

label_names_dict = {"O": 0, "B-Species": 1, "I-Species": 2}
print("Dataset Label Names :", list(label_names_dict.keys()))

dataset_entities_all = []
for set_type in tqdm(["train", "test", "dev"]):
    file_list_txt = glob.glob(PATH_TO_S800 + f"/{set_type}/*.txt")
    dataset_tokens = []
    dataset_entities = []

    for file_txt in file_list_txt:
        file_ann = file_txt[:-3] + "ann"
        with open(file_txt, "r") as f:
            data_txt = f.read().replace("\r\n", "\n")
            data_txt = re.sub(r'[^\x00-\x7F]+',' ', data_txt)
        data_ann = get_ann(file_ann)
        if(len(data_ann)==0):
            continue
        data_txt_annotated = annotate_txt(data_txt, data_ann)
        token_list, entity_list = get_tokens_with_entities(data_txt_annotated)
        entity_list = ["O" if entity=="B-cyt" else entity for entity in entity_list]
        entity_list = [label_names_dict[entity] for entity in entity_list]
        
        dataset_tokens.append(token_list)
        dataset_entities.append(entity_list)
        dataset_entities_all.append(entity_list)
        
    dataset_S800[set_type] = prepare_data(dataset_tokens, dataset_entities)
    dataset_S800[set_type] = dataset_S800[set_type].cast_column('ner_tags', Sequence(ClassLabel(num_classes=len(label_names_dict), names=list(label_names_dict.keys()))))

Dataset Label Names : ['O', 'B-Species', 'I-Species']


  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

Casting the dataset:   0%|          | 0/437 [00:00<?, ? examples/s]

 33%|████████████████████████████                                                        | 1/3 [00:03<00:07,  3.57s/it]

Casting the dataset:   0%|          | 0/125 [00:00<?, ? examples/s]

 67%|████████████████████████████████████████████████████████                            | 2/3 [00:04<00:02,  2.10s/it]

Casting the dataset:   0%|          | 0/63 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.76s/it]


## Save Dataset

In [160]:
dataset_S800.save_to_disk("./corpora/S800_GSC_brat/S800_HF")

Saving the dataset (0/1 shards):   0%|          | 0/437 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/125 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

In [147]:
dataset_S800 = DatasetDict().load_from_disk("./corpora/S800_GSC_brat/S800_HF")

In [148]:
raw_datasets["train"]["ner_tags"]

[[3, 0, 7, 0, 0, 0, 7, 0, 0],
 [1, 2],
 [5, 0],
 [0,
  3,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [5,
  0,
  0,
  0,
  0,
  3,
  4,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  7,
  0,
  0,
  0,
  0,
  5,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,

In [90]:
raw_datasets["train"].features["ner_tags"].feature

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [94]:
from datasets import ClassLabel
ClassLabel(num_classes=len(label_names_dict), names=list(label_names_dict.keys()), id=None)

ClassLabel(names=['O', 'B-Species', 'I-Species', 'B-cyt'], id=None)

In [93]:
len(label_names_dict)

4