# imports

In [1]:
import pandas as pd
import numpy as np

import re

pd.set_option('display.max_colwidth', None)

# data_load

In [2]:
df1 = pd.read_excel("Continual_Learning/G1.xlsx", index_col=0)
df2 = pd.read_excel("Continual_Learning/G2.xlsx", index_col=0)
df3 = pd.read_excel("Continual_Learning/G3.xlsx", index_col=0)

In [3]:
# Dropping any rows with NaN values
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()

In [4]:
df1[["tags", "text"]].head()

Unnamed: 0,tags,text
0,"8:16:chronic_disease,20:32:treatment",portal fibrosis by liver biopsy
1,22:34:treatment,Contra-indication to liver biopsy
2,",32:44:treatment,,",Have a stable weight since the liver biopsy was performed defined by no more than a 5 % loss of initial body weight
3,"26:38:treatment,",Subject agrees to have a liver biopsy performed after 24 weeks of treatment
4,",43:55:treatment,",Liver steatosis (on visual estimate or on liver biopsy) > 30%


# Preprocessing

- Creating the following tagging scheme for the NER task:


| Entity_name | Token |
| --- | --- |
| Other | 0 |
| treatment | 1 |
| chronic_disease | 2 |
| cancer | 3 |
| allergy_name | 4 |

In [5]:
entity_ids = {
    "treatment": 1,
    "chronic_disease": 2,
    "cancer": 3,
    "allergy_name": 4,
    }
    

In [10]:
def find_word_index(txt, word):
    pattern = re.compile(r'\b{}\b'.format(re.escape(word)))

    # Find the index of the element containing the pattern
    word_index = next((index for index, element in enumerate(txt) if pattern.search(element)), None)

    return word_index

def get_ner_tokens(row):

    # Few tags have leading and trailing commas, removing them
    tag = row.tags.strip(",").strip()    # start:end:name, start:end:name, ... (start and end are in character level)

    # removing leading and trailing whitespace
    txt = row.text


    try:
        txt = txt.split()
    except:
        print(tag, txt)
        return None, None

    

    # labeled every word as other
    labels = np.zeros(len(txt))

    # iterate over all tages and mark them with their token
    for t in tag.split(","):
        if t == "":
            continue
        start, end, name = t.split(":")

        # as first character is considered as 1 in the dataset, but in python it is 0
        start, end = int(start), int(end)
        start -= 1
        end -= 1

        exact_word = " ".join(txt)[start:end]

        n_exact_words = len(exact_word.split())

        # check if word is more than one word, if yes then get the index of the first word and save total number of words
        if n_exact_words > 1:

            exact_word = exact_word.split()[0]

            word_index = find_word_index(txt, exact_word)
            # word_index = txt.index(exact_word)
            try:
                for i in range(word_index+1, word_index+n_exact_words):
                    labels[i] = entity_ids[name]
            except:
                return None, None

        else:
            

            word_index = find_word_index(txt, exact_word)

            labels[word_index] = entity_ids[name]


    return txt, labels



In [11]:
df1["tokens"], df1["labels"] = zip(*df1.apply(get_ner_tokens, axis=1))
df2["tokens"], df2["labels"] = zip(*df2.apply(get_ner_tokens, axis=1))
df3["tokens"], df3["labels"] = zip(*df3.apply(get_ner_tokens, axis=1))

In [12]:
# Drop rows with None values
df1.dropna(inplace=True)
df2.dropna(inplace=True)
df3.dropna(inplace=True)

In [13]:
df1.head()

Unnamed: 0,ID,tags,text,tokens,labels
0,NCT02105766,"8:16:chronic_disease,20:32:treatment",portal fibrosis by liver biopsy,"[portal, fibrosis, by, liver, biopsy]","[0.0, 2.0, 0.0, 0.0, 1.0]"
1,NCT03008070,22:34:treatment,Contra-indication to liver biopsy,"[Contra-indication, to, liver, biopsy]","[0.0, 0.0, 0.0, 1.0]"
2,NCT03008070,",32:44:treatment,,",Have a stable weight since the liver biopsy was performed defined by no more than a 5 % loss of initial body weight,"[Have, a, stable, weight, since, the, liver, biopsy, was, performed, defined, by, no, more, than, a, 5, %, loss, of, initial, body, weight]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,NCT03008070,"26:38:treatment,",Subject agrees to have a liver biopsy performed after 24 weeks of treatment,"[Subject, agrees, to, have, a, liver, biopsy, performed, after, 24, weeks, of, treatment]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,NCT02515708,",43:55:treatment,",Liver steatosis (on visual estimate or on liver biopsy) > 30%,"[Liver, steatosis, (on, visual, estimate, or, on, liver, biopsy), >, 30%]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"


In [14]:
df1.to_csv("processed_data/G1.csv", index=False)
df2.to_csv("processed_data/G2.csv", index=False)
df3.to_csv("processed_data/G3.csv", index=False)