# imports

In [1]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.tokenize import word_tokenize

from datasets import Dataset

pd.set_option('display.max_colwidth', None)

# data_load

In [2]:
df1 = pd.read_excel("Continual_Learning/G1.xlsx", index_col=0)
df2 = pd.read_excel("Continual_Learning/G2.xlsx", index_col=0)
df3 = pd.read_excel("Continual_Learning/G3.xlsx", index_col=0)

In [3]:
# Dropping any rows with NaN values
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()

In [4]:
df1[["tags", "text"]].head()

Unnamed: 0,tags,text
0,"8:16:chronic_disease,20:32:treatment",portal fibrosis by liver biopsy
1,22:34:treatment,Contra-indication to liver biopsy
2,",32:44:treatment,,",Have a stable weight since the liver biopsy was performed defined by no more than a 5 % loss of initial body weight
3,"26:38:treatment,",Subject agrees to have a liver biopsy performed after 24 weeks of treatment
4,",43:55:treatment,",Liver steatosis (on visual estimate or on liver biopsy) > 30%


# Preprocessing

- Creating the following tagging scheme for the NER task:


| Entity_name | Token |
| --- | --- |
| Other | 0 |
| treatment | 1 |
| chronic_disease | 2 |
| cancer | 3 |
| allergy_name | 4 |

In [5]:
entity_ids = {
    "treatment": 1,
    "chronic_disease": 2,
    "cancer": 3,
    "allergy_name": 4,
    }
    

In [6]:
def find_word_index(txt, word):
    pattern = re.compile(r'\b{}\b'.format(re.escape(word)))

    # Find the index of the element containing the pattern
    word_index = next((index for index, element in enumerate(txt) if pattern.search(element)), None)

    return word_index

def get_ner_tokens(row):

    # Few tags have leading and trailing commas, removing them
    tag = row.tags.strip(",").strip()    # start:end:name, start:end:name, ... (start and end are in character level)

    # removing leading and trailing whitespace
    txt = row.text


    try:
        # txt = txt.split()
        original_txt = txt
        txt = word_tokenize(row["text"])
    except:
        # print(tag, txt)
        return None, None

    

    # labeled every word as other
    labels = np.zeros(len(txt))

    # iterate over all tages and mark them with their token
    for t in tag.split(","):
        if t == "":
            continue
        start, end, name = t.split(":")

        # as first character is considered as 1 in the dataset, but in python it is 0
        start, end = int(start), int(end)
        start -= 1
        end -= 1

        exact_word = original_txt[start:end]

        n_exact_words = len(exact_word.split())

        # check if word is more than one word, if yes then get the index of the first word and save total number of words
        if n_exact_words > 1:

            exact_word = exact_word.split()[0]

            word_index = find_word_index(txt, exact_word)
            # word_index = txt.index(exact_word)
            try:
                for i in range(word_index, word_index+n_exact_words):
                    labels[i] = entity_ids[name]
            except:
                # print(txt, exact_word, word_index, n_exact_words)
                return None, None

        else:
            

            word_index = find_word_index(txt, exact_word)

            labels[word_index] = entity_ids[name]

    return txt, labels



In [7]:
df1["tokens"], df1["ner_tags"] = zip(*df1.apply(get_ner_tokens, axis=1))
df2["tokens"], df2["ner_tags"] = zip(*df2.apply(get_ner_tokens, axis=1))
df3["tokens"], df3["ner_tags"] = zip(*df3.apply(get_ner_tokens, axis=1))

In [8]:
df1.sample(5)

Unnamed: 0,ID,tags,text,tokens,ner_tags
3146,NCT02770547,"1:22:chronic_disease,,,",Uncontrolled Diabetes (HbA1C > 8.5 measured within 3 months prior to date of consent),"[Uncontrolled, Diabetes, (, HbA1C, >, 8.5, measured, within, 3, months, prior, to, date, of, consent, )]","[2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4235,NCT02532621,53:88:chronic_disease,Subject has a documented and unsuccessfully treated ipsilateral central venous stenosis as determined by imaging,"[Subject, has, a, documented, and, unsuccessfully, treated, ipsilateral, central, venous, stenosis, as, determined, by, imaging]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0]"
3368,NCT02272998,1:30:treatment,anti-hypertensive medications are permitted,"[anti-hypertensive, medications, are, permitted]","[1.0, 1.0, 0.0, 0.0]"
7311,NCT00840047,"101:104:treatment,105:108:treatment,109:111:treatment,123:126:treatment,","Participants will have had, or are scheduled to have clinical imaging evaluations which may include FDG PET CT, or CT, or MRI within 4 weeks of entry","[Participants, will, have, had, ,, or, are, scheduled, to, have, clinical, imaging, evaluations, which, may, include, FDG, PET, CT, ,, or, CT, ,, or, MRI, within, 4, weeks, of, entry]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
5998,NCT02553941,"1:11:cancer,25:40:treatment,,122:132:treatment",Malignancy treated with curative intent and with no known active disease present for >= 1 years before the first dose of study drug and felt to be at low risk for recurrence by treating physician,"[Malignancy, treated, with, curative, intent, and, with, no, known, active, disease, present, for, >, =, 1, years, before, the, first, dose, of, study, drug, and, felt, to, be, at, low, risk, for, recurrence, by, treating, physician]","[3.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [9]:
df1.iloc[1177]

ID                                                                                                         NCT01989546
tags                                                                                                   81:90:treatment
text                         for the duration of study participation, and for 30 days after completing study treatment
tokens      [for, the, duration, of, study, participation, ,, and, for, 30, days, after, completing, study, treatment]
ner_tags                                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
Name: 1177, dtype: object

In [10]:
tokens, tags  =zip(*pd.DataFrame(df1.iloc[1177]).T.apply(get_ner_tokens, axis=1))
tokens, tags 

((['for',
   'the',
   'duration',
   'of',
   'study',
   'participation',
   ',',
   'and',
   'for',
   '30',
   'days',
   'after',
   'completing',
   'study',
   'treatment'],),
 (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),))

In [11]:
# Drop rows with None values
df1.dropna(inplace=True)
df2.dropna(inplace=True)
df3.dropna(inplace=True)

In [12]:
df1["tokens"].iloc[0]

['portal', 'fibrosis', 'by', 'liver', 'biopsy']

In [14]:
df1["ner_tags"].iloc[0]

array([0., 2., 0., 1., 1.])

In [15]:
df1.sample(5)

Unnamed: 0,ID,tags,text,tokens,ner_tags
1269,NCT02682147,1:9:chronic_disease,diabetes,[diabetes],[2.0]
2394,NCT02931110,65:89:cancer,"Rapidly progressive, clinically unstable central nervous system hematological malignancy","[Rapidly, progressive, ,, clinically, unstable, central, nervous, system, hematological, malignancy]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 3.0]"
1690,NCT02422641,",,,67:83:cancer",Alkaline phosphatase <2.5x the ULN or <5x the ULN if secondary to liver metastasis,"[Alkaline, phosphatase, <, 2.5x, the, ULN, or, <, 5x, the, ULN, if, secondary, to, liver, metastasis]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 3.0]"
1296,NCT01803451,25:33:chronic_disease,healthy control without diabetes,"[healthy, control, without, diabetes]","[0.0, 0.0, 0.0, 2.0]"
2655,NCT02577406,39:76:cancer,Subject has or is suspected of having central nervous system (CNS) leukemia,"[Subject, has, or, is, suspected, of, having, central, nervous, system, (, CNS, ), leukemia]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 3.0, 3.0, 0.0, 0.0]"


In [16]:
# convert list to array for labels and tokens
df1["tokens"] = df1["tokens"].apply(np.array)
df2["tokens"] = df2["tokens"].apply(np.array)
df3["tokens"] = df3["tokens"].apply(np.array)

df1["ner_tags"] = df1["ner_tags"].apply(np.array)
df2["ner_tags"] = df2["ner_tags"].apply(np.array)
df3["ner_tags"] = df3["ner_tags"].apply(np.array)

In [17]:
df1.to_csv("processed_data/G1.csv", index=False)
df2.to_csv("processed_data/G2.csv", index=False)
df3.to_csv("processed_data/G3.csv", index=False)

In [18]:
# df1 = pd.read_csv("processed_data/G1.csv")
# df2 = pd.read_csv("processed_data/G2.csv")
# df3 = pd.read_csv("processed_data/G3.csv")

# add new feature dataset_id
df1["dataset_num"] = 1
df2["dataset_num"] = 2
df3["dataset_num"] = 3

In [19]:
df1["tokens"].iloc[5]

array(['For', 'subjects', 'with', 'elevated', 'liver', 'tests', 'as',
       'defined', 'above', ',', 'local', 'pathology', 'reading', 'of',
       'liver', 'biopsy', '6-10', 'days', 'after', 'darTregs', 'infusion',
       'is', 'without', 'AR', 'according', 'to', 'Banff', 'criteria'],
      dtype='<U9')

In [20]:
df1["ner_tags"].iloc[5]

array([0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
# def rename_cols(df):
#     df.rename(columns={"ID": "id", "labels": "ner_tags"}, inplace=True)
#     return df

# df1 = rename_cols(df1)
# df2 = rename_cols(df2)
# df3 = rename_cols(df3)

In [22]:
custom_dataset = pd.concat([df1, df2, df3], ignore_index=True)

In [23]:
custom_dataset.dataset_num.value_counts()

dataset_num
1    7223
2    6348
3    6138
Name: count, dtype: int64

# Saving data to dataset format

In [26]:
huggingface_dataset = {
    "ID": custom_dataset["ID"],
    "tags": custom_dataset["tags"],
    "text": custom_dataset["text"],
    "dataset_num" : custom_dataset["dataset_num"],
    "tokens": custom_dataset["tokens"],
    "ner_tags": custom_dataset["ner_tags"],
}

# Create a Hugging Face Dataset object
dataset = Dataset.from_dict(huggingface_dataset)


In [27]:
dataset.save_to_disk("custom_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/19709 [00:00<?, ? examples/s]

# Pushing the dataset to huggingface

In [28]:
import sys
import os

if 'kaggle_web_client' in sys.modules:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("HUGGINGFACE_API_KEY")
elif 'google.colab' in sys.modules:
    !pip -q install python-dotenv
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

else:
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [29]:
from huggingface_hub import notebook_login
from huggingface_hub import login
login(token=HUGGINGFACE_API_KEY, write_permission=True)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/shailja/.cache/huggingface/token
Login successful


In [30]:
dataset

Dataset({
    features: ['ID', 'tags', 'text', 'dataset_num', 'tokens', 'ner_tags'],
    num_rows: 19709
})

In [31]:
dataset.push_to_hub("SKT27182/NER")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/391 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
