# imports

In [None]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.tokenize import word_tokenize

from datasets import Dataset

pd.set_option('display.max_colwidth', None)

# data_load

In [None]:
# Read the three annotated workbooks in order; the first spreadsheet column
# is used as the row index of each frame.
df1, df2, df3 = (
    pd.read_excel(workbook, index_col=0)
    for workbook in (
        "Continual_Learning/G1.xlsx",
        "Continual_Learning/G2.xlsx",
        "Continual_Learning/G3.xlsx",
    )
)

In [None]:
# Discard every row that has a missing value in any column.
df1, df2, df3 = (frame.dropna() for frame in (df1, df2, df3))

In [None]:
# Preview the annotation strings next to the text they belong to.
df1[["tags", "text"]].head()

# Preprocessing

- The NER task uses the following tagging scheme, mapping each entity type to the integer label assigned to its tokens:


| Entity_name | Label ID |
| --- | --- |
| Other | 0 |
| treatment | 1 |
| chronic_disease | 2 |
| cancer | 3 |
| allergy_name | 4 |

In [None]:
# Integer label for each annotated entity type.
entity_ids = dict(
    treatment=1,
    chronic_disease=2,
    cancer=3,
    allergy_name=4,
)
    

In [None]:
def find_word_index(txt, word):
    """Return the index of the first token in ``txt`` that contains ``word``.

    ``word`` is matched literally and must not be glued to a letter, digit or
    underscore on either side, so "ca" does not match the token "cat".

    Args:
        txt: Sequence of token strings to search.
        word: Text to look for; regex metacharacters are escaped.

    Returns:
        The index of the first matching token, or ``None`` when ``word`` is
        empty or no token matches.
    """
    # An empty pattern would match at an arbitrary position; there is nothing
    # meaningful to locate.
    if not word:
        return None

    # Lookarounds are used instead of \b: \b needs a word character on one
    # side of the boundary, so it can never match around words that start or
    # end with punctuation (for example "%" or "(").
    pattern = re.compile(r'(?<!\w){}(?!\w)'.format(re.escape(word)))

    return next(
        (index for index, element in enumerate(txt) if pattern.search(element)),
        None,
    )


def get_ner_tokens(row):
    """Tokenize a row's text and build one entity label per token.

    ``row.tags`` is a comma-separated list of ``start:end:name`` annotations
    whose ``start``/``end`` are character offsets into ``row.text``. Each
    annotated span is located in the token list by searching for its first
    word, and as many tokens as the span has whitespace-separated words are
    labeled with ``entity_ids[name]``. All other tokens keep label 0.

    Args:
        row: Object exposing ``tags`` and ``text`` attributes (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``(tokens, labels)`` where ``tokens`` is the ``word_tokenize`` output
        and ``labels`` is a float NumPy array of the same length, or
        ``(None, None)`` when the row cannot be labeled: the text is not a
        string, an entity name is unknown, or a span cannot be located in the
        tokens.

    Raises:
        ValueError: If an annotation is not of the form ``start:end:name``
            with integer offsets.
    """
    # A few tag strings carry stray leading/trailing commas or whitespace.
    tag = row.tags.strip(",").strip()

    original_txt = row.text
    # Non-string text cannot be tokenized; skip the row explicitly instead of
    # hiding every possible tokenizer error behind a bare except.
    if not isinstance(original_txt, str):
        return None, None

    txt = word_tokenize(original_txt)

    # Every token starts out labeled as "other" (0).
    labels = np.zeros(len(txt))

    for t in tag.split(","):
        t = t.strip()
        if not t:
            continue
        start, end, name = t.split(":")

        entity_id = entity_ids.get(name.strip())
        if entity_id is None:
            return None, None

        # The dataset counts characters from 1, Python from 0.
        start, end = int(start) - 1, int(end) - 1

        span_words = original_txt[start:end].split()
        if not span_words:
            # Offsets select no text, so there is nothing to align.
            return None, None

        # NOTE: this finds the first token containing the span's first word,
        # which is not necessarily the annotated occurrence.
        word_index = find_word_index(txt, span_words[0])
        if word_index is None:
            # Indexing with None would broadcast the label over the whole
            # array, so an unlocated span must drop the row instead.
            return None, None

        # NOTE: the span length is counted in whitespace-separated words and
        # may differ from the number of tokenizer tokens it covers.
        span_end = word_index + len(span_words)
        if span_end > len(txt):
            return None, None

        labels[word_index:span_end] = entity_id

    return txt, labels



In [None]:
# Tokenize every row and attach the per-token labels as two new columns.
for frame in (df1, df2, df3):
    frame["tokens"], frame["ner_tags"] = zip(*frame.apply(get_ner_tokens, axis=1))

In [None]:
# Spot-check a random handful of rows.
df1.sample(5)

In [None]:
# Look at one specific row, selected by position.
df1.iloc[1177]

In [None]:
# Re-run the labeling on that single row: the row is turned back into a
# one-row DataFrame so that get_ner_tokens can be applied row-wise.
tokens, tags  =zip(*pd.DataFrame(df1.iloc[1177]).T.apply(get_ner_tokens, axis=1))
tokens, tags 

In [None]:
# Remove, in place, every row that contains a missing value (None/NaN).
for frame in (df1, df2, df3):
    frame.dropna(inplace=True)

In [None]:
# Token sequence of the first row.
df1["tokens"].iloc[0]

In [None]:
# Label sequence of the first row.
df1["ner_tags"].iloc[0]

In [None]:
# Spot-check a random handful of rows.
df1.sample(5)

In [None]:
# Store the token and label sequences of every frame as NumPy arrays.
for column in ("tokens", "ner_tags"):
    for frame in (df1, df2, df3):
        frame[column] = frame[column].apply(np.array)

In [None]:
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
Path("processed_data").mkdir(parents=True, exist_ok=True)

# Write each processed frame without its index column.
# NOTE(review): the array-valued columns are presumably written in their
# string form; confirm they can be parsed back before relying on these files.
df1.to_csv("processed_data/G1.csv", index=False)
df2.to_csv("processed_data/G2.csv", index=False)
df3.to_csv("processed_data/G3.csv", index=False)

In [None]:
# df1 = pd.read_csv("processed_data/G1.csv")
# df2 = pd.read_csv("processed_data/G2.csv")
# df3 = pd.read_csv("processed_data/G3.csv")

# Tag every row with the number (1, 2 or 3) of the frame it belongs to.
for source_num, frame in enumerate((df1, df2, df3), start=1):
    frame["dataset_num"] = source_num

In [None]:
# Token sequence of the row at position 5.
df1["tokens"].iloc[5]

In [None]:
# Label sequence of the row at position 5.
df1["ner_tags"].iloc[5]

In [None]:
def rename_cols(df):
    """Rename the ``ID`` column of ``df`` to ``id`` in place and return ``df``."""
    column_map = {"ID": "id"}
    df.rename(columns=column_map, inplace=True)
    return df

# Normalize the identifier column name in all three frames.
df1, df2, df3 = (rename_cols(frame) for frame in (df1, df2, df3))

In [None]:
# Stack the three frames into one and renumber the rows from 0.
custom_dataset = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Number of rows contributed by each source frame.
custom_dataset.dataset_num.value_counts()

# Converting the data to a Hugging Face Dataset

In [None]:
# Columns exported to the Hugging Face dataset, in output order.
export_columns = ("id", "tags", "text", "dataset_num", "tokens", "ner_tags")
huggingface_dataset = {column: custom_dataset[column] for column in export_columns}

# Build the Hugging Face Dataset object from the column mapping.
dataset = Dataset.from_dict(huggingface_dataset)


In [None]:
# Display the dataset summary.
dataset

## train-test split

In [None]:
from datasets import DatasetDict#, train_test_split

# The dataset to split; it holds the rows of all three source files.
original_train_data = dataset

# Values of "dataset_num" that identify the source files.
unique_dataset_nums = [1,2,3]#original_train_data["dataset_num"].unique()

# One list per split, collecting a piece from every source file.
combined_data = {"train": [], "test": []}

# Split every source file separately so each one contributes to both splits.
for dataset_num in unique_dataset_nums:
    # Keep only the examples that belong to the current source file.
    subset_data = original_train_data.filter(lambda example: example["dataset_num"] == dataset_num)

    print(subset_data)
    # 80/20 train/test split with a fixed seed.
    splited_subset = subset_data.train_test_split( test_size=0.2, seed=42)

    # File the two halves under their respective split.
    for split_name in ("train", "test"):
        combined_data[split_name].append(splited_subset[split_name])
    


In [None]:
from datasets import load_dataset, concatenate_datasets

# Merge the per-file pieces of each split into a single dataset.
for split_name in ("train", "test"):
    combined_data[split_name] = concatenate_datasets(combined_data[split_name])

# combined_data = DatasetDict({"train": train_data, "test": test_data})

In [None]:
# Wrap the {"train", "test"} mapping in a DatasetDict.
combined_data = DatasetDict(combined_data)

In [None]:
# Persist both splits under the local "custom_dataset" directory.
combined_data.save_to_disk("custom_dataset")

# Pushing the dataset to huggingface

In [None]:
import sys
import os

if 'kaggle_web_client' in sys.modules:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HUGGINGFACE_API_KEY = user_secrets.get_secret("HUGGINGFACE_API_KEY")
elif 'google.colab' in sys.modules:
    !pip -q install python-dotenv
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

else:
    from dotenv import load_dotenv
    load_dotenv()
    HUGGINGFACE_API_KEY = os.getenv('HUGGINGFACE_API_KEY')

In [None]:
# NOTE(review): notebook_login is imported but not used in this cell.
from huggingface_hub import notebook_login
from huggingface_hub import login
# Authenticate with the key loaded in the previous cell.
# NOTE(review): HUGGINGFACE_API_KEY is None when the variable is not set, and
# `write_permission` may be deprecated in recent huggingface_hub releases -
# confirm both against the installed version.
login(token=HUGGINGFACE_API_KEY, write_permission=True)

In [None]:
# Display the train/test splits before uploading.
combined_data

In [None]:
# Upload both splits to the Hub repository "SKT27182/NER_processed_data".
combined_data.push_to_hub("SKT27182/NER_processed_data")

In [None]:
# NOTE(review): these look like scratch cells unrelated to the pipeline.
# The name `dict` shadows the builtin for the rest of the session, so any
# later dict(...) call would fail; consider renaming or deleting these cells.
dict = {"name":[]}

In [None]:
# Append two entries to the list stored under "name".
dict["name"].extend(["saurabh", "ramesh"])

In [None]:
# Display the resulting mapping.
dict