In [1]:

# !pip install -Uqqq pip --progress-bar off
# !pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.32.1 --progress-bar off
!pip install -qqq datasets==2.14.4 --progress-bar off
# !pip install -qqq peft==0.5.0 --progress-bar off
# !pip install -qqq bitsandbytes==0.41.1 --progress-bar off
# !pip install -qqq trl==0.7.1 --progress-bar off

In [5]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

In [6]:
# Load the tokenizer published with the EleutherAI/pythia-70m checkpoint;
# the tokenization cells below all go through this object.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

In [8]:
from datasets import load_dataset

# Fetch only the raw DR_PROFILE_3.json file from the SagarKeshave/dr_data repo
# on the Hub, then display the resulting object.
dataset = load_dataset(
    path="SagarKeshave/dr_data",
    data_files="DR_PROFILE_3.json",
)
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Profile', 'Name'],
        num_rows: 1601
    })
})

In [11]:
import pandas as pd

# Absolute (Colab-style) path to a local copy of the JSON file.
# NOTE(review): no visible cell writes this file — presumably it is uploaded or
# downloaded into /content by hand; confirm before relying on a clean re-run.
filename = "/content/DR_PROFILE_3.json"


# Read the records into a DataFrame, then convert with the default `to_dict()`
# orientation, i.e. {column: {row_index: value}}, for index-based access below.
dataset_df = pd.read_json(filename)
examples = dataset_df.to_dict()

In [12]:
# Peek at the "Name" value stored under row index 0.
examples["Name"][0]

'Rachel V Aaron, Ph.D., M.A.'

In [14]:
# Preview the first two rows to check the Name/Profile columns.
dataset_df.head(2)

Unnamed: 0,Name,Profile
0,"Rachel V Aaron, Ph.D., M.A.","Member, Social Media Committee, Society for Af..."
1,"Peter Magdy Abadir, M.D.","The Gerontological Society of America (GSA), 2..."


In [15]:
# Number of rows read from the JSON file.
len(dataset_df)

1601

In [20]:
# Build one sample string, "<name> <profile>", from row 0. `text` is only
# assigned when both columns are present.
if all(column in examples for column in ("Name", "Profile")):
    text = " ".join([examples["Name"][0], examples["Profile"][0]])

In [21]:
# Display the sample string assembled in the previous cell.
text

'Rachel V Aaron, Ph.D., M.A. Member, Social Media Committee, Society for Affective Science, 2018 Adjustment to Chronic Illness, Chronic Pain, Cognitive Behavior Therapy (CBT), Rehabilitation Psychology '

In [22]:


# Prompt scaffold: the doctor's name is substituted for {name}; the profile text
# is kept as a separate field rather than being appended here.
prompt_template = """### Doctor Name:
{name}

### Doctor Profile :"""

# One {"Name": <formatted prompt>, "Profile": <raw profile>} record per row,
# looked up by integer row index.
num_examples = len(examples["Name"])
finetuning_dataset = [
    {
        "Name": prompt_template.format(name=examples["Name"][row]),
        "Profile": examples["Profile"][row],
    }
    for row in range(num_examples)
]



One datapoint in the finetuning dataset:
{'Name': '### Doctor Name:\n'
         'Rachel V Aaron, Ph.D., M.A.\n'
         '\n'
         '### Doctor Profile :',
 'Profile': 'Member, Social Media Committee, Society for Affective Science, '
            '2018 Adjustment to Chronic Illness, Chronic Pain, Cognitive '
            'Behavior Therapy (CBT), Rehabilitation Psychology '}


In [24]:
from pprint import pprint
# Pretty-print the first record to check the prompt formatting.
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'Name': '### Doctor Name:\n'
         'Rachel V Aaron, Ph.D., M.A.\n'
         '\n'
         '### Doctor Profile :',
 'Profile': 'Member, Social Media Committee, Society for Affective Science, '
            '2018 Adjustment to Chronic Illness, Chronic Pain, Cognitive '
            'Behavior Therapy (CBT), Rehabilitation Psychology '}


In [27]:
# Reuse the end-of-sequence token as the padding token so that padding=True can be used.
tokenizer.pad_token = tokenizer.eos_token

In [28]:
# Join the formatted prompt and the profile of the first record (no separator is
# inserted between them) and tokenize the result into NumPy arrays.
text = finetuning_dataset[0]["Name"] + finetuning_dataset[0]["Profile"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
# input_ids is 2-D: one row for the single input text.
print(tokenized_inputs["input_ids"])

[[ 4118 15058  9424    27   187    51 17470   657 22234    13  1777    15
     37   904   353    15    34    15   187   187  4118 15058 27047  1163
  20226    13  8404 11263  7039    13  8273   323   329   887   422  6875
     13  4765 29702   420   281 28289  6192  1255    13 28289 21869    13
  43419 29475 34294   313    36 11584   582 49251 29624   209]]


In [29]:
# Cap the sequence length at 2048 tokens, but never request more tokens than
# the sample actually contains.
max_length = min(tokenized_inputs["input_ids"].shape[1], 2048)

In [30]:
# Tokenize the same text again, this time with truncation enabled and limited
# to the max_length computed in the previous cell.
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)

In [31]:
# Inspect the token ids produced by the truncating call.
tokenized_inputs["input_ids"]

array([[ 4118, 15058,  9424,    27,   187,    51, 17470,   657, 22234,
           13,  1777,    15,    37,   904,   353,    15,    34,    15,
          187,   187,  4118, 15058, 27047,  1163, 20226,    13,  8404,
        11263,  7039,    13,  8273,   323,   329,   887,   422,  6875,
           13,  4765, 29702,   420,   281, 28289,  6192,  1255,    13,
        28289, 21869,    13, 43419, 29475, 34294,   313,    36, 11584,
          582, 49251, 29624,   209]])

### Tokenize the instruction dataset

In [32]:
def tokenize_function(examples, max_length=2048):
    """Tokenize the first row of a batch as "<Name> <Profile>".

    Intended for `Dataset.map(..., batched=True, batch_size=1)`: only index 0 of
    each column is read, so any further rows in `examples` are ignored.

    Args:
        examples: Mapping of column name to values. Must contain "Name" and
            "Profile", each indexable with 0.
        max_length: Maximum number of tokens to keep (default 2048). Longer
            inputs are truncated from the left, so the end of the text survives.

    Returns:
        The object returned by the notebook-level `tokenizer` when called on
        the text with `return_tensors="np"`.

    Raises:
        ValueError: If "Name" or "Profile" is missing from `examples`.

    Side effects:
        Sets `pad_token` (to the EOS token) and `truncation_side` (to "left")
        on the shared `tokenizer`.
    """
    missing = [column for column in ("Name", "Profile") if column not in examples]
    if missing:
        # Fail early with a clear message instead of an UnboundLocalError on `text`.
        raise ValueError(f"examples is missing required column(s): {missing}")

    text = examples["Name"][0] + " " + examples["Profile"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"

    # A single pass suffices: truncating to `max_length` leaves shorter inputs
    # unchanged, so there is no need to tokenize once just to measure the length.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

In [37]:
# Load the local JSON file as a `datasets.Dataset` (train split) so it can be mapped.
# NOTE(review): these are the raw Name/Profile rows, not the prompt-formatted
# `finetuning_dataset` list built earlier — confirm that this is intended.
finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

# batch_size=1 is required: tokenize_function only reads row 0 of each batch.
# With single-row batches every batch is full, so drop_last_batch=True drops nothing.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Map:   0%|          | 0/1601 [00:00<?, ? examples/s]

Dataset({
    features: ['Profile', 'Name', 'input_ids', 'attention_mask'],
    num_rows: 1601
})


In [38]:
# Add a "labels" column that duplicates "input_ids"
# (presumably the targets for language-model fine-tuning — confirm with the training code).
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [39]:
# Confirm that the "labels" column is now part of the dataset.
tokenized_dataset

Dataset({
    features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1601
})

In [40]:
# Hold out 10% of the rows as a test split; the fixed seed keeps the shuffle reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1440
    })
    test: Dataset({
        features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 161
    })
})


In [None]:
# How to push your own dataset to the Hugging Face Hub: install the client
# library, then log in interactively from the command line.
!pip install huggingface_hub
!huggingface-cli login


In [44]:
import os
from getpass import getpass

# `token` is a Hugging Face access token (write permission is needed to push).
# Read it from the HF_TOKEN environment variable, falling back to a hidden
# prompt, so the secret is never stored in the notebook itself.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload every split of `split_dataset` (train and test) to the Hub repository.
split_dataset.push_to_hub("SagarKeshave/dr_data", token=token)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [55]:
# Round-trip check: load the repository back from the Hub.
# This rebinds `dataset`, which earlier held the raw JSON load.
dataset = load_dataset("SagarKeshave/dr_data")

In [53]:
# `dataset` is keyed by split name, so select the train split before the column.
dataset["train"]["Profile"][0]

'Lewis E. Braverman Lectureship Award, American Thyroid Association, 2012 Endocrinology, Metabolic Disorders, Thyroid Diseases '

In [56]:
# Inspect the train split of the reloaded dataset (features and row count).
dataset["train"]

Dataset({
    features: ['Profile', 'Name', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1440
})

In [60]:
def process_dataset(data):
    """Shuffle `data` with seed 42 and drop the tokenizer-derived columns.

    Args:
        data: Object exposing `shuffle(seed=...)` whose result exposes
            `remove_columns(list_of_names)` (e.g. a `datasets.Dataset`).

    Returns:
        The shuffled data without "input_ids", "attention_mask" and "labels".
    """
    tokenizer_columns = ["input_ids", "attention_mask", "labels"]
    shuffled = data.shuffle(seed=42)
    return shuffled.remove_columns(tokenizer_columns)

In [61]:
# Apply the clean-up to the train split; only the text columns should remain.
process_dataset(dataset["train"])

Dataset({
    features: ['Profile', 'Name'],
    num_rows: 1440
})