# Imports

In [8]:
import pandas as pd 
import torch
from torch.utils.data import Dataset, random_split
from typing import List, Dict, Union
from typing import Any, TypeVar
import pandas as pd
import os
import opendatasets as od

from transformers import AutoTokenizer, TrainingArguments 
from transformers import Trainer, AutoModelForCausalLM, IntervalStrategy

Set a random seed for reproducibility.

In [9]:
# Fixed seed for PyTorch's global random number generator
SEED: int = 2137
torch.manual_seed(SEED)

<torch._C.Generator at 0x1ed3b8b1e10>

# Dataset Download

NOTE: You will need a Kaggle API key (a `kaggle.json` file in the working directory) for the following cells to work.

In [4]:
import json

# Path to the Kaggle API credentials file (JSON with "username" and "key")
json_file_path = "kaggle.json"

# Read and parse the credentials file.
# Failures are raised as exceptions instead of calling exit(): exit() is an
# interactive helper, and inside a notebook kernel it hides the real cause of
# the failure instead of showing a traceback in the failing cell.
try:
    with open(json_file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: JSON file not found at {json_file_path}"
    ) from err

# Extract the username and API key used by the download cell
try:
    username = json_data["username"]
    key = json_data["key"]
except KeyError as err:
    raise KeyError("Error: 'username' or 'key' key not found in JSON data") from err

In [6]:
# Publish the Kaggle credentials as environment variables
# (presumably read by opendatasets / the Kaggle client -- confirm against its docs)
os.environ.update({'KAGGLE_USERNAME': username, 'KAGGLE_KEY': key})

# URL of the Kaggle dataset to fetch
dataset = 'https://www.kaggle.com/datasets/dsxavier/diagnoise-me'
# Download it with opendatasets into the local "dataset" directory
od.download(dataset, "dataset")

Downloading diagnoise-me.zip to dataset\diagnoise-me


100%|██████████| 191M/191M [00:41<00:00, 4.82MB/s] 





# Load Dataset

In [22]:
# Build the path with os.path.join so it resolves on every OS, not only on
# Windows (the download step stores the files under dataset/diagnoise-me).
DATA_PATH = os.path.join("dataset", "diagnoise-me", "diagnose_en_dataset.feather")
data_frame = pd.read_feather(DATA_PATH)
print(data_frame.keys())

# Keep only the patient messages, as an array
data = data_frame['Patient'].values

SEQ_LEN: int = 1024
SAMPLE_SIZE: int = int(data.shape[0] * 0.01)  # use 1% of the data
# First SAMPLE_SIZE texts, each cut to at most SEQ_LEN characters
# (this is a character-level slice, not a token-level one)
_data = [el[:SEQ_LEN] for el in data[:SAMPLE_SIZE]]

Index(['id', 'Description', 'Doctor', 'Patient'], dtype='object')


In [16]:
# Show only a few samples: printing the whole list floods the notebook output
# and stores every patient message in the saved file.
_data[:3]

['Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?', 'Hi doctor, I am a 26 year old male. I am 5 feet and 9 inches tall and weigh 255 pounds. When I eat spicy\xa0food, I poop blood. Sometimes when I have constipation as well, I poop a little bit of blood. I am really scared that I have colon cancer. I do have diarrhea often. I\xa0do not have a\xa0family history of colon\xa0cancer. I got blood tests done last night. Please find my reports attached.', 'Hello doctor, I am 48 years old. I am experiencing weak erection and difficulty in sustaining the same. This condition was observed 10 years back. Also, there is premature ejaculation. Other physical ailments that I have are, I am suffering from hypertension and taking Amlopres-L (Amlodipine and Lisinopril) for the last 10 years, high cholesterol and triglycerides. My cholesterol level is 225 and triglyceride is 2

In [23]:
class PatientDiagnozeDataset(Dataset):
    """Tokenised texts for causal language-model fine-tuning.

    Every text is wrapped in begin/end markers, tokenised, truncated and padded
    to ``max_length``; the resulting token ids and attention masks are stored
    as tensors.

    Args:
        txt_list: Iterable of raw text strings.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length handed to the tokenizer for truncation and padding.
        bos_token: Marker prepended to each text. Defaults to
            ``tokenizer.bos_token`` (empty string if the tokenizer has none).
        eos_token: Marker appended to each text. Defaults to
            ``tokenizer.eos_token`` (empty string if the tokenizer has none).
    """

    def __init__(self, txt_list, tokenizer, max_length,
                 bos_token: Union[str, None] = None,
                 eos_token: Union[str, None] = None):
        # Take the markers from the tokenizer instead of notebook-level globals,
        # so the class does not depend on which cell was executed first.
        if bos_token is None:
            bos_token = getattr(tokenizer, "bos_token", None) or ""
        if eos_token is None:
            eos_token = getattr(tokenizer, "eos_token", None) or ""

        self.input_ids: List[torch.Tensor] = []
        self.attn_masks: List[torch.Tensor] = []
        # Never populated; kept so code that reads ``.labels`` keeps working.
        self.labels: List = []
        for txt in txt_list:
            encodings_dict = tokenizer(bos_token + txt + eos_token, truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encodings_dict["attention_mask"]))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

In [27]:
# Tokenise the sampled texts, then hold out the last 20% share for validation
dataset = PatientDiagnozeDataset(_data, tokenizer, max_length=1024)
TRAIN_SIZE: int = int(len(dataset) * 0.8)
split_lengths = [TRAIN_SIZE, len(dataset) - TRAIN_SIZE]
train_dataset, val_dataset = random_split(dataset, split_lengths)

In [28]:
# Directory that receives the training outputs; created if it does not exist
OUTPUT_DIR: str = './results'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Model

In [25]:
# Pretrained checkpoint to fine-tune
MODEL_NAME: str = "EleutherAI/gpt-neo-125M"

# Special tokens for the dataset
BOS_TOKEN: str = "<|startoftext|>"  # marks the beginning of a text
EOS_TOKEN: str = "<|endoftext|>"    # marks the end of a text
PAD_TOKEN: str = "<|pad|>"          # padding token

## Tokenizer

In [30]:
# Load the tokenizer of the chosen checkpoint and register the special tokens
special_tokens = {
    "bos_token": BOS_TOKEN,
    "eos_token": EOS_TOKEN,
    "pad_token": PAD_TOKEN,
}
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Use the GPU when one is available; fall back to the CPU instead of crashing
# on machines without CUDA.
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
# Resize the embedding matrix to the tokenizer's vocabulary size, which now
# includes the special tokens added when the tokenizer was loaded.
model.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Embedding(50259, 768)

In [29]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir=OUTPUT_DIR,
    logging_dir='./logs',
    report_to=['tensorboard'],
    logging_steps=5000,
    # optimisation schedule
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch, keep one checkpoint,
    # and reload the best one when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Training

In [None]:
def collate_batch(batch):
    """Stack dataset samples into a batch for causal-LM training.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs, as
            returned by the dataset's ``__getitem__``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        ``labels`` is a copy of ``input_ids`` in which every padded position
        (attention mask 0) is set to -100, so those positions are ignored by
        the loss instead of training the model to predict padding.
    """
    input_ids = torch.stack([sample[0] for sample in batch])
    attention_mask = torch.stack([sample[1] for sample in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # -100 is the ignore index of the LM loss
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_batch,
)

In [None]:
# Start training with the `trainer` object built in the previous cell
trainer.train()