# Installing Library

In [1]:
!pip install transformers datasets evaluate torch torchtext sentencepiece pandas tqdm

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m22.1 MB/s[0m e

# Loading Data & Library

In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [3]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

device

device(type='cuda')

In [4]:
ds = load_dataset("QuyenAnhDE/Diseases_Symptoms")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/381 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Diseases_Symptoms.csv:   0%|          | 0.00/107k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

# Prepare Dataseet

In [5]:
updated_data = [{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in ds['train']]
data = pd.DataFrame(updated_data)
data.head()

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [6]:
data['Symptoms'][0]

'Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness'

In [7]:
data['Symptoms'] = data['Symptoms'].apply(lambda x: ', '.join(x.split(', ')))
data['Symptoms'][0]

'Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness'

# Loading Tokenizer & model

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Prepare Dataset For Model

In [10]:
batch_size = 8
max_length = 128

In [11]:
data.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [12]:
# DataSet Preperation

class LanguageDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data.to_dict(orient='records')
        self.labels = data.columns
        self.tokenizer = tokenizer
        x = self.fittest_max_length(data)
        self.max_length = x


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]

        text = f'{x} | {y}'
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, data):
        max_length = max(len(max(data[self.labels[0]], key=len)), len(max(data[self.labels[1]], key=len)))
        x = 2
        while x < max_length: x = x * 2
        return x

In [13]:
datasample = LanguageDataset(data, tokenizer)
datasample

<__main__.LanguageDataset at 0x7930f07b9870>

In [14]:
train_size = int(0.8 * len(datasample))
test_size = len(datasample) - train_size


train_data, val_data = random_split(datasample, [train_size, test_size])

In [15]:
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

# Build Arguments

In [16]:
num_epochs = 8
batch_size = 8
model_name = 'distilgpt2'
gpu = 0

In [17]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

tokenizer.pad_token = tokenizer.eos_token

In [18]:
result = pd.DataFrame(columns=['epochs', 'transformer', 'batch_size', 'gpu', 'training_loss', 'validation_loss', 'epoch_duration_second'])

# Training

In [19]:
import time

In [20]:
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f'Training epoch {epoch+1}/{num_epochs} Batch size = {batch_size} Transformer = {model_name}')

    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        train_iterator.set_postfix({'Training loss': loss.item()})
        epoch_training_loss += loss.item()

    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    model.eval()
    model_validation_loss = 0.0
    total_loss = 0
    validation_iterators = tqdm(val_loader, desc=f'Validation epoch {epoch+1}/{num_epochs} Batch size = {batch_size} Transformer = {model_name}')
    with torch.no_grad():
        for batch in validation_iterators:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss
            validation_iterators.set_postfix({'Validation loss': loss.item()})
            model_validation_loss += loss.item()

        avg_epoch_validation_loss = model_validation_loss / len(validation_iterators)

    end_time = time.time()
    epoch_duration_sec = end_time - start_time

    new_row = {
      'transformer': model_name,
      'batch_size': batch_size,
      'gpu': gpu,
      'epoch': epoch + 1,
      'training_loss': avg_epoch_training_loss,
      'validation_loss': avg_epoch_validation_loss,
      'epoch_duration_second': epoch_duration_sec
    }

    result.loc[len(result)] = new_row
    print(f'Epochs: {epoch + 1}, Validation Loss: {total_loss / len(val_loader)}')

Training epoch 1/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:09<00:00,  4.39it/s, Training loss=0.615]
Validation epoch 1/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 17.83it/s, Validation loss=0.763]


Epochs: 1, Validation Loss: 0.6688078045845032


Training epoch 2/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.31it/s, Training loss=0.509]
Validation epoch 2/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 18.05it/s, Validation loss=0.722]


Epochs: 2, Validation Loss: 0.6443201899528503


Training epoch 3/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:07<00:00,  5.08it/s, Training loss=0.391]
Validation epoch 3/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 16.60it/s, Validation loss=0.735]


Epochs: 3, Validation Loss: 0.6547378897666931


Training epoch 4/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s, Training loss=0.393]
Validation epoch 4/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 15.20it/s, Validation loss=0.778]


Epochs: 4, Validation Loss: 0.6939088702201843


Training epoch 5/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.93it/s, Training loss=0.381]
Validation epoch 5/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 16.74it/s, Validation loss=0.855]


Epochs: 5, Validation Loss: 0.7425814867019653


Training epoch 6/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.91it/s, Training loss=0.22]
Validation epoch 6/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 16.55it/s, Validation loss=0.908]


Epochs: 6, Validation Loss: 0.7908487319946289


Training epoch 7/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.84it/s, Training loss=0.195]
Validation epoch 7/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 16.86it/s, Validation loss=0.935]


Epochs: 7, Validation Loss: 0.8489842414855957


Training epoch 8/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.94it/s, Training loss=0.121]
Validation epoch 8/8 Batch size = 8 Transformer = distilgpt2: 100%|██████████| 10/10 [00:00<00:00, 17.10it/s, Validation loss=1]

Epochs: 8, Validation Loss: 0.8783463835716248





# Validation And Predicted Data

In [21]:
input_str = "Kidney Failure"
input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)

output = model.generate(
    input_ids,
    max_length=20,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Kidney Failure | Decreased urine output, fluid retention, fatigue


In [22]:
torch.save(model, 'SmallMedLM.pt')