In [1]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)


In [2]:
from datasets import load_dataset
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_scheduler
from tqdm import tqdm
import time

In [3]:
data = load_dataset("QuyenAnhDE/Diseases_Symptoms")
df = pd.DataFrame([{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in data['train']])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/381 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Diseases_Symptoms.csv:   0%|          | 0.00/107k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

In [4]:
df['Symptoms'] = df['Symptoms'].apply(
    lambda x: ', '.join([s.strip().capitalize() for s in x.split(',')])
)

In [5]:
df['prompt'] = df.apply(lambda row: f"Symptoms of {row['Name']} are: {row['Symptoms']}.", axis=1)

In [6]:
#Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [7]:
# Model
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# Dataset Preparation
class MedicalDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.samples = df['prompt'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.samples[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids.clone()
        }

In [9]:
def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [10]:
# Train-Validation Split

dataset = MedicalDataset(df, tokenizer)
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_data, valid_data = random_split(dataset, [train_size, valid_size])

train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=8, collate_fn=collate_fn)

In [11]:
# Train Setup

optimizer = optim.AdamW(model.parameters(), lr=5e-4)
scheduler = get_scheduler("linear", optimizer=optimizer,
                          num_warmup_steps=50,
                          num_training_steps=len(train_loader) * 10)

In [12]:
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, label_smoothing=0.1)

In [13]:
# Training loop
results = []

for epoch in range(10):
    start_time = time.time()
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    #Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f"Epoch {epoch+1} Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(valid_loader)
    duration = time.time() - start_time

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Time: {duration:.2f}s")
    results.append({
        'epoch': epoch+1,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'duration_sec': duration
    })

Epoch 1 Training:   0%|          | 0/40 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1 Training: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s]
Epoch 1 Validation: 100%|██████████| 10/10 [00:00<00:00, 19.59it/s]


Epoch 1 | Train Loss: 1.7302 | Val Loss: 0.7130 | Time: 9.13s


Epoch 2 Training: 100%|██████████| 40/40 [00:07<00:00,  5.58it/s]
Epoch 2 Validation: 100%|██████████| 10/10 [00:00<00:00, 19.38it/s]


Epoch 2 | Train Loss: 0.6054 | Val Loss: 0.6649 | Time: 7.69s


Epoch 3 Training: 100%|██████████| 40/40 [00:07<00:00,  5.42it/s]
Epoch 3 Validation: 100%|██████████| 10/10 [00:00<00:00, 18.61it/s]


Epoch 3 | Train Loss: 0.4798 | Val Loss: 0.6630 | Time: 7.93s


Epoch 4 Training: 100%|██████████| 40/40 [00:07<00:00,  5.45it/s]
Epoch 4 Validation: 100%|██████████| 10/10 [00:00<00:00, 19.14it/s]


Epoch 4 | Train Loss: 0.3918 | Val Loss: 0.6810 | Time: 7.87s


Epoch 5 Training: 100%|██████████| 40/40 [00:07<00:00,  5.41it/s]
Epoch 5 Validation: 100%|██████████| 10/10 [00:00<00:00, 18.87it/s]


Epoch 5 | Train Loss: 0.3310 | Val Loss: 0.7083 | Time: 7.93s


Epoch 6 Training: 100%|██████████| 40/40 [00:07<00:00,  5.37it/s]
Epoch 6 Validation: 100%|██████████| 10/10 [00:00<00:00, 18.22it/s]


Epoch 6 | Train Loss: 0.2853 | Val Loss: 0.7365 | Time: 8.00s


Epoch 7 Training: 100%|██████████| 40/40 [00:07<00:00,  5.23it/s]
Epoch 7 Validation: 100%|██████████| 10/10 [00:00<00:00, 17.86it/s]


Epoch 7 | Train Loss: 0.2446 | Val Loss: 0.7513 | Time: 8.23s


Epoch 8 Training: 100%|██████████| 40/40 [00:08<00:00,  4.95it/s]
Epoch 8 Validation: 100%|██████████| 10/10 [00:00<00:00, 17.55it/s]


Epoch 8 | Train Loss: 0.2112 | Val Loss: 0.7901 | Time: 8.67s


Epoch 9 Training: 100%|██████████| 40/40 [00:07<00:00,  5.06it/s]
Epoch 9 Validation: 100%|██████████| 10/10 [00:00<00:00, 16.71it/s]


Epoch 9 | Train Loss: 0.1860 | Val Loss: 0.8098 | Time: 8.51s


Epoch 10 Training: 100%|██████████| 40/40 [00:08<00:00,  4.93it/s]
Epoch 10 Validation: 100%|██████████| 10/10 [00:00<00:00, 17.10it/s]

Epoch 10 | Train Loss: 0.1706 | Val Loss: 0.8277 | Time: 8.72s





In [14]:
# Inference

def generate_symptoms(disease_name):
    prompt = f"Symptoms of {disease_name} are:"
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=5,
        early_stopping=True,
        repetition_penalty=1.1
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [19]:
print(generate_symptoms("Ethylene glycol poisoning-1"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Symptoms of Ethylene glycol poisoning-1 are: Nausea, Vomiting, Abdominal pain, Liver or kidney damage, Organ dysfunction.


In [20]:
model.save_pretrained('./Gpt2Medi')
tokenizer.save_pretrained('./Gpt2Medi')

('./Gpt2Medi/tokenizer_config.json',
 './Gpt2Medi/special_tokens_map.json',
 './Gpt2Medi/vocab.json',
 './Gpt2Medi/merges.txt',
 './Gpt2Medi/added_tokens.json')

In [22]:
!zip -r Gpt2Medi.zip /content/Gpt2Medi

  adding: content/Gpt2Medi/ (stored 0%)
  adding: content/Gpt2Medi/generation_config.json (deflated 24%)
  adding: content/Gpt2Medi/vocab.json (deflated 68%)
  adding: content/Gpt2Medi/special_tokens_map.json (deflated 74%)
  adding: content/Gpt2Medi/model.safetensors (deflated 7%)
  adding: content/Gpt2Medi/merges.txt (deflated 53%)
  adding: content/Gpt2Medi/tokenizer_config.json (deflated 56%)
  adding: content/Gpt2Medi/config.json (deflated 52%)
