In [59]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import (
    Trainer,
    RobertaTokenizerFast,
    RobertaForQuestionAnswering, 
    TrainingArguments, 
)
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

In [60]:
# Authenticate against the Hugging Face Hub through the interactive widget.
# Never paste an access token into the notebook source: anything committed here
# is readable by everyone with access to the file. A token that was ever stored
# in this cell must be treated as compromised and revoked in the Hub settings.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load dataset


In [61]:
# Load the dataset from the Hugging Face Hub.
# `token=True` reuses the credential stored by `notebook_login()`; it replaces
# `use_auth_token`, which `datasets` deprecated in 2.14 and removed in 3.0.
dataset = load_dataset("ruslanmv/ai-medical-chatbot", token=True)



In [62]:
# Display the DatasetDict summary (available splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['Description', 'Patient', 'Doctor'],
        num_rows: 256916
    })
})

In [63]:
# Keep a pandas view of the full training split for ad-hoc inspection.
train_data = dataset["train"]
# `Dataset.to_pandas()` converts the underlying Arrow table in one step, whereas
# `pd.DataFrame(train_data)` has to walk the split row by row as Python dicts.
# The explicit column list pins the column order.
df = train_data.to_pandas()[['Description', 'Patient', 'Doctor']]

In [75]:
# Train on a fixed-size prefix of the corpus to keep the run time manageable.
subset_size = 100_000
# Clamp the prefix length so a corpus shorter than `subset_size` does not make
# `select` fail with an out-of-range index.
dataset_subset = dataset["train"].select(
    range(min(subset_size, dataset["train"].num_rows))
)
# 80/20 train/validation split; the fixed seed makes the shuffle (and therefore
# every downstream metric) reproducible across kernel restarts.
train_test_valid = dataset_subset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_valid["train"]
validation_dataset = train_test_valid["test"]


In [76]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
model_checkpoint = "roberta-base"

# Fast tokenizer matching that checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_checkpoint,
)

In [77]:
def tokenize_function(examples, max_length=32):
    """Tokenize a batch of ``Description`` texts and attach placeholder span labels.

    Args:
        examples: Batch mapping as handed over by ``Dataset.map(..., batched=True)``.
            Only the ``"Description"`` column is read.
        max_length: Length every sequence is padded / truncated to. Defaults to
            32, the value this notebook used before it became a parameter.

    Returns:
        The tokenizer's batch encoding, extended with ``start_positions`` and
        ``end_positions``, each a list holding one ``0`` per example.
    """
    descriptions = examples["Description"]

    encodings = tokenizer(
        descriptions,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Dummy answer positions: every example is labelled with the span (0, 0).
    # NOTE(review): the labels are constant and never derived from the "Doctor"
    # answer, so the QA head can only learn to point at token 0. The ~0.0
    # training/validation loss and the `<s>` prediction shown later in this
    # notebook are consistent with that. Real span labels (or a generative
    # model, since the doctor's reply is presumably not a substring of the
    # input - TODO confirm) are needed before the metrics mean anything.
    batch_size = len(descriptions)
    encodings.update({
        'start_positions': [0] * batch_size,
        'end_positions': [0] * batch_size,
    })

    return encodings

# Apply the batched tokenizer to both splits, training split first.
tokenized_train_dataset, tokenized_validation_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset)
)


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [78]:
# Start from the pretrained encoder. The span-prediction head (`qa_outputs`) is
# newly initialised, which is what the "weights ... newly initialized" warning
# printed below this cell refers to.
model = RobertaForQuestionAnswering.from_pretrained(model_checkpoint)

# Hyper-parameters for a single fine-tuning epoch, evaluated once at its end.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision needs an accelerator: requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only machine, so enable it only with CUDA.
    fp16=torch.cuda.is_available(),
)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
# Wire the model, the hyper-parameters and both tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_validation_dataset,
    train_dataset=tokenized_train_dataset,
)

# Run fine-tuning; as the cell's last expression, the TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0,0.0


TrainOutput(global_step=10000, training_loss=0.00497688834815126, metrics={'train_runtime': 12936.8792, 'train_samples_per_second': 6.184, 'train_steps_per_second': 0.773, 'total_flos': 1306483783680000.0, 'train_loss': 0.00497688834815126, 'epoch': 1.0})

In [80]:
# Evaluate on the validation split passed to the Trainer and print the raw metrics dict.
results = trainer.evaluate()
print(results)


{'eval_loss': 1.1930762866541045e-07, 'eval_runtime': 524.1644, 'eval_samples_per_second': 38.156, 'eval_steps_per_second': 4.769, 'epoch': 1.0}


In [94]:
# One hand-written example for a quick inference smoke test.
# "Description" holds the short question, "Pasien" the patient's full message.
sample_data = {
    "Description": [
        "What should I do to reduce my weight gained due to genetic hypothyroidism?",
    ],
    "Pasien": [
        "Hi doctor, I am a 22-year-old female who was diagnosed with hypothyroidism (genetic) when I was 12. Over the past five years, I have become around 50 pounds overweight and all of my attempts to lose have seemed to fail so I have given up, but my weight has stayed the same. There is so much information put there about losing weight with hypothyroidism but it all seems to conflict. I am so unsure as to what type of exercise and diet I should follow as a result but I still would like to lose weight, but most importantly have my body feel better. What can I do? I am currently on Levothyroxine, Buspar, and Benedryl.",
    ],
}


In [95]:
# Encode the sample question together with the patient's message as a sequence
# pair, returned as PyTorch tensors.
# NOTE(review): tokenize_function (used for training) encodes "Description"
# alone, while this call encodes a pair - confirm which input format is intended.
inputs = tokenizer(
    text=sample_data["Description"],
    text_pair=sample_data["Pasien"],
    max_length=32,  # maximum tokenized length
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)


In [96]:
 # Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch dropout (and similar layers) to inference behaviour explicitly, so the
# prediction does not depend on which Trainer call happened to run last.
model.eval()

# Move every input tensor to the same device as the model.
inputs = {key: value.to(device) for key, value in inputs.items()}

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Per-token scores for the start and the end of the answer span.
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Highest-scoring start / end token index for each example in the batch.
# The two argmaxes are independent, so an end index before the start index is
# possible and yields an empty span downstream.
start_positions = torch.argmax(start_logits, dim=1)
end_positions = torch.argmax(end_logits, dim=1)


In [97]:
# Decode the predicted token span of every sample and print it next to its question.
for idx, question in enumerate(sample_data["Description"]):
    span_start = int(start_positions[idx])
    span_stop = int(end_positions[idx]) + 1  # slice end is exclusive
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][idx])
    answer = tokenizer.convert_tokens_to_string(tokens[span_start:span_stop])
    print(f"Pertanyaan: {question}")
    print(f"Jawaban: {answer}\n")


Pertanyaan: What should I do to reduce my weight gained due to genetic hypothyroidism?
Jawaban: <s>

