In [1]:
!pip install transformers datasets accelerate



In [2]:
from datasets import load_dataset

dataset = load_dataset("avaliev/chat_doctor")

# Input dataset
sample_size = int(0.1 * len(dataset["train"]))
sampled_train = dataset["train"].shuffle(seed=42).select(range(sample_size))

# Validation set
val_dataset = int(0.1 * len(dataset["validation"]))
sampled_val = dataset["validation"].shuffle(seed=42).select(range(val_dataset))

# Create sample datadict with the sample data
sampled_data = dataset.copy()
sampled_data["train"] = sampled_train
sampled_data["validation"] = sampled_val

print(f"sampled tarin set:", len(sampled_data['train']))
print(f"sampled validation set:", len(sampled_data['validation']))


  from .autonotebook import tqdm as notebook_tqdm


sampled tarin set: 9558
sampled validation set: 1194


In [3]:
# preprocess and tokenize the data

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from tqdm.auto import tqdm
import time

for i in tqdm(range(100)):
    time.sleep(0.1)


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def preprocess_function(examples):

    combined_text = [
        f"Input: {input_text}\n"
        "Instruction: {instruction}"
        for input_text, instruction in zip(examples['input'], examples['instruction'])]

    # Tokenize the combined text with padding
    tokenized_inputs = tokenizer(
        combined_text,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    #
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()

    return tokenized_inputs

tokenized_dataset = sampled_data.copy()
tokenized_dataset["train"] = sampled_data["train"].map(preprocess_function, batched=True)
tokenized_dataset["validation"] = sampled_data["validation"].map(preprocess_function, batched=True)




100%|██████████| 100/100 [00:10<00:00,  9.42it/s]


In [8]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from datetime import datetime

start_time = datetime.now()
print(f"Traing start at: {start_time}")



model = AutoModelForCausalLM.from_pretrained("gpt2-medium")

if tokenizer.pad_token is not None:
  model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./shrey_midbot",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
)

trainer.train(resume_from_checkpoint=r"C:\Users\patel\Downloads\shrey_midbot\checkpoint-7167")



end_time = datetime.now()
print(f"End training at: {end_time}")


traning_time = end_time - start_time
print(f"Total time for trainingis: {traning_time}")

Traing start at: 2025-02-05 17:03:26.834259


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


End training at: 2025-02-05 17:03:36.184870
Total time for trainingis: 0:00:09.350611


In [9]:
path = "./fine_tuned_chat_doctor"
trainer.save_model(path)
tokenizer.save_pretrained(path)

('./fine_tuned_chat_doctor\\tokenizer_config.json',
 './fine_tuned_chat_doctor\\special_tokens_map.json',
 './fine_tuned_chat_doctor\\vocab.json',
 './fine_tuned_chat_doctor\\merges.txt',
 './fine_tuned_chat_doctor\\added_tokens.json',
 './fine_tuned_chat_doctor\\tokenizer.json')

In [21]:
import torch
import textwrap

model.eval()  

print("\nEntering interactive mode (type 'quit' to exit):")
while True:
    user_input = input("User: ")
    if user_input.lower() == "quit":
        break

    # Create a prompt similar to your training examples
    prompt = f"Input: {user_input}\n  Instruction: "
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)  
    
    # Generate a response from the model
    generated_ids = model.generate(input_ids, max_length=512, num_beams=5, no_repeat_ngram_size=2)
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    
    wrapped_response = textwrap.fill(response, width=90)
    
    print("Bot:\n", wrapped_response)



Entering interactive mode (type 'quit' to exit):


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:
 Input: feeling dizzy after running    Instruction:  {instruction} smelling salts detected
in the urine of a patient suffering from aortic aneurysm, the patient has been admitted to
the hospital and is being treated for the same. Shall we be concerned about the presence
of these salts in his urine? Instruction: { ankles detected during a routine check-up, two
of the ankles were found to have a small amount of blood in them, however the other two
are normal in size and are not affected by the blood loss, what is the reason for this and
what are the precautions to be taken in order to avoid the recurrence of such a situation?
Shall the two ankles be sent for an x-ray and a blood test, and if so, how long will it
take to get the results back? How can we prevent the possibility of recurrences of this
kind of an event? What should be the next step to take in this regard? Thank you for your
time.  YYYY@YYYY   Kindly type your query here... Shall We Be Concerned About The Scenting
Salts 