<a href="https://colab.research.google.com/github/UdaraChamidu/Large-Language-Models/blob/main/Fine_Tuned_bioGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Install Required Libraries

In [None]:
pip install transformers datasets accelerate peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

# Step 2: Load Dataset (.jsonl)

In [None]:
from datasets import Dataset
import json

# Location of the JSONL training file (one JSON value per line).
file_path = "/content/cleaned_dataset.jsonl"

# Parse every non-empty line. Blank lines (for example a trailing newline at
# the end of the file) are skipped, because json.loads on an empty string
# raises JSONDecodeError and would abort the whole load.
with open(file_path, "r", encoding="utf-8") as f:
    data_lines = [json.loads(line) for line in f if line.strip()]

# Build an in-memory Hugging Face Dataset from the parsed records.
dataset = Dataset.from_list(data_lines)

# Step 3: Format the dataset correctly using the right keys
def format_example(example):
    """Render one record as a single question/answer training string.

    Reads the ``prompt`` and ``completion`` entries of ``example`` and
    returns a dict with a single ``text`` key holding the templated string.
    """
    template = "### Question:\n{}\n\n### Answer:\n{}"
    rendered = template.format(example["prompt"], example["completion"])
    return {"text": rendered}

# Run format_example over every record; its returned "text" entry becomes a
# new column alongside the existing "prompt" and "completion" columns.
dataset = dataset.map(format_example)


Map:   0%|          | 0/16591 [00:00<?, ? examples/s]

In [None]:
# Sanity check: show the first formatted record to verify the
# "### Question / ### Answer" template (print renders the embedded newlines).
print(dataset[0]["text"])

### Question:
Given your profession as an ophthalmologist, please provide a detailed and comprehensive response to the Question:.For the past week or so I have been excessively rubbing my left eye for some reason. I think it’s a nervous habit. I don’t know how many times I’ve done it but it’s quite a bit. But I am now worried that the rubbing I have done is enough to cause keratoconus (a cornea that becomes thin and misshapened over time). Is it possible that I can get this  the rubbing I have done?


### Answer:
Eye rubbing is a risk factor for development of keratoconus. Certain individuals (for example younger people and those with connective tissue disorders such as Ehlers-Danlos syndrome and osteogenesis imperfecta) are more susceptible to the effects of eye rubbing. You should see your ophthalmologist to see if you are at risk for keratoconus and if the eye rubbing is due to an underlying eye disease such as atopic or allergic conjunctivitis (swelling and inflammation of the whit

# Step 3: Load the Pretrained BioGPT Model

In [None]:
!pip install -q sacremoses


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import BioGptTokenizer, BioGptForCausalLM

# Hub identifier of the base checkpoint; kept in one place so the tokenizer
# and the model are always loaded from the same checkpoint.
MODEL_NAME = "microsoft/biogpt"

tokenizer = BioGptTokenizer.from_pretrained(MODEL_NAME)
model = BioGptForCausalLM.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

# Step 4: Tokenize the Data

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    Returns whatever the tokenizer call returns for ``batch['text']``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize the whole dataset in batches and drop the raw "prompt",
# "completion" and "text" columns from the result.
tokenized_data = dataset.map(tokenize, batched=True, remove_columns=['prompt', 'completion', 'text'])


Map:   0%|          | 0/16591 [00:00<?, ? examples/s]

# Step 5: Define Training Arguments

In [None]:
!pip uninstall -y transformers
!pip install transformers --upgrade


Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.52.4


In [None]:
import transformers
# Report the transformers version that this kernel has imported.
print(transformers.__version__)


4.52.4


In [None]:
from transformers import TrainingArguments

import torch  # local import: only needed here to detect a CUDA device

# Training configuration passed to the Trainer.
training_args = TrainingArguments(
    output_dir="./biogpt-finetuned",   # checkpoints are written here
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    # fp16 mixed precision requires a GPU; enabling it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime, so gate it on CUDA.
    fp16=torch.cuda.is_available(),
    report_to="none",                  # do not send logs to external trackers
)


# Step 6: Initialize Trainer and Train

In [None]:
from transformers import BioGptTokenizer

# Re-create the tokenizer from the same "microsoft/biogpt" checkpoint used by
# the earlier loading cell.
# NOTE(review): this looks redundant unless the runtime was restarted after
# the transformers upgrade; in that case `model` and `tokenized_data` would
# also need to be re-created before training — confirm.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")


In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Collator for causal language modelling: mlm=False turns off masked-LM
# masking, so labels are derived from the input ids themselves.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases
    # (including the 4.52.x installed above) and scheduled for removal;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()


NameError: name 'tokenizer' is not defined

# Step 7: Save and Use Fine-Tuned Model

In [None]:
# Save the trainer's model and the tokenizer into the same directory, which is
# the directory the inference pipeline cell loads both from.
trainer.save_model("./biogpt-finetuned")
tokenizer.save_pretrained("./biogpt-finetuned")


NameError: name 'trainer' is not defined

In [None]:
from transformers import pipeline

# Build a text-generation pipeline whose model and tokenizer both come from
# the saved fine-tuned directory.
finetuned_dir = "./biogpt-finetuned"
pipe = pipeline("text-generation", model=finetuned_dir, tokenizer=finetuned_dir)

# Query it with a prompt in the same "### Question / ### Answer" layout used
# for the training text; the generated result is the cell output.
sample_prompt = "### Question:\nWhat is cataract?\n\n### Answer:\n"
pipe(sample_prompt, max_new_tokens=100)


# Download the Fine-Tuned Model

In [None]:
# Recursively archive the saved model directory into a single zip file.
!zip -r biogpt-finetuned.zip ./biogpt-finetuned

In [None]:
from google.colab import files
# Hand the archive to the Colab file-download helper (Colab-only API).
files.download("biogpt-finetuned.zip")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API).
# NOTE(review): none of the visible cells read from or write to the mounted
# Drive afterwards — confirm whether this cell is still needed.
drive.mount('/content/drive')

# Use the downloaded model anywhere

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Placeholder path: replace it with the directory that holds the extracted
# fine-tuned model files before running this cell.
model_path = "path_to/biogpt-finetuned"  # unzip the archive first if needed

# Load tokenizer and causal-LM weights from that directory.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Push to Hugging Face

In [None]:
!pip install -q huggingface_hub


In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login for notebooks; the token is entered at the
# prompt rather than hardcoded in the notebook.
notebook_login()


In [None]:
# Name of the target Hub repository, without the username/organisation
# prefix; used by the repo-creation and push cells.
repo_name = "biogpt-finetuned-eye-disease"  # You can change this


In [None]:
from huggingface_hub import HfApi, create_repo, upload_folder
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local directory containing the saved fine-tuned model and tokenizer files.
model_dir = "./biogpt-finetuned"

# Create the repository under the logged-in account (no error if it already
# exists). create_repo returns a RepoUrl whose `repo_id` is the full
# "<namespace>/<name>" identifier, so the username never has to be typed in
# by hand; the previous f-string used an undefined `your_username` variable
# and raised NameError.
repo_url = create_repo(repo_name, exist_ok=True)

# Upload the whole folder as a single commit.
upload_folder(
    folder_path=model_dir,
    repo_id=repo_url.repo_id,
    commit_message="Upload fine-tuned BioGPT model",
)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Push the in-memory model and tokenizer to the Hub repository `repo_name`.
# NOTE(review): `model` and `tokenizer` are whatever earlier cells last bound
# to those names — confirm they are the fine-tuned objects before running.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


# Load my model from Hugging Face

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Placeholder Hub repo id: replace "your_username" with the account that
# owns the uploaded model.
hub_repo_id = "your_username/biogpt-finetuned-eye-disease"

# Load tokenizer and causal-LM weights from that Hub repository.
tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
model = AutoModelForCausalLM.from_pretrained(hub_repo_id)
