<a href="https://colab.research.google.com/github/Shahbaz894/artificial-intelligence/blob/main/FineTunePhiModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install accelerate transformers peft bitsandbytes datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nv

In [6]:
from datasets import load_dataset,Dataset
import torch

In [7]:
from transformers import (
    AutoModelForCausalLM,AutoTokenizer,TrainingArguments,Trainer,BitsAndBytesConfig,DataCollatorForLanguageModeling
)

In [8]:
from peft import LoraConfig,get_peft_model,PeftModel
import pandas as pd


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import torch

class LoraFineTuner:
    """
    LoRA fine-tuning pipeline for a Hugging Face causal language model.

    The steps are exposed as individual methods and are meant to be called in
    this order (``run`` does exactly that):

        load_tokenizer -> load_model -> apply_lora
        -> load_and_tokenize_dataset -> train -> save_model

    Calling a step before its prerequisites raises ``RuntimeError``.
    """

    def __init__(self, model_name, dataset_name, output_dir):
        """
        Initialize the fine-tuner with model name, dataset name, and output directory.

        Nothing is downloaded here; the tokenizer, model and tokenized dataset
        start as ``None`` and are filled in by the corresponding pipeline steps.
        """
        print('Initializing parameters...')
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.output_dir = output_dir
        self.tokenizer = None
        self.model = None
        self.tokenized_data = None

    def _require(self, attr: str, loader: str):
        """
        Return ``self.<attr>``, raising ``RuntimeError`` if it is still ``None``.

        ``loader`` is the name of the method the caller should have run first;
        it is only used to build the error message.
        """
        value = getattr(self, attr)
        if value is None:
            raise RuntimeError(f"'{attr}' is not available; call {loader}() first.")
        return value

    def load_tokenizer(self):
        """
        Load tokenizer from the pretrained model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
        # Reuse EOS as the padding token so batches can be padded.
        # NOTE(review): the LM data collator masks pad-token positions in the
        # labels, so EOS positions are presumably excluded from the loss too —
        # confirm this is acceptable for the task.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def load_model(self):
        """
        Load the base model in float16, letting ``device_map="auto"`` place it.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True
        )

    def apply_lora(self):
        """
        Apply LoRA (Low-Rank Adaptation) for fine-tuning the model.

        Wraps the loaded base model with rank-16 adapters on the ``q_proj`` and
        ``v_proj`` modules and prints the trainable-parameter summary.

        Raises:
            RuntimeError: if ``load_model`` has not been called yet.
        """
        base_model = self._require('model', 'load_model')
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias='none',
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(base_model, lora_config)
        self.model.print_trainable_parameters()

    def load_and_tokenize_dataset(self, config_name='main', split: str = 'train', max_length: int = 512):
        """
        Load dataset, check columns, and tokenize it for training.

        If the dataset has no 'text' column but has 'question' and 'answer'
        columns, a 'text' column is built as ``question + ' ' + answer``.

        Args:
            config_name: Dataset configuration passed to ``load_dataset``
                (``None`` selects the dataset's default configuration).
            split: Dataset split to load.
            max_length: Sequences longer than this many tokens are truncated.

        Raises:
            RuntimeError: if ``load_tokenizer`` has not been called yet.
            ValueError: if no 'text' column exists and none can be built.
        """
        # Fail fast, before any download, when the tokenizer is missing.
        tokenizer = self._require('tokenizer', 'load_tokenizer')

        data = load_dataset(self.dataset_name, config_name, split=split)

        # Check available columns
        columns = data.column_names
        print("Dataset columns:", columns)

        if 'text' not in columns:
            if 'question' in columns and 'answer' in columns:
                data = data.map(lambda x: {'text': x['question'] + ' ' + x['answer']})
            else:
                raise ValueError("Dataset must contain a 'text' column for training.")

        def tokenize(sample):
            # No padding here: the data collator used in train() pads each
            # training batch to its own longest sequence, which avoids storing
            # and training on map-chunk-wide padding.
            return tokenizer(sample['text'], truncation=True, max_length=max_length)

        self.tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data")

    def train(self, epochs: int = 1, batch_size: int = 4, learning_rate: float = 2e-4, max_steps: int = 1000):
        """
        Train the model using the provided dataset.

        Args:
            epochs: Number of training epochs. Only takes effect when
                ``max_steps`` is not positive (pass ``max_steps=-1``), because
                a positive ``max_steps`` overrides the epoch count.
            batch_size: Per-device training batch size.
            learning_rate: Peak learning rate of the cosine schedule.
            max_steps: Total number of optimizer steps.

        Raises:
            RuntimeError: if the model, tokenizer or tokenized dataset has not
                been prepared yet.
        """
        model = self._require('model', 'load_model')
        tokenizer = self._require('tokenizer', 'load_tokenizer')
        train_dataset = self._require('tokenized_data', 'load_and_tokenize_dataset')

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=1,
            learning_rate=learning_rate,
            lr_scheduler_type="cosine",
            save_strategy="epoch",
            logging_steps=100,
            max_steps=max_steps,
            num_train_epochs=epochs,
            push_to_hub=True,  # requires an authenticated Hub session
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            train_dataset=train_dataset,
            args=training_args,
            # mlm=False -> causal-LM objective; also pads each batch dynamically.
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
        )

        trainer.train()

    def save_model(self, model_repo: str):
        """
        Save the fine-tuned model to the specified repository.

        Args:
            model_repo: Hub repository id (``"user/name"``) to push to.

        Raises:
            RuntimeError: if no model has been loaded.
        """
        self._require('model', 'load_model').push_to_hub(model_repo)
        print("Model saved successfully.")

    def run(self, model_repo: str = 'shahbazzulfqar/my_finetuned_phi_model'):
        """
        Execute the entire fine-tuning pipeline.

        Args:
            model_repo: Hub repository id the fine-tuned model is pushed to
                after training.

        A ``ValueError`` from dataset preparation is reported and aborts the
        run before training; other exceptions propagate.
        """
        print("Starting fine-tuning process...")
        self.load_tokenizer()
        print("Tokenizer loaded.")

        self.load_model()
        print("Model loaded.")

        self.apply_lora()
        print("LoRA applied.")

        try:
            self.load_and_tokenize_dataset()
            print("Dataset loaded and tokenized.")
        except ValueError as e:
            print(f"Error loading dataset: {e}")
            return

        self.train()
        print("Model training completed.")

        self.save_model(model_repo)
        print("Fine-tuned model saved.")


In [10]:
from google.colab import userdata
# Read the Hugging Face access token from Colab's secret store (never hard-code it).
HF_TOKEN = userdata.get('HF_TOKEN')

In [11]:
from huggingface_hub import login
# Authenticate this session against the Hugging Face Hub.
login(token=HF_TOKEN)

In [12]:
# Run configuration: base checkpoint, training dataset id, local output folder.
model_name, dataset_name, output_dir = (
    "microsoft/phi-1_5",
    "gsm8k",
    "phi-1_5-finetuned",
)

In [13]:
# Echo the configured dataset id as the cell output (sanity check).
dataset_name

'gsm8k'

In [14]:
# Build the pipeline object from the configuration defined above.
fine_tuner = LoraFineTuner(
    model_name=model_name,
    dataset_name=dataset_name,
    output_dir=output_dir,
)

Initializing parameters...


In [15]:
# Run the full pipeline: tokenizer, model, LoRA, dataset, training, Hub upload.
fine_tuner.run()

Starting fine-tuning process...


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Tokenizer loaded.


config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Model loaded.
trainable params: 3,145,728 || all params: 1,421,416,448 || trainable%: 0.2213
LoRA applied.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset columns: ['question', 'answer']


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

Dataset loaded and tokenized.


Step,Training Loss
100,1.1783
200,1.0765
300,1.0456
400,1.0525
500,1.0572
600,1.0346
700,0.9887
800,1.0168
900,1.029
1000,1.0301


Model training completed.


adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

Model saved successfully.
Fine-tuned model saved.


# Reload the fine-tuned model and push it to the Hub

In [16]:
from transformers import AutoModelForCausalLM
# Reload what training wrote to `output_dir`, then publish it to the Hub repo.
model = AutoModelForCausalLM.from_pretrained(
    output_dir,
)
model.push_to_hub(repo_id="shahbazzulfqar/my_finetuned_phi_model")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shahbazzulfqar/my_finetuned_phi_model/commit/2f581740581ca384af6fb98d05c283366211a773', commit_message='Upload PhiForCausalLM', commit_description='', oid='2f581740581ca384af6fb98d05c283366211a773', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shahbazzulfqar/my_finetuned_phi_model', endpoint='https://huggingface.co', repo_type='model', repo_id='shahbazzulfqar/my_finetuned_phi_model'), pr_revision=None, pr_num=None)

# New section