<a href="https://colab.research.google.com/github/NiyaziOnurYantira/FineTuning/blob/main/meta-llama/Llama_3_1_8B_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip installs

!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib

In [None]:
# imports

import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import torch.nn.functional as F
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
from peft import PeftModel
import matplotlib.pyplot as plt

In [None]:
# Constants

# Hub id of the base checkpoint that the PEFT adapter is loaded on top of.
BASE_MODEL = "meta-llama/Llama-3.1-8B"
PROJECT_NAME = "llama-medical-bot"
HF_USER = "OnurYantira" # your HF name here! Or use mine if you just want to reproduce my results.

# The run itself

# Run identifier; together with PROJECT_NAME it forms the adapter repo name.
RUN_NAME = "2025-04-04_11.07.25"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
# Optional revision of the adapter repo; leave as None to load without one.
REVISION = None
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Data

# Plain string literal: there is nothing to interpolate here.
DATASET_NAME = "antareepdey/Medical_chat_Llama-chat-template"

# Hyperparameters for QLoRA

# True selects the 4-bit quantization config, False the 8-bit one.
QUANT_4_BIT = True

# IPython magic selecting the inline matplotlib backend for this notebook.
%matplotlib inline



In [None]:
# Authenticate with the Hugging Face Hub

# The token is read from the Colab secret named 'HF_TOKEN'.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Load the dataset named by DATASET_NAME and keep a handle on its 'train' split.
dataset = load_dataset(path=DATASET_NAME)
train = dataset["train"]

In [None]:
# Build a reproducible 4,000 / 1,000 train/test split from a 5,000-example sample.
SUBSET_SIZE = 5000
TRAIN_SIZE = 4000

# Start from the untouched split in `dataset` rather than from `train`, so this
# cell can be re-run: `train` is reassigned below to a 4,000-row subset, and
# selecting 5,000 rows from that would fail.
subset = dataset['train'].shuffle(seed=42).select(range(SUBSET_SIZE))

# Row positions inside `subset`: the first 4,000 rows are used for training,
# the last 1,000 rows (4000..4999) for testing.
train_indices = list(range(TRAIN_SIZE))
test_indices = list(range(TRAIN_SIZE, SUBSET_SIZE))

# Create the new datasets
train = subset.select(train_indices)
test = subset.select(test_indices)

# Check the results: the two parts must account for every sampled row
assert len(train) + len(test) == SUBSET_SIZE
print(f"Yeni Train Dataset Boyutu: {len(train)}")
print(f"Yeni Test Dataset Boyutu: {len(test)}")

In [None]:
# Display the first example of the test split.
test[0]

In [None]:
# pick the right quantization (thank you Robert M. for spotting the bug with the 8 bit version!)

if QUANT_4_BIT:
  # 4-bit NF4 weights with double quantization; computation runs in bfloat16.
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
  )
else:
  # BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (the compute-dtype
  # setting exists only for 4-bit), so the 8-bit path only needs the flag.
  quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# NOTE(review): QUANT_4_BIT is already set to True in the constants cell, and
# quant_config was built from it in an earlier cell, so this reassignment does
# not change quant_config.
QUANT_4_BIT = True

In [None]:
# Display the quantization config that will be passed to the model loader.
quant_config

In [None]:
# Load the Tokenizer and the Model

# trust_remote_code is intentionally not enabled: it permits running Python code
# shipped in the Hub repo, which this tokenizer load does not need.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep generation padding consistent with the tokenizer's pad token.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Load the fine-tuned model with PEFT
# `revision` is forwarded only when REVISION is set, exactly as before, but with
# a single call site instead of two duplicated branches.
revision_kwargs = {"revision": REVISION} if REVISION else {}
fine_tuned_model = PeftModel.from_pretrained(base_model, FINETUNED_MODEL, **revision_kwargs)

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# Display the repr of the adapter-wrapped model.
fine_tuned_model

In [None]:
# NOTE(review): the base model was loaded with device_map="auto", so this
# explicit move is presumably redundant on a single-GPU runtime — confirm
# before relying on it, especially with quantized weights.
fine_tuned_model = fine_tuned_model.to("cuda")


In [None]:
# Plain-text prompt: system instruction and user question joined by a newline.
system_prompt = "You are a medical chat bot. Keep your answers clear and concise. Respond in English."
user_prompt = "What should be the sleep routine for newborn babies?"
full_prompt = system_prompt + "\n" + user_prompt

# Tokenize and move the tensors to the GPU.
inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

# Sampling settings; max_length is passed to generate() unchanged.
sampling_args = dict(
    max_length=150,
    temperature=0.7,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

# The attention mask is passed explicitly alongside the input ids.
output_ids = fine_tuned_model.generate(input_ids, attention_mask=attention_mask, **sampling_args)

# Decode the first (only) returned sequence, dropping special tokens.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)

In [None]:
# Chat messages (the prompt is written out by hand)
system_prompt = "You are a medical chat bot. Keep your answers clear and concise. Respond in English."
user_prompt = "What should be the sleep routine for newborn babies?"

# Prompt format for LLaMA 3.1:
prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}\n<|start_header_id|>user<|end_header_id|>\n{user_prompt}\n<|start_header_id|>assistant<|end_header_id|>\n"

# Tokenization
# The prompt already begins with <|begin_of_text|>; add_special_tokens=False
# keeps the tokenizer from prepending a second BOS token in front of it.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")

# Generate
outputs = fine_tuned_model.generate(
    **inputs,
    max_new_tokens=712,
    temperature=0.7,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    eos_token_id=tokenizer.eos_token_id
)

# Extract the answer
# Full decoded sequence (prompt + continuation, special tokens kept) for inspection.
output_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
# The returned sequence starts with the prompt tokens, so the answer is taken
# by position. Splitting the decoded text on header strings would return the
# wrong span if the model emitted another assistant header while generating.
prompt_length = inputs["input_ids"].shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
print(response)

In [None]:
# Show the tokenizer's EOS token: first its id, then the text it decodes to.
eos_id = tokenizer.eos_token_id
print(eos_id)
print(tokenizer.decode([eos_id]))