In [1]:
!pip install torch transformers peft bitsandbytes datasets evaluate


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from 

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
!unzip "/content/drive/My Drive/e-doctor-qlora-adapter.zip" -d /content/


Archive:  /content/drive/My Drive/e-doctor-qlora-adapter.zip
   creating: /content/e-doctor-qlora-adapter/
  inflating: /content/e-doctor-qlora-adapter/tokenizer.json  
  inflating: /content/e-doctor-qlora-adapter/chat_template.jinja  
  inflating: /content/e-doctor-qlora-adapter/adapter_model.safetensors  
  inflating: /content/e-doctor-qlora-adapter/adapter_config.json  
  inflating: /content/e-doctor-qlora-adapter/tokenizer_config.json  
  inflating: /content/e-doctor-qlora-adapter/special_tokens_map.json  
  inflating: /content/e-doctor-qlora-adapter/README.md  


In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

base_model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
adapter_path = "/content/e-doctor-qlora-adapter"

tokenizer = AutoTokenizer.from_pretrained(adapter_path)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [17]:
import json
json_path = '/content/test___qlora.json'
with open(json_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

In [9]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9243268ce1bdfa8c0e673c21087ecca57ac1046c26dcdd0b92cc8ba7b39c48e7
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [18]:
import evaluate

rouge = evaluate.load('rouge')
predictions = []
references = []

for example in test_data:
    prompt = example['input']
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False
        )
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predictions.append(pred)
    references.append(example['output'])

rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE:", rouge_results)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more de

ROUGE: {'rouge1': np.float64(0.2019331541771915), 'rouge2': np.float64(0.04892896930273352), 'rougeL': np.float64(0.1179159215045833), 'rougeLsum': np.float64(0.1318675400391395)}


In [19]:
import math

def calculate_ppl(model, tokenizer, text):
    encodings = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings.input_ids)
        loss = outputs.loss
    return math.exp(loss.item())

ppl_scores = []
for example in test_data:
    ppl = calculate_ppl(model, tokenizer, example["output"])
    ppl_scores.append(ppl)
avg_ppl = sum(ppl_scores) / len(ppl_scores)
print("Average Perplexity:", avg_ppl)


Average Perplexity: 22.861536780300153


In [16]:
import time

latencies = []
for example in test_data:
    prompt = example['input']
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    start_time = time.time()
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False
        )
    end_time = time.time()
    latencies.append(end_time - start_time)
avg_latency = sum(latencies) / len(latencies)
print("Average Latency (seconds):", avg_latency)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more de

Average Latency (seconds): 15.72607342004776


In [22]:
import os

def get_dir_size(path):
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total += os.path.getsize(fp)
    return total

adapter_size = get_dir_size(adapter_path) / (1024 ** 2)

print(f"Adapter size: {adapter_size:.2f} MB")


Adapter size: 176.53 MB


In [23]:
import pandas as pd

metrics = ['ROUGE-1', 'ROUGE-2', 'PPL', 'Latency (s)', 'Model Size (MB)']
values = [0.201933, 0.04892, 22.8, 15.726, 176.53]

description = [
    'Unigram overlap (measures recall of important words)',
    'Bigram overlap (measures recall of important word pairs)',
    'reflects model fluency and prediction quality',
    'Inference time per token',
    'Disk size of the fine-tuned model (smaller is more deployable)'
]

comparative_results_df = pd.DataFrame({
    'Metric': metrics,
    'Value': values,
    'Description': description
})


print(comparative_results_df.to_markdown(index=False))


| Metric          |      Value | Description                                                    |
|:----------------|-----------:|:---------------------------------------------------------------|
| ROUGE-1         |   0.201933 | Unigram overlap (measures recall of important words)           |
| ROUGE-2         |   0.04892  | Bigram overlap (measures recall of important word pairs)       |
| PPL             |  22.8      | reflects model fluency and prediction quality                  |
| Latency (s)     |  15.726    | Inference time per token                                       |
| Model Size (MB) | 176.53     | Disk size of the fine-tuned model (smaller is more deployable) |
