In [1]:
!pip install torch gradio transformers
!pip install bitsandbytes 
!pip install peft



Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Col

In [2]:
import os
import torch
import wandb

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

from datasets import load_dataset
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Optional: for training with TRL's SFTTrainer
# from trl import SFTTrainer


In [3]:
# Read the API tokens from the secrets client instead of hard-coding them in the notebook.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")  # Hugging Face Hub token
wb_token = user_secrets.get_secret("wandb")  # Weights & Biases API key

# Authenticate with the Hugging Face Hub, then with Weights & Biases.
login(token=hf_token)
wandb.login(key=wb_token)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33muu0712[0m ([33muu0712-engineering-student-council[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# bitsandbytes quantization settings: 4-bit loading, NF4 quantization type,
# double quantization enabled, bfloat16 as the compute dtype.
# NOTE(review): none of the from_pretrained calls visible in this notebook pass
# `quant_config`, so these settings look unused — confirm.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16"
)

In [5]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and its tokenizer from the Hugging Face Hub (weights in float16).
# NOTE(review): the saved output of this cell reports that loading the adapter weights
# "led to unexpected keys not found in the model" — verify the adapter is actually applied.
model_name = "sparky353454/last_latest_gemma_2_2b_it"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Function to generate response
def generate_answer(question):
    """Generate the model's answer to a single question.

    The question is wrapped in a "Question: ... / Answer:" prompt, passed to the
    notebook-level ``model`` via ``generate`` (at most 200 new tokens), and only
    the newly generated text is returned.

    Args:
        question: Question text entered by the user.

    Returns:
        The generated continuation, decoded without special tokens and stripped
        of surrounding whitespace.
    """
    prompt = f"Question: {question}\nAnswer:"
    # Place the inputs on the model's own device rather than assuming "cuda":
    # the model is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=200)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of splitting the decoded text on "Answer:", which would
    # drop part of the answer whenever the generation itself contains "Answer:".
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Create Gradio UI
# Build the two text boxes first, then wire them to generate_answer.
question_box = gr.Textbox(label="Enter Your Math Question")
answer_box = gr.Textbox(label="Model's Answer")

iface = gr.Interface(
    fn=generate_answer,
    inputs=question_box,
    outputs=answer_box,
    title="Gemma 2B - GSM8K Math Solver",
    description="Enter a mathematical reasoning question, and the fine-tuned Gemma 2B model will generate the answer.",
)

# Start serving the interface
iface.launch()

tokenizer_config.json:   0%|          | 0.00/47.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Loading adapter weights from sparky353454/last_latest_gemma_2_2b_it led to unexpected keys not found in the model: model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.lora_A.default.weight, model.layers.1.self_attn.k_proj.lora_B.default.weight, model.layers.1.self_attn.o_proj.lora_A.default.weight, model.layers.1.self_attn.o_proj.lora_B.default.weight, model.layers.1.mlp.gate_proj.lora_A.default.weight, model.layers.1.mlp.gate_proj.lora_B.default.weight, model.layers.1.mlp.up_proj.lora_A

* Running on local URL:  http://127.0.0.1:7860
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://512363396c8527c719.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


In [None]:
# Jack is 4 years older than Jill. In 6 years, Jack will be twice as old as Jill. How old are they now?
# A car travels 150 km on 10 liters of fuel. How far can it travel on 25 liters?

# The sum of three consecutive odd numbers is 75. What are the numbers?

# If 5 workers complete a task in 12 days, how many days would it take for 10 workers to do the same task?

In [None]:
# Base model test using Gradio

In [None]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model (the same repo id the latency cell calls `base_model_name`),
# not the fine-tuned one. This rebinds `model_name`, `tokenizer` and `model`
# from the earlier fine-tuned cell.
model_name = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Function to generate response
def generate_answer(question):
    """Generate the model's answer to a single question.

    The question is wrapped in a "Question: ... / Answer:" prompt, passed to the
    notebook-level ``model`` via ``generate`` (at most 512 new tokens, using the
    tokenizer's EOS and PAD token ids), and only the newly generated text is
    returned.

    Args:
        question: Question text entered by the user.

    Returns:
        The generated continuation, decoded without special tokens and stripped
        of surrounding whitespace.
    """
    prompt = f"Question: {question}\nAnswer:"
    # Place the inputs on the model's own device rather than assuming "cuda":
    # the model is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Drop the prompt by token count. Slicing the decoded string by len(prompt)
    # is only right if decoding reproduces the prompt text character for
    # character, which a tokenizer round trip does not guarantee.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][prompt_length:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return answer

# Gradio UI
# This cell serves the base checkpoint loaded above, so the description must
# not tell users they are talking to the fine-tuned model.
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Enter Your Math Question"),
    outputs=gr.Textbox(label="Model's Answer"),
    title="Gemma 2B - GSM8K Math Solver",
    description="Enter a mathematical reasoning question, and the base Gemma 2B model will generate the answer."
)

iface.launch()


In [None]:
#benchmarking 

In [5]:
import getpass

# Prompt for GitHub token
token = getpass.getpass("Enter your GitHub token (starts with 'ghp_' or 'github_pat_...'): ")

# Use EleutherAI repo instead of OpenAI
repo_url = f"https://{token}:x-oauth-basic@github.com/EleutherAI/lm-evaluation-harness.git"

# Clone it
!git clone {repo_url}


Enter your GitHub token (starts with 'ghp_' or 'github_pat_...'):  ········


Cloning into 'lm-evaluation-harness'...
remote: Enumerating objects: 49683, done.[K
remote: Counting objects: 100% (99/99), done.[K
remote: Compressing objects: 100% (91/91), done.[K
remote: Total 49683 (delta 74), reused 8 (delta 8), pack-reused 49584 (from 3)[K
Receiving objects: 100% (49683/49683), 29.69 MiB | 21.16 MiB/s, done.
Resolving deltas: 100% (34367/34367), done.


In [None]:
#installation

In [6]:
%cd lm-evaluation-harness
!pip install -e .


/kaggle/working/lm-evaluation-harness
Obtaining file:///kaggle/working/lm-evaluation-harness
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting evaluate (from lm_eval==0.4.8)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jsonlines (from lm_eval==0.4.8)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pytablewriter (from lm_eval==0.4.8)
  Downloading pytablewriter-1.2.1-py3-none-any.whl.metadata (38 kB)
Collecting rouge-score>=0.0.4 (from lm_eval==0.4.8)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu>=1.5.0 (from lm_eval==0.4.8)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
#benchmarks code for finetuned model

In [None]:
# Evaluate the fine-tuned checkpoint on the gsm8k task through the lm_eval CLI:
# `hf` model backend, CUDA device, automatic batch size, results written to
# results.json with --log_samples enabled. No --num_fewshot is given here.
!python -m lm_eval \
  --model hf \
  --model_args pretrained=sparky353454/last_latest_gemma_2_2b_it,revision=main,use_auth_token=True \
  --tasks gsm8k \
  --device cuda \
  --batch_size auto \
  --output_path results.json \
  --log_samples

#run in console 

In [None]:
#hiridharian code for finetuned model

In [None]:
!python -m lm_eval \
  --model hf \
  --model_args pretrained=sparky353454/last_latest_gemma_2_2b_it,revision=main,use_auth_token=True \
  --tasks gsm8k \
  --num_fewshot 1 \
  --device cuda:auto \
  --batch_size auto \
  --output_path ./HUMANEVAL_01.json \
  --confirm_run_unsafe_code

In [None]:
#hiridharian code for base model

In [None]:
!python -m lm_eval \
  --model hf \
  --model_args pretrained=google/gemma-2-2b-it,revision=main,use_auth_token=True \
  --tasks gsm8k \
  --num_fewshot 1 \
  --device cuda:auto \
  --batch_size auto \
  --output_path ./HUMANEVAL_01.json \
  --confirm_run_unsafe_code

In [None]:
# old fine tuned benchmarks

In [None]:
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.2737|±  |0.0123|
|     |       |strict-match    |     5|exact_match|↑  |0.2858|±  |0.0124|


In [None]:
#time latency test 

In [9]:
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model_and_tokenizer(model_name, use_auth_token=True):
    """Load a causal LM and its tokenizer, ready for GPU inference.

    The model weights are loaded in float16, moved to "cuda", and switched to
    eval mode.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_auth_token: Authentication setting forwarded to ``from_pretrained``
            as its ``token`` argument.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # transformers deprecated the `use_auth_token` keyword of from_pretrained in
    # favour of `token`. This function keeps its own parameter name so existing
    # callers still work, and forwards the value under the current keyword.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=use_auth_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=use_auth_token, torch_dtype=torch.float16)
    model.to("cuda")
    model.eval()
    return model, tokenizer

def measure_latency(model, tokenizer, prompt, n_trials=20, max_new_tokens=50):
    """Time repeated ``model.generate`` calls on a single prompt.

    One untimed warm-up call is made first, then ``n_trials`` timed calls.

    NOTE(review): ``generate`` may stop before ``max_new_tokens`` if the model
    emits an end-of-sequence token, so two models can end up being timed on
    different output lengths — confirm before comparing averages.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable turning ``prompt`` into model inputs
            (``return_tensors="pt"``) that support ``.to(device)``.
        prompt: Text to generate from.
        n_trials: Number of timed generations; must be positive.
        max_new_tokens: Upper bound on new tokens per generation.

    Returns:
        ``(avg_time, times)``: the mean wall-clock seconds per call and the
        list of per-call timings.

    Raises:
        ValueError: If ``n_trials`` is not positive.
    """
    if n_trials <= 0:
        # Fail before any GPU work instead of dividing by zero after the warm-up.
        raise ValueError("n_trials must be a positive integer")

    def _sync():
        # Wait for queued GPU work so the timer covers the whole generation;
        # skipped when CUDA is unavailable.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Follow the model's device when it exposes one; otherwise keep "cuda".
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    times = []
    # Inference only: no autograd bookkeeping is needed while timing.
    with torch.no_grad():
        # Warm-up run
        model.generate(**inputs, max_new_tokens=max_new_tokens)

        for _ in range(n_trials):
            _sync()
            # perf_counter is monotonic and high-resolution, unlike time.time().
            start_time = time.perf_counter()
            model.generate(**inputs, max_new_tokens=max_new_tokens)
            _sync()
            times.append(time.perf_counter() - start_time)

    avg_time = sum(times) / n_trials
    return avg_time, times

# Define models
# Repo ids of the two checkpoints being compared, and the single prompt both are timed on.
base_model_name = "google/gemma-2-2b-it"
finetuned_model_name = "sparky353454/last_latest_gemma_2_2b_it"
prompt = "Solve the following problem: If you have 10 apples and give away 3, how many apples do you have?"

# Load models (both are kept on the GPU at the same time)
base_model, base_tokenizer = load_model_and_tokenizer(base_model_name)
finetuned_model, finetuned_tokenizer = load_model_and_tokenizer(finetuned_model_name)

# Measure latency with measure_latency's defaults (n_trials=20, max_new_tokens=50)
base_avg, base_times = measure_latency(base_model, base_tokenizer, prompt)
finetuned_avg, finetuned_times = measure_latency(finetuned_model, finetuned_tokenizer, prompt)

# Output: average seconds per generate call for each model
print(f"Base Model Average Latency: {base_avg:.4f} seconds")
print(f"Fine-Tuned Model Average Latency: {finetuned_avg:.4f} seconds")




tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter weights from sparky353454/last_latest_gemma_2_2b_it led to unexpected keys not found in the model: model.layers.0.self_attn.k_proj.lora_A.default.weight, model.layers.0.self_attn.k_proj.lora_B.default.weight, model.layers.0.self_attn.o_proj.lora_A.default.weight, model.layers.0.self_attn.o_proj.lora_B.default.weight, model.layers.0.mlp.gate_proj.lora_A.default.weight, model.layers.0.mlp.gate_proj.lora_B.default.weight, model.layers.0.mlp.up_proj.lora_A.default.weight, model.layers.0.mlp.up_proj.lora_B.default.weight, model.layers.0.mlp.down_proj.lora_A.default.weight, model.layers.0.mlp.down_proj.lora_B.default.weight, model.layers.1.self_attn.k_proj.lora_A.default.weight, model.layers.1.self_attn.k_proj.lora_B.default.weight, model.layers.1.self_attn.o_proj.lora_A.default.weight, model.layers.1.self_attn.o_proj.lora_B.default.weight, model.layers.1.mlp.gate_proj.lora_A.default.weight, model.layers.1.mlp.gate_proj.lora_B.default.weight, model.layers.1.mlp.up_proj.lora_A

Base Model Average Latency: 0.5616 seconds
Fine-Tuned Model Average Latency: 3.1158 seconds


In [None]:
# Base Model Average Latency: 0.5616 seconds
# Fine-Tuned Model Average Latency: 3.1158 seconds

In [None]:
#1 shot finetuned

In [None]:
hf (pretrained=sparky353454/last_latest_gemma_2_2b_it,revision=main,use_auth_token=True), gen_kwargs: (None), limit: None, num_fewshot: 1, batch_size: auto
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     1|exact_match|↑  |0.5019|±  |0.0138|
|     |       |strict-match    |     1|exact_match|↑  |0.0910|±  |0.0079|



In [None]:
#5-shot finetuned

In [None]:
hf (pretrained=sparky353454/last_latest_gemma_2_2b_it,revision=main,use_auth_token=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.4617|±  |0.0137|
|     |       |strict-match    |     5|exact_match|↑  |0.4473|±  |0.0137|


In [None]:
# base model benchmarks 

In [None]:
# 5-shot base model benchmarks

In [None]:
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.4556|±  |0.0137|
|     |       |strict-match    |     5|exact_match|↑  |0.4496|±  |0.0137|


In [None]:
#1 shot benchmarking basemodel

In [None]:
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     1|exact_match|↑  |0.5118|±  |0.0138|
|     |       |strict-match    |     1|exact_match|↑  |0.1084|±  |0.0086|


In [7]:
!python -m lm_eval \
  --model hf \
  --model_args pretrained=google/gemma-2-2b-it,revision=main,use_auth_token=True \
  --tasks gsm8k \
  --num_fewshot 1 \
  --device cuda:auto \
  --batch_size auto \
  --output_path ./HUMANEVAL_01.json \
  --confirm_run_unsafe_code

2025-04-24 06:45:10.142988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-24 06:45:10.165794: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-24 06:45:10.172180: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100%|█████████████████████████████| 838/838 [00:00<00:00, 4.72MB/s]
tokenizer_config.json: 100%|███████████████| 47.0k/47.0k [00:00<00:00, 4.36MB/s]
tokenizer.model: 100%|█████████████████████| 4.24M/4.24M [00:00<00:00, 45.6MB/s]
tokenizer.json: 100%|███████████████████████| 17.5M/17.5M [00:00<00:00, 214MB/s]
special_tokens_map.json: 100%|███████████