In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
!pip install -q transformers==4.40.2 peft==0.10.0 accelerate==0.29.3 datasets


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))


CUDA Available: True
GPU: Tesla P100-PCIE-16GB


### IMPORTING MODEL

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print("CUDA:", torch.cuda.is_available())

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

CUDA: True




### IMPORTING PEFT (FINE TUNING)

In [15]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.10229075496156657


### DATASET LOADING

In [16]:
from datasets import load_dataset

dataset = load_dataset("databricks/databricks-dolly-15k")
dataset = dataset["train"].shuffle(seed=42).select(range(1500))

split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


In [17]:
def format_prompt(example):
    if example["context"]:
        prompt = f"""### Instruction:
{example['instruction']}

### Context:
{example['context']}

### Response:
{example['response']}"""
    else:
        prompt = f"""### Instruction:
{example['instruction']}

### Response:
{example['response']}"""
    return {"text": prompt}

train_dataset = train_dataset.map(format_prompt)
eval_dataset = eval_dataset.map(format_prompt)


Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [18]:
max_length = 512

def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(
    ["instruction", "context", "response", "category", "text"]
)

eval_dataset = eval_dataset.remove_columns(
    ["instruction", "context", "response", "category", "text"]
)


train_dataset.set_format("torch")
eval_dataset.set_format("torch")


Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [15]:
#print(len(train_dataset))
#print(train_dataset[0])


In [19]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import math

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./lora-tinyllama",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,   # Let Trainer handle AMP
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
200,1.672,1.698811




TrainOutput(global_step=252, training_loss=1.704519740172795, metrics={'train_runtime': 1653.918, 'train_samples_per_second': 2.449, 'train_steps_per_second': 0.152, 'total_flos': 1.2827736812814336e+16, 'train_loss': 1.704519740172795, 'epoch': 2.986666666666667})

In [None]:
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])

print(f"Eval Loss: {eval_results['eval_loss']}")
print(f"Perplexity: {perplexity}")


Eval Loss: 1.6981273889541626
Perplexity: 5.463706409058399


In [None]:
model.save_pretrained("tinyllama-lora-adapter")
tokenizer.save_pretrained("tinyllama-lora-adapter")




('tinyllama-lora-adapter/tokenizer_config.json',
 'tinyllama-lora-adapter/special_tokens_map.json',
 'tinyllama-lora-adapter/tokenizer.model',
 'tinyllama-lora-adapter/added_tokens.json',
 'tinyllama-lora-adapter/tokenizer.json')

## Evaluation and Analysis

In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype=torch.float32,
    device_map="auto"
)

base_tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
base_tokenizer.pad_token = base_tokenizer.eos_token



In [27]:
def generate_response(model, tokenizer, prompt, max_new_tokens=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [28]:
test_prompts = [
    "Explain the importance of version control in software development.",
    "What is the difference between supervised and unsupervised learning?",
    "How does gradient descent work?",
    "Why is data preprocessing important in ML?",
    "Explain overfitting in neural networks."
]


In [29]:
for prompt in test_prompts:
    formatted_prompt = f"""### Instruction:
{prompt}

### Response:
"""

    print("="*80)
    print("PROMPT:", prompt)
    
    print("\n--- Base Model ---")
    print(generate_response(base_model, base_tokenizer, formatted_prompt))
    
    print("\n--- Fine-Tuned Model ---")
    print(generate_response(model, tokenizer, formatted_prompt))


PROMPT: Explain the importance of version control in software development.

--- Base Model ---
### Instruction:
Explain the importance of version control in software development.

### Response:
Version control is a critical tool in software development that enables developers to track changes made to code, maintain consistency in project documentation, and ensure that the final product is of high quality. Here are some reasons why version control is essential:

1. Consistency: Version control helps to maintain the consistency of code across different branches. Developers can easily revert to previous versions if they encounter issues or make mistakes.

2. Collaboration: Version control enables multiple developers to work on the same project simultaneously, making it easier to collaborate and share code.

3. Deployment: Version control is essential for deploying software to production. It allows developers to track changes made to the code, deploy them to the production environment, and

In [31]:
trainer_base = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Comparison between Base and finetune V1

In [32]:
print("FOR BASE MODEL")
eval_results = trainer_base.evaluate()
perplexity = math.exp(eval_results["eval_loss"])

print(f"Eval Loss: {eval_results['eval_loss']}")
print(f"Perplexity: {perplexity}")

FOR BASE MODEL


Eval Loss: 2.124366044998169
Perplexity: 8.367591130058685


## experiment and hyper parameter tuning for finding the best lora config

In [8]:
import torch
import math
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

def run_experiment(r_value=8, dataset_size=1500, epochs=3):
    
    print(f"\nRunning Experiment | r={r_value} | dataset={dataset_size} | epochs={epochs}")
    
    # 1️⃣ Load fresh base model
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map="auto"
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    
    # 2️⃣ Inject LoRA
    lora_config = LoraConfig(
        r=r_value,
        lora_alpha=r_value * 2,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    # 3️⃣ Load dataset
    dataset = load_dataset("databricks/databricks-dolly-15k")
    dataset = dataset["train"].shuffle(seed=42).select(range(dataset_size))
    
    split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset["train"]
    eval_dataset = split_dataset["test"]
    
    # 4️⃣ Format prompts
    def format_prompt(example):
        if example["context"]:
            prompt = f"""### Instruction:
{example['instruction']}

### Context:
{example['context']}

### Response:
{example['response']}"""
        else:
            prompt = f"""### Instruction:
{example['instruction']}

### Response:
{example['response']}"""
        return {"text": prompt}
    
    train_dataset = train_dataset.map(format_prompt)
    eval_dataset = eval_dataset.map(format_prompt)
    
    # 5️⃣ Tokenize
    max_length = 512
    
    def tokenize_function(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )
    
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    eval_dataset = eval_dataset.map(tokenize_function, batched=True)
    
    train_dataset = train_dataset.remove_columns(
        ["instruction", "context", "response", "category", "text"]
    )
    eval_dataset = eval_dataset.remove_columns(
        ["instruction", "context", "response", "category", "text"]
    )
    
    train_dataset.set_format("torch")
    eval_dataset.set_format("torch")
    
    # 6️⃣ Training setup
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    training_args = TrainingArguments(
        output_dir=f"./exp_r{r_value}_{dataset_size}",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=epochs,
        learning_rate=2e-4,
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="no",
        fp16=True,
        report_to="none"
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator
    )
    
    trainer.train()
    
    # 7️⃣ Evaluate
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results["eval_loss"])
    
    print(f"Eval Loss: {eval_results['eval_loss']}")
    print(f"Perplexity: {perplexity}")
    
    return perplexity


In [6]:
ppl_3000_r16 = run_experiment(r_value=16, dataset_size=3000, epochs=3)


Running Experiment | r=16 | dataset=3000 | epochs=3




config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.20437245579516677


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
0,1.7369,1.702655
1,1.6245,1.690754
2,1.6122,1.688168


Eval Loss: 1.6881675720214844
Perplexity: 5.409558990248326


In [9]:
ppl_1500_r16 = run_experiment(r_value=16, dataset_size=1500, epochs=3)


Running Experiment | r=16 | dataset=1500 | epochs=3




trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.20437245579516677


Map:   0%|          | 0/150 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
0,No log,1.704668
1,1.741100,1.696427
2,1.659700,1.695156


Eval Loss: 1.6951563358306885
Perplexity: 5.447497537698147


In [13]:
ppl_1500_r16

5.447497537698147

In [11]:
model.save_pretrained("tinyllama-lora-adapter")
tokenizer.save_pretrained("tinyllama-lora-adapter")

NameError: name 'model' is not defined

In [None]:
!pip freeze > requirements.txt