In [1]:
!pip install -q -U transformers==4.41.0
!pip install -q -U bitsandbytes==0.43.1
!pip install -q -U scipy
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
!pip install bitsandbytes==0.43.3
!pip install transformers==4.44.0 datasets==2.20.0 accelerate==0.33.0
!pip install peft==0.12.0 trl==0.9.6
!pip install scipy sentencepiece

print("‚úì Installation complete!")

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting bitsandbytes==0.43.3
  Using cached bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.43.1
    Uninstalling bitsandbytes-0.43.1:
      Successfully uninstalled bitsandbytes-0.43.1
Successfully installed bitsandbytes-0.43.3
Collecting transformers==4.44.0
  Using cached transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.44.0-py3-none-any.whl (9.5 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.0
    Uninstalling transformers-4.41.0:
      Successfully uninstalled transformers-4.41.0
Successfully installed transformers-4.44.0
‚úì Installation complete!


In [2]:
import os
import sys
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define your Project Root
project_root_path = '/content/drive/MyDrive/Operation Ledger Mind'

# 3. Change the working directory to the root
os.chdir(project_root_path)

# 4. Add the root to Python's search path so it finds 'src'
if project_root_path not in sys.path:
    sys.path.append(project_root_path)

# 5. Verify it works
print(f"Current working directory: {os.getcwd()}")
print("Setup complete. You can now import from 'src'!")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Operation Ledger Mind
Setup complete. You can now import from 'src'!


In [3]:
import os
import json
import torch
from pathlib import Path
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Configuration
config = {
    "model_name": "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "train_data_path": "train.jsonl",
    "output_dir": "models/adapters",
    "max_seq_length": 512,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-4,
    "max_steps": 100,
    'hf_username': 'UdayangaWeerakoon',
    'hub_model_name': 'uber-llama-3',
}

# Create output directory
Path(config["output_dir"]).mkdir(parents=True, exist_ok=True)

print(f"üöÄ Fine-tuning Configuration:")
for key, value in config.items():
    print(f"   {key}: {value}")

üöÄ Fine-tuning Configuration:
   model_name: unsloth/llama-3-8b-Instruct-bnb-4bit
   train_data_path: train.jsonl
   output_dir: models/adapters
   max_seq_length: 512
   num_train_epochs: 1
   per_device_train_batch_size: 1
   gradient_accumulation_steps: 4
   learning_rate: 0.0002
   max_steps: 100
   hf_username: UdayangaWeerakoon
   hub_model_name: uber-llama-3


In [4]:
def load_jsonl(filepath: str):
    """Load JSONL file into a list of dictionaries."""
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

# Load training data
train_data = load_jsonl(config["train_data_path"])
print(f"‚úÖ Loaded {len(train_data)} training examples")

# Display sample
print(f"\nüìã Sample Training Example:")
print(f"   Question: {train_data[0]['question']}")
print(f"   Answer: {train_data[0]['answer'][:100]}...")

‚úÖ Loaded 960 training examples

üìã Sample Training Example:
   Question: What trends in shareholder returns might arise from Uber's share repurchase initiative?
   Answer: Based on the provided context, Uber's board of directors authorized a share repurchase of up to $7.0...


In [5]:
def format_instruction(example: dict) -> str:
    """
    Format Q/A pair into Llama-3 instruction format.
    """
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a financial analyst expert on Uber Technologies. Answer questions accurately based on the 2024 Annual Report.<|eot_id|><|start_header_id|>user<|end_header_id|>

{example['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{example['answer']}<|eot_id|>"""

# Format all training examples
formatted_data = [
    {"text": format_instruction(example)}
    for example in train_data
]

# Create HuggingFace dataset
dataset = Dataset.from_list(formatted_data)

print(f"\nüì¶ Dataset prepared: {len(dataset)} examples")
print(f"\nüìù Formatted Example:")
print(dataset[0]["text"][:300] + "...")


üì¶ Dataset prepared: 960 examples

üìù Formatted Example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a financial analyst expert on Uber Technologies. Answer questions accurately based on the 2024 Annual Report.<|eot_id|><|start_header_id|>user<|end_header_id|>

What trends in shareholder returns might arise from Uber's share repur...


In [6]:
# BitsAndBytes 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit loading
    bnb_4bit_quant_type="nf4",              # Use NF4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bfloat16
    bnb_4bit_use_double_quant=True,         # Double quantization for extra compression
)

print("‚úÖ Quantization config:")
print(f"   Type: 4-bit NF4")
print(f"   Double quantization: Enabled")
print(f"   Compute dtype: bfloat16")

‚úÖ Quantization config:
   Type: 4-bit NF4
   Double quantization: Enabled
   Compute dtype: bfloat16


In [None]:
from huggingface_hub import login

login(token="YOUR_HF_TOKEN")

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config["model_name"],
    trust_remote_code=True,
    padding_side="right",  # Required for decoder-only models
)

# Set pad token (Llama models don't have one by default)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Tokenizer loaded: {config["model_name"]}")
print(f"Vocab size: {len(tokenizer)}")
print(f"PAD token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.eos_token}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Tokenizer loaded: unsloth/llama-3-8b-Instruct-bnb-4bit
Vocab size: 128256
PAD token: <|eot_id|>
EOS token: <|eot_id|>


In [11]:
# Load base model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    config["model_name"],
    quantization_config=bnb_config,
    device_map="auto",  # Automatically distribute model across available GPUs
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

print(f"\nModel loaded: {config["model_name"]}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

# Print model memory footprint
if torch.cuda.is_available():
    print(f"\nGPU Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU Memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

config.json: 0.00B [00:00, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]


Model loaded: unsloth/llama-3-8b-Instruct-bnb-4bit
Model device: cuda:0
Model dtype: torch.float32

GPU Memory allocated: 7.27 GB
GPU Memory reserved: 9.23 GB


In [12]:
# LoRA configuration targeting attention projection layers
lora_config = LoraConfig(
    r=16,                              # Rank of the low-rank matrices
    lora_alpha=32,                     # Scaling factor (typically 2*r)
    target_modules=[                   # Attention modules to apply LoRA
        "q_proj",                      # Query projection
        "k_proj",                      # Key projection
        "v_proj",                      # Value projection
        "o_proj",                      # Output projection
    ],
    lora_dropout=0.05,                 # Dropout probability for LoRA layers
    bias="none",                       # Don't train bias parameters
    task_type="CAUSAL_LM",             # Task type for causal language modeling
)

print("LoRA Configuration:")
print(f"  - Rank (r): {lora_config.r}")
print(f"  - Alpha: {lora_config.lora_alpha}")
print(f"  - Target modules: {lora_config.target_modules}")
print(f"  - Dropout: {lora_config.lora_dropout}")
print(f"  - Task type: {lora_config.task_type}")

LoRA Configuration:
  - Rank (r): 16
  - Alpha: 32
  - Target modules: {'v_proj', 'k_proj', 'o_proj', 'q_proj'}
  - Dropout: 0.05
  - Task type: CAUSAL_LM


In [13]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

print("\nLoRA adapters added successfully!")

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695

LoRA adapters added successfully!


In [15]:
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    max_steps=config["max_steps"],
    per_device_train_batch_size=config["per_device_train_batch_size"],
    gradient_accumulation_steps=config["gradient_accumulation_steps"],
    learning_rate=config["learning_rate"],
    num_train_epochs=config["num_train_epochs"],
    gradient_checkpointing=True,

    # Optimization
    optim="paged_adamw_8bit",
    fp16=False,
    bf16=True,

    # Logging
    logging_steps=10,
    logging_dir=f"{config['output_dir']}/logs",

    # Checkpointing
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,

    # Other
    warmup_steps=10,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    report_to="none",
    group_by_length=True,
    seed=42,
)

print("‚úÖ Training arguments configured")

‚úÖ Training arguments configured


In [16]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    tokenizer=tokenizer,
    dataset_text_field="text",
    peft_config=lora_config,
    max_seq_length=config["max_seq_length"],
    packing=False,
)

print("‚úÖ SFTTrainer initialized and ready to train")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/960 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


‚úÖ SFTTrainer initialized and ready to train


In [17]:
print("\nüöÄ Starting training...\n")

# Train
train_result=trainer.train()

print("\n‚úÖ Training complete!")

print(f"Total training time: {train_result.metrics.get('train_runtime', 0):.2f}s")
print(f"Samples per second: {train_result.metrics.get('train_samples_per_second', 0):.2f}")
print(f"Steps per second: {train_result.metrics.get('train_steps_per_second', 0):.4f}")


üöÄ Starting training...



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,2.8553
20,1.7287
30,1.4899
40,1.3384
50,1.2037
60,1.4289
70,1.4041
80,1.3684
90,1.3507
100,1.1201


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]



‚úÖ Training complete!
Total training time: 1975.61s
Samples per second: 0.20
Steps per second: 0.0510


In [20]:
# Save LoRA adapters
trainer.model.save_pretrained(config["output_dir"])
tokenizer.save_pretrained(config["output_dir"])

print(f"\nLoRA adapters saved to: {config["output_dir"]}")
print(f"Tokenizer saved to: {config["output_dir"]}")

model_card = f"""---
license: llama3
base_model: meta-llama/Meta-Llama-3-8B-Instruct
tags:
- llama3
- lora
- qlora
- instruction-tuning
- uber
- roleplay
- intern
---

# Uber Intern ‚Äî "The Intern" LoRA Adapter

This is a **LoRA adapter** fine-tuned on top of **Llama-3-8B-Instruct** to emulate an **Uber intern** who is familiar with Uber's 2024 strategy, internal tone, communication style, priorities, and typical ways of responding to questions.

The model has been trained to answer questions in the style of a helpful, enthusiastic, slightly junior-but-eager Uber intern.

## Model Details

**Base Model**
unsloth/llama-3-8b-Instruct-bnb-4bit

**Adapter Type**
LoRA (Low-Rank Adaptation) + 4-bit QLoRA quantization during training

**Fine-tuning Goal**
Transform a generic model into "The Intern" ‚Äî someone who knows Uber's 2024 direction, speaks in Uber's internal tone, and answers questions helpfully as if they were a real intern.

## Training Setup

- **Quantization**: 4-bit NormalFloat (NF4) with double quantization (via `BitsAndBytesConfig`)
- **LoRA Config**:
  - Target modules: `q_proj`, `k_proj`, `v_proj`, `o_proj`
  - Rank (`r`): [your value ‚Äî e.g. 16, 32, 64]
  - Alpha: [your value ‚Äî e.g. 16 or 32]
  - Dropout: [your value ‚Äî usually 0.05 or 0.1]
- **Trainer**: `trl.SFTTrainer`
- **Training steps**: minimum 100 steps (as per task constraint)
- **Context length**: [your value ‚Äî e.g. 8192 or 16384]
- **Effective batch size**: [your value ‚Äî e.g. 4 / 8 / 16 depending on gradient accumulation]
- **Optimizer / LR**: [e.g. AdamW 8-bit, learning rate 1e-4 or 5e-5]

## Intended Use

- Asking questions as if talking to an Uber intern
- Simulating internal Uber discussions / brainstorming
- Learning about Uber's 2024 strategy in a conversational way
- Role-playing / creative writing in Uber context
- Educational / research / fun purposes

## Usage Example

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model (use quantization if desired)
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
adapter_path = "your-username/uber-intern-lora"          # ‚Üê your repo name

tokenizer = AutoTokenizer.from_pretrained(adapter_path)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    load_in_4bit=True,              # optional: keep quantized
    trust_remote_code=True
)

model = PeftModel.from_pretrained(model, adapter_path)

# Simple inference wrapper
def query_intern(question: str, max_new_tokens=300):
    prompt = f"You are an enthusiastic Uber intern in 2024. Answer in a helpful, junior-but-smart tone.\\n\\nQuestion: question\\n\\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Clean up the prompt part
    return response.split("Answer:")[-1].strip()

# Example
print(query_intern("What's the main focus of Uber's mobility strategy in 2024?"))
"""

with open(os.path.join(config["output_dir"], "README.md"), "w") as f:
    f.write(model_card)

print("‚úÖ Model card saved")


LoRA adapters saved to: models/adapters
Tokenizer saved to: models/adapters
‚úÖ Model card saved


In [21]:
# Push to Hub (if enabled)
PUSH_TO_HUB = True
if PUSH_TO_HUB:
    # Create full repo name: username/model-name
    repo_name = f"{config['hf_username']}/{config['hub_model_name']}"

    print(f"\nüì§ Pushing to Hugging Face Hub: {repo_name}")
    print("   This will create a new model repository if it doesn't exist.")

    trainer.model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"‚úÖ Pushed to Hub: https://huggingface.co/{repo_name}")
else:
    print("\n‚ÑπÔ∏è push_to_hub=False, skipping Hub upload")
    print(f"   To push to Hub, set CONFIG['push_to_hub'] = True")
    print(f"   Model will be pushed as: {config['hf_username']}/{config['hub_model_name']}")


üì§ Pushing to Hugging Face Hub: UdayangaWeerakoon/uber-llama-3
   This will create a new model repository if it doesn't exist.


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   1%|1         |  557kB / 54.6MB            

README.md: 0.00B [00:00, ?B/s]

‚úÖ Pushed to Hub: https://huggingface.co/UdayangaWeerakoon/uber-llama-3
