# CLI Agent Model Training - Google Colab



In [None]:
# Mount Google Drive into the Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# Report whether a CUDA device is visible and, if so, which one.
import torch

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)  # first visible device
    print(f"GPU: {gpu_name}")
    print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
GPU: NVIDIA L4
CUDA version: 12.4


In [None]:
# Install required packages
!pip install -q torch transformers peft datasets accelerate bitsandbytes tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import libraries
import os
import json
import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import logging

# Create this notebook's logger and set the root logging level to INFO
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

print("Environment setup complete!")

Environment setup complete!


## Step 1: Upload and Load Training Data



In [None]:
# Upload training data file
from google.colab import files

TRAINING_DATA_FILENAME = 'processed_training_data.json'

print("Please upload your processed_training_data.json file:")
uploaded = files.upload()

# Fail fast when the expected file is missing: every later cell reads
# `training_data`, so continuing would only surface as a NameError further down.
if TRAINING_DATA_FILENAME not in uploaded:
    print("❌ Please upload processed_training_data.json")
    raise FileNotFoundError(
        f"Expected an upload named {TRAINING_DATA_FILENAME!r}, "
        f"got: {sorted(uploaded)}"
    )

print("✅ Training data uploaded successfully!")
# Explicit encoding so the read does not depend on the runtime's locale default.
with open(TRAINING_DATA_FILENAME, 'r', encoding='utf-8') as f:
    training_data = json.load(f)
print(f"📊 Loaded {len(training_data)} training examples")

Please upload your processed_training_data.json file:


Saving processed_training_data.json to processed_training_data.json
✅ Training data uploaded successfully!
📊 Loaded 68 training examples


In [None]:
# Explore the training data
print("📋 Data exploration:")
print(f"Total examples: {len(training_data)}")

# Print the first record, one field per line
print("\n📝 Sample training example:")
sample = training_data[0]
for label, key in (
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output", "output"),
    ("Topic", "topic"),
):
    print(f"{label}: {sample[key]}")

# Count how many records carry each topic
topics = {}
for record in training_data:
    topic_name = record['topic']
    if topic_name in topics:
        topics[topic_name] += 1
    else:
        topics[topic_name] = 1

print("\n🏷️ Topic distribution:")
for topic_name in sorted(topics):
    print(f"  {topic_name}: {topics[topic_name]}")

📋 Data exploration:
Total examples: 68

📝 Sample training example:
Instruction: Answer this command-line question: git-upload-pack: command not found
Input: 
Output: Use PATH does not return /usr/local/bin. (It returns /usr/bin:/bin:/usr/sbin:/sbin). But when I look at my .bashrc file, it contains: export PATH=/usr/local/bin:$PATH So now I'm confused. What do I need to do to avoid using the -u /usr/local/bin/git-upload-pack option every time? Why does ssh you@remotemachine echo \$PATH not return /usr/local/bin? Is this something to do with login and non-login shells? Please help! Thanks in advance. command. I've read this answer about eight-five times, but there's something I'm not understanding correctly: git-upload-pack: command not found, how to fix this correctly When I try to clone a repository on m...
Topic: git

🏷️ Topic distribution:
  bash: 6
  cd: 1
  chmod: 1
  cp: 1
  file: 28
  find: 2
  git: 8
  grep: 1
  head: 1
  ls: 1
  mkdir: 1
  mv: 1
  python: 7
  rm: 1
  system: 6


## Step 2: Load and Configure Model

In [None]:
# Model configuration
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"🔧 Loading model: {model_name}")
print(f"🖥️ Using device: {device}")

# Tokenizer options, collected in one place before loading
tokenizer_options = {
    "trust_remote_code": True,
    "padding_side": "right",
}
tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Tokenizer loaded. Vocab size: {len(tokenizer)}")

🔧 Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
🖥️ Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

✅ Tokenizer loaded. Vocab size: 32000


In [None]:
# Load base model
print("📦 Loading base model...")
base_model_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
    "use_cache": False,  # Disable for training
}
base_model = AutoModelForCausalLM.from_pretrained(model_name, **base_model_options)

# LoRA configuration: the projection layers that receive adapters
lora_target_modules = [
    "q_proj", "v_proj", "k_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,  # rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report the trainable parameter count
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

print("✅ Model loaded and LoRA applied!")

📦 Loading base model...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.5701
✅ Model loaded and LoRA applied!


## Step 3: Prepare Training Data

In [None]:
# Format data for instruction tuning
def format_training_data(data):
    """Render training records as Alpaca-style prompt strings.

    Args:
        data: Iterable of mappings with 'instruction' and 'output' keys and an
            optional 'input' key.

    Returns:
        A list of ``{"text": prompt}`` dicts, one per record, in input order.
        The "### Input:" section is included only when the record's 'input'
        contains non-whitespace text.

    Raises:
        KeyError: If a record lacks 'instruction' or 'output'.
    """
    formatted_data = []

    for item in data:
        # A missing or None 'input' is treated like an empty one instead of
        # raising KeyError / AttributeError.
        extra_input = item.get('input') or ''

        # Create instruction format similar to Alpaca
        if extra_input.strip():
            prompt = f"### Instruction:\n{item['instruction']}\n\n### Input:\n{extra_input}\n\n### Response:\n{item['output']}"
        else:
            prompt = f"### Instruction:\n{item['instruction']}\n\n### Response:\n{item['output']}"

        formatted_data.append({"text": prompt})

    return formatted_data

# Convert every training record into its prompt text
formatted_data = format_training_data(training_data)
print(f"📝 Formatted {len(formatted_data)} training examples")

# Preview the first 500 characters of the first prompt
print("\n📄 Sample formatted prompt:")
first_prompt_text = formatted_data[0]['text']
print(first_prompt_text[:500] + "...")

📝 Formatted 68 training examples

📄 Sample formatted prompt:
### Instruction:
Answer this command-line question: git-upload-pack: command not found

### Response:
Use PATH does not return /usr/local/bin. (It returns /usr/bin:/bin:/usr/sbin:/sbin). But when I look at my .bashrc file, it contains: export PATH=/usr/local/bin:$PATH So now I'm confused. What do I need to do to avoid using the -u /usr/local/bin/git-upload-pack option every time? Why does ssh you@remotemachine echo \$PATH not return /usr/local/bin? Is this something to do with login and non-logi...


In [None]:
# Create dataset and tokenize
dataset = Dataset.from_list(formatted_data)

def tokenize_function(examples):
    """Tokenize a batch of prompts and attach causal-LM labels.

    Each text in ``examples["text"]`` is truncated and padded to exactly 512
    tokens. ``labels`` is a copy of ``input_ids``, so padded positions are
    part of the labels produced here.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded["labels"] = encoded["input_ids"].clone()
    return encoded

print("🔄 Tokenizing dataset...")
# Drop the raw text column so only the tokenizer outputs remain
source_columns = dataset.column_names
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=source_columns,
)

print(f"✅ Tokenization complete! Dataset size: {len(tokenized_dataset)}")

🔄 Tokenizing dataset...


Map:   0%|          | 0/68 [00:00<?, ? examples/s]

✅ Tokenization complete! Dataset size: 68


## Step 4: Training Configuration and Training

In [None]:
# Training arguments
output_dir = "./cli_agent_adapters"

# NOTE: a positive max_steps takes precedence over num_train_epochs, so the
# run length is governed by max_steps below, not by the epoch count.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,  # Limit steps for quick training
    learning_rate=1e-4,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    # The string "none" disables experiment trackers. Passing None does not:
    # the recorded run still prompted for a W&B API key during trainer.train().
    report_to="none",
    # The Trainer cannot infer label names through the PEFT wrapper (see the
    # "No label_names provided" warning in the recorded run), so state them.
    label_names=["labels"],
    load_best_model_at_end=False,
    ddp_find_unused_parameters=False,
    dataloader_pin_memory=False,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    fp16=True,
)

# Data collator (mlm=False selects causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

print("⚙️ Training configuration ready!")
print(f"📊 Training steps: {training_args.max_steps}")
print(f"🎯 Learning rate: {training_args.learning_rate}")
print(f"📦 Batch size: {training_args.per_device_train_batch_size}")

⚙️ Training configuration ready!
📊 Training steps: 500
🎯 Learning rate: 0.0001
📦 Batch size: 4


In [None]:
# Create trainer
import inspect

# Recent transformers releases accept the tokenizer as `processing_class` and
# deprecate the older `tokenizer` keyword. The package install is unpinned, so
# choose whichever keyword the installed Trainer actually declares.
_trainer_init_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_keyword = (
    "processing_class" if "processing_class" in _trainer_init_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    **{_tokenizer_keyword: tokenizer},
)

print("🏋️ Trainer created successfully!")
print("🚀 Ready to start training...")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🏋️ Trainer created successfully!
🚀 Ready to start training...


In [None]:
# Announce the run, then hand control to the trainer
for banner_line in (
    "🔥 Starting training...",
    "⏱️ This will take approximately 20-40 minutes on T4 GPU",
):
    print(banner_line)

# Run the training loop; the returned summary is kept for later inspection
train_result = trainer.train()

print("🎉 Training completed!")

🔥 Starting training...
⏱️ This will take approximately 20-40 minutes on T4 GPU


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msusanta-baidya[0m ([33msid-deepmatrix-deepmatrix[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.424
20,2.1397
30,1.7553
40,1.4163
50,1.2127
60,1.0689
70,0.7545
80,0.524
90,0.2333
100,0.1326


🎉 Training completed!


In [None]:
# Save the trained adapter
print("💾 Saving trained adapter...")

# Write the model (through the trainer) and the tokenizer into the same directory
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model saved to {output_dir}")

# Show what ended up in the output directory
import os
print("\n📁 Saved files:")
saved_entries = os.listdir(output_dir)
for entry_name in saved_entries:
    print(f"  {entry_name}")

💾 Saving trained adapter...
✅ Model saved to ./cli_agent_adapters

📁 Saved files:
  checkpoint-400
  training_args.bin
  tokenizer_config.json
  adapter_model.safetensors
  checkpoint-500
  special_tokens_map.json
  README.md
  tokenizer.model
  chat_template.jinja
  tokenizer.json
  runs
  adapter_config.json


## Step 5: Test the Trained Model

In [None]:
# Test the trained model
def test_model(prompt):
    """Generate a sampled answer to a command-line question.

    Args:
        prompt: The question text; it is wrapped in the same
            "### Instruction / ### Response" template used for training.

    Returns:
        The newly generated text only (prompt excluded), stripped of
        surrounding whitespace.
    """
    # Format prompt
    formatted_prompt = f"### Instruction:\nAnswer this command-line question: {prompt}\n\n### Response:\n"

    # Tokenize
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate in eval mode so dropout (lora_dropout=0.1 in the LoRA config) is
    # off while sampling; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                # The base model was loaded with use_cache=False for training;
                # turn the KV cache back on for this generation call.
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
    finally:
        if was_training:
            model.train()

    # Decode only the tokens produced after the prompt. Slicing by token count
    # avoids relying on the decoded text reproducing the prompt character for
    # character.
    new_token_ids = outputs[0][prompt_token_count:]
    generated = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    return generated

# Sample questions used to spot-check the fine-tuned model
test_prompts = [
    "Create a new Git branch and switch to it",
    "List all Python files recursively",
    "Compress a folder with tar and gzip"
]

divider = "-" * 50

print("🧪 Testing the trained model:\n")
for question in test_prompts:
    print(f"❓ {question}")
    answer = test_model(question)
    print(f"💬 {answer}")
    print(divider)

🧪 Testing the trained model:

❓ Create a new Git branch and switch to it
💬 Use git log --oneline to see a condensed view of the log message. I want to create a new branch with a long commit message. command. I have been trying to use git log --oneline to see a condensed view of the log message. I want to create a new branch with a long commit message. I know I can use git switch -c branchname to switch to a specific branch, but how do I create a new branch with a long commit message? When I use git log --oneline to see a condensed view of the log message, it just shows the first 50 characters of the commit message. How can I make Git do this? I want to use this command often, and having
--------------------------------------------------
❓ List all Python files recursively
💬 Use `ls **/*.py` to list all Python files recursively. command. I want to run this command: python3 list.py `ls **/*.py` To run this script in a terminal/command line. I want to change the second command line argume

## Step 6: Download Trained Model

In [None]:
# Bundle the adapter output directory into a single zip file
import shutil

print("📦 Creating adapter archive...")

# make_archive appends the ".zip" extension to the base name
archive_path = shutil.make_archive('cli_agent_adapters', 'zip', output_dir)

print("✅ Archive created: cli_agent_adapters.zip")

# Report the archive size in MB
bytes_per_mb = 1024 * 1024
file_size = os.path.getsize('cli_agent_adapters.zip') / bytes_per_mb
print(f"📏 Archive size: {file_size:.1f} MB")

📦 Creating adapter archive...
✅ Archive created: cli_agent_adapters.zip
📏 Archive size: 157.7 MB


In [None]:
# Send the adapter archive to the browser and list the follow-up steps
from google.colab import files

print("⬇️ Downloading trained adapter...")
files.download('cli_agent_adapters.zip')

closing_messages = (
    "🎉 Download complete!",
    "\n📋 Next steps:",
    "1. Extract the zip file in your project directory",
    "2. Place contents in training/adapters/",
    "3. Run your CLI agent with: python agent.py 'your instruction'",
)
for message in closing_messages:
    print(message)

⬇️ Downloading trained adapter...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎉 Download complete!

📋 Next steps:
1. Extract the zip file in your project directory
2. Place contents in training/adapters/
3. Run your CLI agent with: python agent.py 'your instruction'
