Install required packages

In [1]:
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q peft transformers datasets tensorboard

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from datetime import datetime, timedelta
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tensorboard import program
from tqdm.notebook import tqdm as notebook_tqdm

# Disable wandb to prevent it from causing issues
os.environ["WANDB_DISABLED"] = "true"
HUGGINGFACETOKEN = os.environ.get("HUGGINGFACETOKEN")

In [3]:
def load_data_from_json(file_path):
    """Load data from JSON file"""
    print(f"Loading data from {file_path}...")
    with open(file_path, 'r') as f:
        data = json.load(f)
    return {
        'nl': [item['nl'] for item in data],
        'bash': [item['bash'] for item in data]
    }

train_data_dict = load_data_from_json("/kaggle/input/nlp2bash/nl2bash/train.json")
dev_data_dict = load_data_from_json("/kaggle/input/nlp2bash/nl2bash/dev.json")
test_data_dict = load_data_from_json("/kaggle/input/nlp2bash/nl2bash/test.json")

Loading data from /kaggle/input/nlp2bash/nl2bash/train.json...
Loading data from /kaggle/input/nlp2bash/nl2bash/dev.json...
Loading data from /kaggle/input/nlp2bash/nl2bash/test.json...


In [4]:
#Training callbacks
class ProgressCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.training_bar = None
        self.epoch_bar = None
        self.start_time = datetime.utcnow()

    def on_train_begin(self, args, state, control, **kwargs):
        self.total_steps = state.max_steps
        self.training_bar = notebook_tqdm(total=self.total_steps, desc="Training")
        print(f"Training started at {self.start_time} UTC")
        
        # Debug GPU memory at start
        print(f"GPU memory allocated at start: {torch.cuda.memory_allocated()/1e9:.2f} GB")
        print(f"GPU memory reserved at start: {torch.cuda.memory_reserved()/1e9:.2f} GB")

    def on_step_end(self, args, state, control, **kwargs):
        if self.training_bar is not None:
            self.training_bar.update(1)
            if len(state.log_history) > 0:
                loss = state.log_history[-1].get('loss', 0)
                self.training_bar.set_description(f"Training - Loss: {loss:.4f}")
            
            # Calculate ETA
            steps_done = state.global_step
            if steps_done > 0:
                time_elapsed = datetime.utcnow() - self.start_time
                time_per_step = time_elapsed.total_seconds() / steps_done
                steps_remaining = self.total_steps - steps_done
                eta_seconds = time_per_step * steps_remaining
                eta = datetime.utcnow() + timedelta(seconds=eta_seconds)
                self.training_bar.set_postfix(ETA=eta.strftime("%Y-%m-%d %H:%M:%S"))
                
                # Print debug info every 10 steps
                if steps_done % 10 == 0:
                    print(f"\nStep {steps_done}/{self.total_steps}")
                    print(f"GPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB allocated, {torch.cuda.memory_reserved()/1e9:.2f} GB reserved")

    def on_epoch_begin(self, args, state, control, **kwargs):
        if self.epoch_bar is not None:
            self.epoch_bar.close()
        self.epoch_bar = notebook_tqdm(total=args.num_train_epochs, desc="Epochs")
        self.epoch_bar.update(state.epoch)
        print(f"\nStarting epoch {state.epoch+1}/{args.num_train_epochs}")

    def on_train_end(self, args, state, control, **kwargs):
        end_time = datetime.utcnow()
        training_duration = end_time - self.start_time
        print(f"\nTraining completed at {end_time} UTC")
        print(f"Total training time: {training_duration}")
        if self.training_bar is not None:
            self.training_bar.close()
        if self.epoch_bar is not None:
            self.epoch_bar.close()

In [5]:
def prepare_model_for_training():
    """Prepare the model for training with proper quantization and gradient computation"""
    # Define quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-v0.1",
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload",
        torch_dtype=torch.bfloat16
    )

    print("Preparing model for k-bit training...")
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA
    print("Configuring LoRA...")
    # Reduce rank from 16 to 8 to save memory
    lora_config = LoraConfig(
        r=4,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Wrap model with LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    # Debug memory usage
    print(f"GPU memory after model prep: {torch.cuda.memory_allocated()/1e9:.2f} GB")
    return model

In [6]:
#Data formatting and dataset creation
def format_prompt(nl, bash=None):
    prompt = f"### Instruction: Convert the following English description to a Bash command:\n\n{nl}\n\n### Response:"
    if bash:
        return prompt + f" {bash}"
    return prompt

def create_datasets(train_data_dict, dev_data_dict, tokenizer):
    """Create datasets with proper instruction formatting for NL to Bash conversion"""
    print("Formatting and tokenizing datasets with instruction prompts...")
    
    def prepare_training_data(examples):
        # Format inputs and outputs with proper instruction templates
        formatted_prompts = []
        formatted_completions = []
        
        for nl, bash in zip(examples["nl"], examples["bash"]):
            # Input: just the instruction and NL
            formatted_prompts.append(format_prompt(nl))
            # Output: full prompt with response
            formatted_completions.append(format_prompt(nl, bash))
        
        # Tokenize inputs
        inputs = tokenizer(
            formatted_prompts, 
            padding="max_length", 
            truncation=True, 
            max_length=384,  # Reduced from 512 to save memory
            return_tensors=None
        )
        
        # Tokenize outputs (labels)
        with tokenizer.as_target_tokenizer():
            outputs = tokenizer(
                formatted_completions,
                padding="max_length",
                truncation=True,
                max_length=384,  # Reduced from 512 to save memory
                return_tensors=None
            )
        
        inputs["labels"] = outputs["input_ids"]
        return inputs

    # Print some examples for debugging
    print("\nExample formatted prompt:")
    print(format_prompt(train_data_dict["nl"][0]))
    print("\nExample formatted completion:")
    print(format_prompt(train_data_dict["nl"][0], train_data_dict["bash"][0]))
    
    # Create datasets with progress bars
    train_dataset = Dataset.from_dict(train_data_dict).map(
        prepare_training_data,
        batched=True,
        batch_size=100,  # Process in smaller batches to avoid memory issues
        remove_columns=['nl', 'bash'],
        desc="Processing training data"
    )
    
    dev_dataset = Dataset.from_dict(dev_data_dict).map(
        prepare_training_data,
        batched=True,
        batch_size=100,  # Process in smaller batches
        remove_columns=['nl', 'bash'],
        desc="Processing validation data"
    )
    
    # Print dataset stats
    print(f"Train dataset size: {len(train_dataset)} examples")
    print(f"Validation dataset size: {len(dev_dataset)} examples")
    
    return train_dataset, dev_dataset

In [7]:
#Metrics Calculation
def compute_command_metrics(eval_preds):
    """Calculate metrics for evaluation"""
    logits, labels = eval_preds
    predictions = logits.argmax(-1)
    
    # Decode predictions and labels
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 with tokenizer.pad_token_id before decoding
    labels = labels.copy()
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Extract just the bash commands
    pred_commands = []
    label_commands = []
    for p, l in zip(decoded_preds, decoded_labels):
        try:
            pred_cmd = p.split("### Response:")[-1].strip()
            label_cmd = l.split("### Response:")[-1].strip()
            pred_commands.append(pred_cmd)
            label_commands.append(label_cmd)
        except:
            print(f"Error extracting command from: {p}")
            pred_commands.append("")
            label_commands.append("")
    
    # Calculate exact match accuracy
    exact_matches = sum(1 for p, l in zip(pred_commands, label_commands) if p == l)
    accuracy = exact_matches / len(pred_commands) if pred_commands else 0
    
    return {"command_accuracy": accuracy}

In [8]:
#Bash command generation
def generate_bash_command(model, tokenizer, nl_text, max_length=100):
    """Generate a bash command from natural language"""
    prompt = format_prompt(nl_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            temperature=0.1,
            top_p=0.75,
            do_sample=True
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    bash_command = response.split("### Response:")[-1].strip()
    return bash_command

In [9]:
#Main training loop
def main():
    # Print start time and user
    start_time = datetime.utcnow()
    print(f"Starting training process at {start_time} UTC")
    print(f"User: {os.getenv('USER', 'Unknown')}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"GPU {i} Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

    # Output directory
    output_dir = "/kaggle/working/mistral-nl2bash"
    os.makedirs(output_dir, exist_ok=True)
    
    # Save Hugging Face token
    from huggingface_hub.hf_api import HfFolder
    HfFolder.save_token(HUGGINGFACETOKEN)

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Use full dataset instead of small subset
    print("Preparing datasets...")
    train_dataset, dev_dataset = create_datasets(train_data_dict, dev_data_dict, tokenizer)

    # Prepare model
    model = prepare_model_for_training()

    # Training arguments with increased epochs and learning rate
    print("Configuring training arguments...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="steps",
        eval_steps=200,  # Less frequent evaluation
        save_strategy="steps",
        save_steps=200,
        per_device_train_batch_size=2,  # Increased batch size
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,  # Adjusted accumulation steps
        learning_rate=2e-4,
        num_train_epochs=1,  # Reduced number of epochs
        warmup_ratio=0.1,
        weight_decay=0.01,
        fp16=True,
        logging_steps=10,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=1,
        gradient_checkpointing=True,
        optim="adamw_torch",
        max_grad_norm=0.3,
        report_to=["tensorboard"],
        run_name=f"nl2bash-{datetime.now().strftime('%Y%m%d-%H%M')}"
    )
    # Explicitly calculate total steps
    total_train_batch_size = (
        training_args.per_device_train_batch_size 
        * training_args.gradient_accumulation_steps
    )
    training_args.max_steps = int(len(train_dataset) / total_train_batch_size * training_args.num_train_epochs)
    print(f"Explicitly calculated total steps: {training_args.max_steps}")

    # Initialize TensorBoard
    if training_args.logging_dir:
        tb = program.TensorBoard()
        tb.configure(argv=[None, '--logdir', training_args.logging_dir])
        url = tb.launch()
        print(f"TensorBoard running at: {url}")

    # Initialize trainer with progress callback
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        # Don't use compute_metrics during training to speed things up
        # compute_metrics=compute_command_metrics,
        callbacks=[ProgressCallback()]
    )

    # Print training configuration
    print("\nTraining Configuration:")
    print(f"Total steps: {trainer.state.max_steps}")
    print(f"Batch size: {training_args.per_device_train_batch_size}")
    print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
    print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
    print(f"Number of epochs: {training_args.num_train_epochs}")
    print(f"Learning rate: {training_args.learning_rate}")
    print(f"Training on full dataset: {len(train_dataset)} examples")
    print(f"Validation set size: {len(dev_dataset)} examples")

    # Clear CUDA cache before training
    torch.cuda.empty_cache()
    print(f"GPU memory before training: {torch.cuda.memory_allocated()/1e9:.2f} GB")

    try:
        print("\nAbout to call trainer.train()...")
        trainer.train()
        print("trainer.train() completed")

        print("\nSaving final model...")
        final_output_dir = os.path.join(output_dir, "final_model")
        model.save_pretrained(final_output_dir)
        tokenizer.save_pretrained(final_output_dir)
        
        end_time = datetime.utcnow()
        print(f"\nTraining completed successfully at {end_time} UTC")
        print(f"Total training time: {end_time - start_time}")
        print(f"Model saved to {final_output_dir}")
        
        # Test the model on a few examples
        print("\nTesting model on sample examples:")
        test_samples = test_data_dict["nl"][:3]  # Just test 3 examples
        for i, sample in enumerate(test_samples):
            try:
                result = generate_bash_command(model, tokenizer, sample)
                print(f"Example {i+1}:")
                print(f"NL: {sample}")
                print(f"Generated bash: {result}")
                print("-" * 50)
            except Exception as e:
                print(f"Error generating bash for example {i+1}: {str(e)}")
        
    except Exception as e:
        print(f"\nAn error occurred during training: {str(e)}")
        import traceback
        traceback.print_exc()
        partial_output_dir = os.path.join(output_dir, "partial_model")
        try:
            model.save_pretrained(partial_output_dir)
            tokenizer.save_pretrained(partial_output_dir)
            print(f"Partial model saved to {partial_output_dir}")
        except:
            print("Failed to save partial model")
        raise e

if __name__ == "__main__":
    main()

Starting training process at 2025-03-11 14:17:46.489081 UTC
User: Unknown
CUDA available: True
Number of GPUs: 2
GPU 0: Tesla T4
GPU 0 Memory: 15.83 GB
GPU 1: Tesla T4
GPU 1 Memory: 15.83 GB
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Preparing datasets...
Formatting and tokenizing datasets with instruction prompts...

Example formatted prompt:
### Instruction: Convert the following English description to a Bash command:

Do a dry run of renaming file extension '.andnav' to '.tile' for all files/directories under current directory tree

### Response:

Example formatted completion:
### Instruction: Convert the following English description to a Bash command:

Do a dry run of renaming file extension '.andnav' to '.tile' for all files/directories under current directory tree

### Response: find . -name "*.andnav" | rename -vn "s/\.andnav$/.tile/"


Processing training data:   0%|          | 0/8090 [00:00<?, ? examples/s]



Processing validation data:   0%|          | 0/609 [00:00<?, ? examples/s]

Train dataset size: 8090 examples
Validation dataset size: 609 examples
Loading model...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Preparing model for k-bit training...
Configuring LoRA...
trainable params: 10,485,760 || all params: 7,252,217,856 || trainable%: 0.1446
GPU memory after model prep: 2.00 GB
Configuring training arguments...
Explicitly calculated total steps: 252



NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784



TensorBoard running at: http://localhost:6006/
Initializing trainer...

Training Configuration:
Total steps: 0
Batch size: 2
Gradient accumulation steps: 16
Effective batch size: 32
Number of epochs: 1
Learning rate: 0.0002
Training on full dataset: 8090 examples
Validation set size: 609 examples
GPU memory before training: 2.00 GB

About to call trainer.train()...


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Training started at 2025-03-11 14:19:21.057528 UTC
GPU memory allocated at start: 2.00 GB
GPU memory reserved at start: 2.00 GB


Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...



Starting epoch 1/1


Step,Training Loss,Validation Loss
200,0.6949,0.698744



Step 10/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 20/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 30/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 40/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 50/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 60/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 70/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 80/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 90/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 100/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 110/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 120/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 130/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 140/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 150/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 160/252
GPU memory: 2.06 GB allocated, 3.10 GB reserved

Step 170/252
GPU

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Training completed successfully at 2025-03-11 19:40:38.702327 UTC
Total training time: 5:22:52.213246
Model saved to /kaggle/working/mistral-nl2bash/final_model

Testing model on sample examples:


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Example 1:
NL: Add "prefix_" to every non-blank line in "a.txt"
Generated bash: _"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"_"
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Example 2:
NL: Add the .abc suffix to the names of all *.txt regular files in the current directory tree
Generated bash: ' {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}
--------------------------------------------------
Example 3:
NL: Add cron lists from "filename" to list of cron jobs, giving errors for any lines that cannot be parsed by crontab.
Generated bash: ab crontont crontont crontont crontont crontont crontontont crontontontontontontontontontontontontontontontontontontontontontontontontontontont
--------------------------------------------------
