In [1]:
!pip -q install pandas scikit-learn 

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
!pip install trl 

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting accelerate>=1.4.0 (from trl)
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting transformers>=4.56.1 (from trl)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting huggingface_hub>=0.21.0 (from accelerate>=1.4.0->trl)
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting safetensors>=0.4.3 (from accelerate>=1.4.0->trl)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting pyarrow>=21.0.0 (from datasets>=3.0.0->trl)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting tqdm>=4.66.3 (from datasets>=3.0.0->t

In [2]:
!pip install transformers peft datasets

Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting peft
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting accel

In [3]:
!pip -q install huggingface_hub

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
from huggingface_hub import hf_hub_download
from huggingface_hub import login

login(token="")

In [5]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset as HFDataset
import os
from typing import Dict, List
import json
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from transformers import LogitsProcessor

class SafeLogitsProcessor(LogitsProcessor):
    def __call__(self, input_ids, scores):
        scores = torch.nan_to_num(scores, nan=-1e9, posinf=1e9, neginf=-1e9)
        return scores

class KubernetesCommandDataset(Dataset):
    def __init__(self, questions: List[str], commands: List[str], tokenizer, max_length: int = 512):
        self.questions = questions
        self.commands = commands
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        command = self.commands[idx]
        
        # Format the input as instruction-following format
        prompt = f"### Instruction:\nTranslate the following natural language request to a Kubernetes kubectl command:\n\n### Input:\n{question}\n\n### Response:\n{command}"
        
        # Tokenize the prompt
        encoding = self.tokenizer(
            prompt,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()
        }

class KubernetesCommandTrainer:
    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
    def load_data(self, csv_path: str):
        """Load and preprocess the CSV data"""
        print("Loading data...")
        df = pd.read_csv(csv_path)
        
        questions = df['question'].tolist()
        commands = df['command'].tolist()
        
        valid_pairs = [(q, c) for q, c in zip(questions, commands) if pd.notna(q) and pd.notna(c)]
        questions, commands = zip(*valid_pairs)
        
        print(f"Loaded {len(questions)} training examples")
        return list(questions), list(commands)
    
    def setup_model_and_tokenizer(self):
        """Initialize the model and tokenizer"""
        print("Loading tokenizer and model...")
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )
        
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,  # Use half precision to save memory
            device_map="auto",
            trust_remote_code=True
        )
        
        self.model.gradient_checkpointing_enable()
        self.model.config.use_cache = False 
        
        print(f"Model loaded with {self.model.num_parameters():,} parameters")
        
    def prepare_datasets(self, questions: List[str], commands: List[str], test_size: float = 0.2):
        """Prepare train and validation datasets"""
        print("Preparing datasets...")
        
        train_q, val_q, train_c, val_c = train_test_split(
            questions, commands, test_size=test_size, random_state=42
        )
        
        train_dataset = KubernetesCommandDataset(train_q, train_c, self.tokenizer)
        val_dataset = KubernetesCommandDataset(val_q, val_c, self.tokenizer)
        
        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")
        
        return train_dataset, val_dataset
    
    def train(self, train_dataset, val_dataset, output_dir: str = "./k8s-command-model"):
        """Train the model"""
        print("Starting training...")
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,  # Small batch size to fit in memory
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,  # Simulate larger batch size
            warmup_steps=100,
            logging_steps=10,
            eval_strategy="steps",
            eval_steps=50,
            save_steps=100,
            learning_rate=2e-5,
            bf16=True,   # instead of fp16
            fp16=False,  # make sure this is off
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to=None,  # Disable wandb/tensorboard
            save_total_limit=2
        )
        
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )
        
        trainer.train()
        
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        
        print(f"Training completed! Model saved to {output_dir}")
        
    def inference(self, question: str, model_path: str = None, max_length: int = 100):
        """Generate kubectl command from natural language question"""
        if model_path and not hasattr(self, 'model'):
            self.load_trained_model(model_path)
        
        prompt = f"### Instruction:\nTranslate the following natural language request to a Kubernetes kubectl command:\n\n### Input:\n{question}\n\n### Response:\n"
        
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=inputs.shape[1] + max_length,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                logits_processor=[SafeLogitsProcessor()],
            )
        
        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        command = full_response.split("### Response:\n")[-1].strip()
        
        return command
    
    def load_trained_model(self, model_path: str):
        print(f"Loading trained model from {model_path}...")
    
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
    
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map="cuda:0" if torch.cuda.is_available() else "cpu"
        )
    
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
    
        self.model.config.use_cache = True  # inference wants cache
        print("Model loaded successfully!")

    
    def batch_inference(self, questions: List[str], model_path: str = None):
        """Run inference on multiple questions"""
        if model_path:
            self.load_trained_model(model_path)
        
        results = []
        for question in questions:
            command = self.inference(question)
            results.append({
                'question': question,
                'generated_command': command
            })
        
        return results

def main():
    trainer = KubernetesCommandTrainer()
    
    csv_file = "kubernetes_commands.csv"  # Update this path
    questions, commands = trainer.load_data(csv_file)
    
    trainer.setup_model_and_tokenizer()
    
    train_dataset, val_dataset = trainer.prepare_datasets(questions, commands)
    
    trainer.train(train_dataset, val_dataset)
    
    print("\nTraining completed successfully!")
    
    test_questions = [
        "View the supported API versions",
        "Display information about the control plane and cluster services",
        "Print the list of supported API resources"
    ]
    
    print("\n" + "="*50)
    print("Testing inference:")
    print("="*50)
    
    for question in test_questions:
        command = trainer.inference(question)
        print(f"\nQuestion: {question}")
        print(f"Generated Command: {command}")

def inference_only_example():
    """Example of using a pre-trained model for inference only"""
    trainer = KubernetesCommandTrainer()
    
    model_path = "./k8s-command-model"
    trainer.load_trained_model(model_path)
    
    test_questions = [
        "How do I check the cluster information?",
        "Show me the API versions",
        "List all supported resources",
        "Build manifests from current directory"
    ]
    
    print("Inference Results:")
    print("="*50)
    
    results = trainer.batch_inference(test_questions)
    for result in results:
        print(f"\nQ: {result['question']}")
        print(f"A: {result['generated_command']}")

if __name__ == "__main__":
    main()
    

Using device: cuda
Loading data...
Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Model loaded with 3,212,749,824 parameters
Preparing datasets...
Training samples: 10000
Validation samples: 2000
Starting training...


Step,Training Loss,Validation Loss
50,2.2,2.3
100,1.85,1.9
150,1.6,1.65


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training completed! Model saved to ./k8s-command-model

Training completed successfully!



In [23]:
# !python train.py

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.78it/s]
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

--- Generated ---

[{'role': 'user', 'content': 'User question: Kubernetes command to Print the list of supported namespaced resources\n'}, {'role': 'assistant', 'content': 'kubectl api-resources'}]


In [6]:
!nvidia-smi

Wed Sep 24 16:37:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A40                     On  |   00000000:53:00.0 Off |                    0 |
|  0%   45C    P0            209W /  300W |   27707MiB /  46068MiB |     86%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
!ls

'k8s lora 23-9 Claude (3).ipynb'   k8s-command-model   kubernetes_commands.csv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
!ls

 Untitled.ipynb  'k8s lora 23-9 Claude.ipynb'   kubernetes_commands.csv
 inf.py		  k8s-command-model


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
PEFT_PATH = "/workspace/k8s-command-model/checkpoint-150"

def load_model():
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )

    try:
        _ = torch.load(PEFT_PATH, map_location="cpu")
    except Exception as e:
        pass

    return tokenizer, model

def run_inference(prompt: str, max_new_tokens: int = 128):
    tokenizer, model = load_model()

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="cuda:0",
    )

    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    return outputs[0]["generated_text"]

if __name__ == "__main__":
    prompt = """### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
List all pods in the default namespace

### Response:
"""
    result = run_inference(prompt)
    print(result)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
List all pods in the default namespace

### Response:
```bash
kubectl get pods --namespace=default
```



In [11]:
prompt = """### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
Display detailed information about a specific pod

### Response:
"""
result = run_inference(prompt)
print(result)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
Display detailed information about a specific pod

### Response:
```bash
kubectl get pod <pod_name> -o wide
```


In [12]:
prompt = """### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
View logs from the first container of a job named "batch-processing-job"

### Response:
"""
result = run_inference(prompt)
print(result)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


### Instruction:
Translate the following natural language request to a Kubernetes kubectl command:

### Input:
View logs from the first container of a job named "batch-processing-job"

### Response:
```
kubectl logs job/batch-processing-job -c 0
```



In [16]:
from huggingface_hub import HfApi, create_repo

# --- config ---
repo_id       = "tarun122/k8s-lora-final-2"        
folder_path   = "/workspace/k8s-command-model"  
repo_type     = "model"                  
private       = False                     
commit_msg    = "Upload folder"
path_in_repo  = ""                       

create_repo(repo_id, repo_type=repo_type, private=private, exist_ok=True)


api = HfApi()
api.upload_folder(
    repo_id=repo_id,
    repo_type=repo_type,
    folder_path=folder_path,
    path_in_repo=path_in_repo,        
    commit_message=commit_msg,
    # token=None                      
)
print("Done!")


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Done!


In [1]:
!ls -lstrh /workspace/k8s-command-model 

total 6.1G
2.9M drwxrwxrwx 2 root root 2.9M Sep 24 16:33 checkpoint-100
 512 -rw-rw-rw- 1 root root  184 Sep 24 16:36 generation_config.json
1.0K -rw-rw-rw- 1 root root  867 Sep 24 16:36 config.json
2.9M drwxrwxrwx 2 root root 2.9M Sep 24 16:36 checkpoint-150
4.7G -rw-rw-rw- 1 root root 4.7G Sep 24 16:37 model-00001-of-00002.safetensors
1.4G -rw-rw-rw- 1 root root 1.4G Sep 24 16:37 model-00002-of-00002.safetensors
6.0K -rw-rw-rw- 1 root root 5.7K Sep 24 16:37 training_args.bin
 50K -rw-rw-rw- 1 root root  50K Sep 24 16:37 tokenizer_config.json
 17M -rw-rw-rw- 1 root root  17M Sep 24 16:37 tokenizer.json
 512 -rw-rw-rw- 1 root root  325 Sep 24 16:37 special_tokens_map.json
 21K -rw-rw-rw- 1 root root  21K Sep 24 16:37 model.safetensors.index.json
4.0K -rw-rw-rw- 1 root root 3.8K Sep 24 16:37 chat_template.jinja


# Train 2

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset as HFDataset
import os
from typing import Dict, List
import json
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

class KubernetesCommandDataset(Dataset):
    def __init__(self, questions: List[str], commands: List[str], tokenizer, max_length: int = 512):
        self.questions = questions
        self.commands = commands
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        question = self.questions[idx]
        command = self.commands[idx]
        
        # Format the input as instruction-following format with clear separators
        prompt = f"<|begin_of_text|>### Instruction:\nTranslate the following natural language request to a Kubernetes kubectl command:\n\n### Input:\n{question}\n\n### Response:\n{command}<|end_of_text|>"
        
        # Tokenize the prompt
        encoding = self.tokenizer(
            prompt,
            truncation=True,
            padding=False,  # Don't pad here, let the data collator handle it
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        # Create labels - mask the instruction part, only train on the response
        labels = input_ids.clone()
        
        # Find the start of the response section
        response_start_text = "### Response:\n"
        response_tokens = self.tokenizer.encode(response_start_text, add_special_tokens=False)
        
        # Find where the response starts in the input_ids
        for i in range(len(input_ids) - len(response_tokens) + 1):
            if input_ids[i:i+len(response_tokens)].tolist() == response_tokens:
                # Mask everything before the response (set to -100)
                labels[:i+len(response_tokens)] = -100
                break
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

class KubernetesCommandTrainer:
    def __init__(self, model_name: str = "meta-llama/Llama-3.1-8B-Instruct"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
    def load_data(self, csv_path: str):
        """Load and preprocess the CSV data"""
        print("Loading data...")
        df = pd.read_csv(csv_path)
        
        # Extract questions and commands
        questions = df['question'].tolist()
        commands = df['command'].tolist()
        
        # Remove any rows with NaN values
        valid_pairs = [(q, c) for q, c in zip(questions, commands) if pd.notna(q) and pd.notna(c)]
        questions, commands = zip(*valid_pairs)
        
        print(f"Loaded {len(questions)} training examples")
        return list(questions), list(commands)
    
    def setup_model_and_tokenizer(self):
        """Initialize the model and tokenizer"""
        print("Loading tokenizer and model...")
        
        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )
        
        # Set proper padding token (use a different token than EOS)
        if self.tokenizer.pad_token is None:
            # Try to use existing special tokens first
            if hasattr(self.tokenizer, 'unk_token') and self.tokenizer.unk_token:
                self.tokenizer.pad_token = self.tokenizer.unk_token
            else:
                # Add a new padding token
                self.tokenizer.add_special_tokens({'pad_token': '<PAD>'})
                
        # Load model
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,  # Use half precision to save memory
            device_map="auto",
            trust_remote_code=True
        )
        
        # Resize embeddings if we added new tokens
        if self.tokenizer.pad_token == '<PAD>':
            self.model.resize_token_embeddings(len(self.tokenizer))
        
        # Enable gradient checkpointing to save memory
        self.model.gradient_checkpointing_enable()
        self.model.config.use_cache = False 
        
        
        print(f"Model loaded with {self.model.num_parameters():,} parameters")
        print(f"Vocabulary size: {len(self.tokenizer)}")
        print(f"Pad token: {self.tokenizer.pad_token}")
        print(f"EOS token: {self.tokenizer.eos_token}")
        
    def prepare_datasets(self, questions: List[str], commands: List[str], test_size: float = 0.2):
        """Prepare train and validation datasets"""
        print("Preparing datasets...")
        
        # Split data
        train_q, val_q, train_c, val_c = train_test_split(
            questions, commands, test_size=test_size, random_state=42
        )
        
        # Create datasets
        train_dataset = KubernetesCommandDataset(train_q, train_c, self.tokenizer)
        val_dataset = KubernetesCommandDataset(val_q, val_c, self.tokenizer)
        
        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(val_dataset)}")
        
        return train_dataset, val_dataset
    
    def train(self, train_dataset, val_dataset, output_dir: str = "./k8s-command-model"):
        """Train the model"""
        print("Starting training...")
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,  # Small batch size to fit in memory
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,  # Simulate larger batch size
            warmup_steps=100,
            logging_steps=10,
            eval_strategy="steps",
            eval_steps=50,
            save_steps=100,
            learning_rate=2e-5,
            fp16=True,  # Use mixed precision training
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to=None,  # Disable wandb/tensorboard
            save_total_limit=2,
        )
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )
        
        # Train the model
        trainer.train()
        
        # Save the final model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        
        print(f"Training completed! Model saved to {output_dir}")
    def train_memory_efficient(self, train_dataset, val_dataset, output_dir: str = "./k8s-command-model"):
        """Memory-efficient training with proper FP16 handling"""
        print("Starting memory-efficient training...")
        
        # Training arguments with proper FP16 setup
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,  # Very small batch size
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=8,  # Simulate larger batches
            warmup_steps=50,
            logging_steps=5,
            eval_strategy="steps",
            eval_steps=25,
            save_steps=50,
            learning_rate=1e-5,
            fp16=True,  # Keep FP16 for memory savings
            fp16_full_eval=True,
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to=None,
            save_total_limit=2,
            # Remove max_grad_norm to avoid FP16 conflicts
            # max_grad_norm=1.0,  # Commented out
            warmup_ratio=0.1,
            optim="adamw_torch",
            adam_epsilon=1e-6,
            weight_decay=0.01,
            lr_scheduler_type="cosine",
        )
        
        # Custom data collator (same as before)
        class CustomDataCollator:
            def __init__(self, tokenizer):
                self.tokenizer = tokenizer
            
            def __call__(self, features):
                batch = {}
                max_length = max([len(f['input_ids']) for f in features])
                
                batch['input_ids'] = []
                batch['attention_mask'] = []
                batch['labels'] = []
                
                for feature in features:
                    input_ids = feature['input_ids']
                    attention_mask = feature['attention_mask']
                    labels = feature['labels']
                    
                    # Pad sequences
                    padding_length = max_length - len(input_ids)
                    
                    # Pad input_ids and attention_mask
                    padded_input_ids = torch.cat([
                        input_ids, 
                        torch.full((padding_length,), self.tokenizer.pad_token_id)
                    ])
                    padded_attention_mask = torch.cat([
                        attention_mask,
                        torch.zeros(padding_length)
                    ])
                    
                    # Pad labels (use -100 for padded positions)
                    padded_labels = torch.cat([
                        labels,
                        torch.full((padding_length,), -100)
                    ])
                    
                    batch['input_ids'].append(padded_input_ids)
                    batch['attention_mask'].append(padded_attention_mask)
                    batch['labels'].append(padded_labels)
                
                # Stack tensors
                batch['input_ids'] = torch.stack(batch['input_ids'])
                batch['attention_mask'] = torch.stack(batch['attention_mask'])
                batch['labels'] = torch.stack(batch['labels'])
                
                return batch
        
        data_collator = CustomDataCollator(self.tokenizer)
        
        # Custom Trainer class to handle gradient clipping manually
        class CustomTrainer(Trainer):
            def training_step(self, model, inputs):
                model.train()
                inputs = self._prepare_inputs(inputs)
                
                with self.compute_loss_context_manager():
                    loss = self.compute_loss(model, inputs)
                
                if self.args.n_gpu > 1:
                    loss = loss.mean()
                
                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps
                
                if self.use_apex:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    self.accelerator.backward(loss)
                
                return loss.detach() / self.args.gradient_accumulation_steps
        
        # Initialize trainer
        trainer = CustomTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
        )
        
        # Train the model
        trainer.train()
        
        # Save the final model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        
        print(f"Training completed! Model saved to {output_dir}")
        
    def inference(self, question: str, model_path: str = None, max_length: int = 100):
        """Generate kubectl command from natural language question"""
        if model_path and not hasattr(self, 'model'):
            self.load_trained_model(model_path)
        
        prompt = f"### Instruction:\nTranslate the following natural language request to a Kubernetes kubectl command:\n\n### Input:\n{question}\n\n### Response:\n"
        
        # Tokenize input
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
        
        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=inputs.shape[1] + max_length,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )
        
        # Decode and extract the command
        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        command = full_response.split("### Response:\n")[-1].strip()
        
        return command
    
    def load_trained_model(self, model_path: str):
        """Load a previously trained model"""
        print(f"Loading trained model from {model_path}...")
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        print("Model loaded successfully!")
    
    def batch_inference(self, questions: List[str], model_path: str = None):
        """Run inference on multiple questions"""
        if model_path:
            self.load_trained_model(model_path)
        
        results = []
        for question in questions:
            command = self.inference(question)
            results.append({
                'question': question,
                'generated_command': command
            })
        
        return results

def main():
    # Initialize trainer
    trainer = KubernetesCommandTrainer()
    
    # Load data (replace with your CSV file path)
    csv_file = "kubernetes_commands.csv"  # Update this path
    questions, commands = trainer.load_data(csv_file)
    
    # Setup model and tokenizer
    trainer.setup_model_and_tokenizer()
    
    # Prepare datasets
    train_dataset, val_dataset = trainer.prepare_datasets(questions, commands)
    
    # Train the model
    trainer.train_memory_efficient(train_dataset, val_dataset)
    
    print("\nTraining completed successfully!")
    
    # Test inference
    test_questions = [
        "View the supported API versions",
        "Display information about the control plane and cluster services",
        "Print the list of supported API resources"
    ]
    
    print("\n" + "="*50)
    print("Testing inference:")
    print("="*50)
    
    for question in test_questions:
        command = trainer.inference(question)
        print(f"\nQuestion: {question}")
        print(f"Generated Command: {command}")

def inference_only_example():
    """Example of using a pre-trained model for inference only"""
    trainer = KubernetesCommandTrainer()
    
    # Load the trained model (update path as needed)
    model_path = "./k8s-command-model"
    trainer.load_trained_model(model_path)
    
    # Test questions
    test_questions = [
        "How do I check the cluster information?",
        "Show me the API versions",
        "List all supported resources",
        "Build manifests from current directory"
    ]
    
    print("Inference Results:")
    print("="*50)
    
    results = trainer.batch_inference(test_questions)
    for result in results:
        print(f"\nQ: {result['question']}")
        print(f"A: {result['generated_command']}")

if __name__ == "__main__":
    # For training
    main()
    
    # Uncomment below for inference only
    # inference_only_example()

Using device: cuda
Loading data...
Loading tokenizer and model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded with 8,030,269,440 parameters
Vocabulary size: 128257
Pad token: <PAD>
EOS token: <|eot_id|>
Preparing datasets...
Training samples: 399
Validation samples: 100
Starting memory-efficient training...


TypeError: KubernetesCommandTrainer.train_memory_efficient.<locals>.CustomTrainer.training_step() takes 3 positional arguments but 4 were given