In [2]:
from datasets import load_dataset

dataset = load_dataset("aelhalili/bash-commands-dataset")
print(dataset)
print(dataset['train'][0]) # check first record
print(dataset['train'][839]) # check last record


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 840
    })
})
{'prompt': 'Move a file called x from the Desktop to the Downloads directory', 'response': 'mv ~/Desktop/x ~/Downloads/'}
{'prompt': 'Check the status of the Apache service', 'response': 'sudo systemctl status apache2'}


In [3]:
#Preprocessing for Auto-completion

def make_autocomplete_pairs(example):
    text=example["prompt"]
    words=text.split()
    pairs=[]
    for i in range(1,len(words)):
        partial=" ".join(words[:i])
        pairs.append({"input_text":partial,"target_text":text})

    return pairs
# Expand dataset 
from itertools import chain 
def expand_dataset(dataset): 
    expanded = list(chain.from_iterable(make_autocomplete_pairs(ex) for ex in dataset)) 
    return expanded 
train_data = expand_dataset(dataset["train"]) 
print(train_data[:2]) 

[{'input_text': 'Move', 'target_text': 'Move a file called x from the Desktop to the Downloads directory'}, {'input_text': 'Move a', 'target_text': 'Move a file called x from the Desktop to the Downloads directory'}]


In [4]:
from datasets import Dataset

ds = Dataset.from_list(train_data).train_test_split(test_size=0.1, seed=42)


In [5]:
from transformers import AutoTokenizer

model_name = "facebook/bart-base"   # you can try bart-large later
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_src_len = 64
max_tgt_len = 128

def preprocess(batch):
    inputs = tokenizer(
        batch["input_text"],
        max_length=max_src_len,
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_text"],
            max_length=max_tgt_len,
            truncation=True
        )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)


Map: 100%|██████████| 4678/4678 [00:00<00:00, 17801.01 examples/s]
Map: 100%|██████████| 520/520 [00:00<00:00, 34950.29 examples/s]


In [6]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)


model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

args = TrainingArguments(
    output_dir="bart-prompt-autocomplete",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    eval_strategy="steps",   # ✅ required since you use eval_steps
    eval_steps=500,
    save_steps=500,
    logging_steps=100,    # ✅ available in v4.55.2
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.5948,0.415658
1000,0.3774,0.29175
1500,0.2746,0.235346




TrainOutput(global_step=1755, training_loss=0.5296675739125308, metrics={'train_runtime': 207.9591, 'train_samples_per_second': 67.484, 'train_steps_per_second': 8.439, 'total_flos': 104354251223040.0, 'train_loss': 0.5296675739125308, 'epoch': 3.0})

In [7]:
trainer.save_model("bart_autocomplete_model")
tokenizer.save_pretrained("bart_autocomplete_model")


('bart_autocomplete_model\\tokenizer_config.json',
 'bart_autocomplete_model\\special_tokens_map.json',
 'bart_autocomplete_model\\vocab.json',
 'bart_autocomplete_model\\merges.txt',
 'bart_autocomplete_model\\added_tokens.json',
 'bart_autocomplete_model\\tokenizer.json')

In [9]:
text = "list all"   # partial input

inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_length=64,
    num_beams=5,
    early_stopping=True
)

print("Prediction:", tokenizer.decode(outputs[0], skip_special_tokens=True))


Prediction: list all files in the current directory


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load trained model
tokenizer = AutoTokenizer.from_pretrained("bart_autocomplete_model")
model = AutoModelForSeq2SeqLM.from_pretrained("bart_autocomplete_model").to("cuda" if torch.cuda.is_available() else "cpu")

def get_autocomplete_suggestion(prefix: str) -> str:
    inputs = tokenizer(prefix, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=64,
        num_beams=5,
        early_stopping=True
    )
    suggestion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return suggestion
