In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/csvbashcommands/formatted_command_dataset.csv


In [23]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

model.config.use_cache = False

# Load your dataset
dataset = load_dataset('csv', data_files='/kaggle/input/csvbashcommands/formatted_command_dataset.csv')

In [24]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [25]:
def tokenize_function(examples):
    tokenized = tokenizer(examples['description'], padding='max_length', truncation=True, max_length=128)
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

In [26]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [28]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

In [29]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Use CustomTrainer instead of Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'] if 'test' in tokenized_dataset else None,
)

In [30]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=93, training_loss=3.4743970645371305, metrics={'train_runtime': 40.2776, 'train_samples_per_second': 35.826, 'train_steps_per_second': 2.309, 'total_flos': 94230460366848.0, 'train_loss': 3.4743970645371305, 'epoch': 3.0})

In [31]:
model.save_pretrained("./fine_tuned_gpt_neo")
tokenizer.save_pretrained("./fine_tuned_gpt_neo")

('./fine_tuned_gpt_neo/tokenizer_config.json',
 './fine_tuned_gpt_neo/special_tokens_map.json',
 './fine_tuned_gpt_neo/vocab.json',
 './fine_tuned_gpt_neo/merges.txt',
 './fine_tuned_gpt_neo/added_tokens.json')

In [37]:
model_path = "./fine_tuned_gpt_neo"
model = GPTNeoForCausalLM.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [38]:
model.eval()

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [45]:
import torch
def generate_bash_command(prompt, max_length=50):
    full_prompt = f"Human: {prompt}\nBash command:"
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt")
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    bash_command = response.split("Bash command:")[-1].strip()
    return bash_command

In [None]:
print("Enter a natural language prompt to get a bash command. Type 'quit' to exit.")
while True:
    user_input = input("\nHuman: ")
    if user_input.lower() == 'quit':
        break
    
    bash_command = generate_bash_command(user_input)
    print(f"Bash command: {bash_command}")

Enter a natural language prompt to get a bash command. Type 'quit' to exit.



Human:  find me where 'text.txt' is on my computer


Bash command: --find <filename>

When I run the --with-find, I get no output:
[1] -find -v <file



Human:  open up the app vscode


Bash command: bash -c 'export PATH /path/to/the/script/file_name'



Human:  create a new folder on the desktop


Bash command: chmod 127.0.2.1 /mnt
