In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/csvbashcommands/formatted_command_dataset.csv


In [23]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

model.config.use_cache = False

# Load your dataset
dataset = load_dataset('csv', data_files='/kaggle/input/csvbashcommands/formatted_command_dataset.csv')

In [24]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [25]:
def tokenize_function(examples):
    tokenized = tokenizer(examples['description'], padding='max_length', truncation=True, max_length=128)
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

In [26]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [28]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

In [29]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Use CustomTrainer instead of Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'] if 'test' in tokenized_dataset else None,
)

In [30]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=93, training_loss=3.4743970645371305, metrics={'train_runtime': 40.2776, 'train_samples_per_second': 35.826, 'train_steps_per_second': 2.309, 'total_flos': 94230460366848.0, 'train_loss': 3.4743970645371305, 'epoch': 3.0})

In [31]:
model.save_pretrained("./fine_tuned_gpt_neo")
tokenizer.save_pretrained("./fine_tuned_gpt_neo")

('./fine_tuned_gpt_neo/tokenizer_config.json',
 './fine_tuned_gpt_neo/special_tokens_map.json',
 './fine_tuned_gpt_neo/vocab.json',
 './fine_tuned_gpt_neo/merges.txt',
 './fine_tuned_gpt_neo/added_tokens.json')

In [32]:
!pip install huggingface_hub

  pid, fd = os.forkpty()




In [33]:
from huggingface_hub import HfApi, Repository
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
model = GPTNeoForCausalLM.from_pretrained("./fine_tuned_gpt_neo")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt_neo")

In [35]:
repo_name = "sambhav11/gpt-neo-bash-commands"  # Change this to your desired name
api = HfApi()
repo_url = api.create_repo(repo_name, exist_ok=True)

In [36]:
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sambhav11/gpt-neo-bash-commands/commit/dbdb531767918ab63b3465475f6597e473d05ff9', commit_message='Upload tokenizer', commit_description='', oid='dbdb531767918ab63b3465475f6597e473d05ff9', pr_url=None, pr_revision=None, pr_num=None)