# step 1 : data collection #

In [26]:
from datasets import load_dataset

# Map each split name to its plain-text source file, then load both splits
# through the 'text' builder (examples expose a 'text' field).
data_files = {
    'train': 'C:/Users/katta/VS CODE/LLMmodel/LLM/model-folder/greetings.txt',
    'test': 'C:/Users/katta/VS CODE/LLMmodel/LLM/greetings_test.txt',
}
dataset = load_dataset('text', data_files=data_files)


Generating train split: 50 examples [00:00, 1545.54 examples/s]
Generating test split: 5 examples [00:00, 500.04 examples/s]


In [27]:
# Optional sanity check: show the first example of the training split.
first_train_example = dataset['train'][0]
print(first_train_example)

{'text': 'Hello, how are you today?'}


# step 2 : load model #

In [3]:
# NOTE: this cell can be skipped when a previously saved model is already
# available on this machine and is loaded from disk instead.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Both objects are created from the same checkpoint name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [28]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Size the model's token embeddings to the tokenizer's length (experimental
# step); the value returned by the call is what the cell displays.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(50257, 768)

# step 3 : tokenizing data #

In [31]:
def tokenize_function(examples, max_length=None):
    """Tokenize the ``'text'`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the strings to
            tokenize (this is what ``dataset.map(..., batched=True)`` passes).
        max_length: Optional cap on the padded/truncated sequence length.
            The default ``None`` forwards no explicit limit, so the tokenizer
            picks the length exactly as it did before this parameter existed.
            Short texts can be tokenized far more cheaply with a small cap,
            e.g. ``dataset.map(tokenize_function, batched=True,
            fn_kwargs={"max_length": 32})``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, using
        ``padding="max_length"`` and ``truncation=True``.
    """
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [32]:
# Apply tokenize_function to the dataset in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 283.52 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 156.55 examples/s]


In [33]:
# Expose only the two model-input columns, formatted as torch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

In [34]:
# Inspect the first tokenized example of the training split.
first_tokenized_example = tokenized_datasets['train'][0]
print(first_tokenized_example)

{'input_ids': tensor([15496,    11,   703,  ..., 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


# step 3b : defining training arguments #

In [35]:
from transformers import Trainer , TrainingArguments

# Training configuration, grouped by purpose.
training_args = TrainingArguments(
    # Output locations.
    output_dir="./results",            # model predictions and checkpoints
    logging_dir='./logs',              # log files
    save_total_limit=2,                # cap on the number of checkpoints kept
    # Schedule.
    num_train_epochs=7,                # more epochs cost more compute; raise once the setup is finalised
    eval_strategy="epoch",             # evaluate at the end of each epoch
    # Per-device batch sizes.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimiser settings.
    learning_rate=5e-5,
    weight_decay=0.01,                 # regularisation against overfitting
)



# step 4 : setting up the trainer #

In [36]:
from transformers import DataCollatorForLanguageModeling

# Collator for causal language modelling (e.g., GPT-2): mlm=False disables
# the masked-language-modelling mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Pick the two splits up front, then wire everything into the Trainer.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    args=training_args,            # training arguments defined earlier
    model=model,                   # the model being fine-tuned
    data_collator=data_collator,   # batches examples for the model
    train_dataset=train_split,     # training data
    eval_dataset=eval_split,       # evaluation data
)

# step 5 : fine tuning the model #

In [1]:
# Launch fine-tuning. `trainer` must already exist in the kernel, i.e. the
# cell that builds it has to be run first. The value returned by train() is
# left as the last expression so it is shown as the cell output.
training_result = trainer.train()
training_result

NameError: name 'trainer' is not defined

# step 6 : saving the model #

In [15]:
# Persist the model and the tokenizer side by side in the same folder.
save_dir = "./model-folder"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./model-folder\\tokenizer_config.json',
 './model-folder\\special_tokens_map.json',
 './model-folder\\vocab.json',
 './model-folder\\merges.txt',
 './model-folder\\added_tokens.json',
 './model-folder\\tokenizer.json')

# run this instead of step 2 to load the model previously saved in model-folder (e.g. to continue fine-tuning it) #

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Local directory that holds the previously saved model and tokenizer.
model_directory = "C:\\Users\\katta\\VS CODE\\LLMmodel\\LLM\\model-folder"

# Restore both objects from that directory.
model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)