
Step	Purpose	                        Key Function/Module

1	    Install dependencies	        pip
2	    Load environment variables	    dotenv
3	    Authenticate	                huggingface_hub.login
4	    Load model/tokenizer	       transformers.AutoModelForCausalLM, AutoTokenizer
5	    Test generation	                model.generate
6	    Load dataset	                datasets.load_dataset
7	    Tokenize data	                dataset.map
8	    Data collator	                DataCollatorForLanguageModeling
9	    Apply LoRA	                    peft.get_peft_model, LoraConfig
10	    Training arguments	            TrainingArguments
11	    Trainer setup/train	            Trainer
12	    Save model/tokenizer	        trainer.save_model, tokenizer.save_pretrained
13	    Reload and use	                AutoModelForCausalLM.from_pretrained

### 1. Install Required Libraries

In [41]:
#1. Install Required Libraries

!pip install datasets pandas torch transformers[torch] python-dotenv peft



### 2. Environment Setup and Authentication

In [42]:
# 2. Environment Setup and Authentication

import os
from dotenv import load_dotenv
load_dotenv()  # Load the .env file
hf_token = os.getenv("HF_TOKEN")

### 3. Login to Hugging Face Hub

In [43]:
# 3. Login to Hugging Face Hub

from huggingface_hub import login
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


### 4. Load Pretrained Model and Tokenizer

In [44]:
# 4. Load Pretrained Model and Tokenizer

from transformers import AutoModelForCausalLM, AutoTokenizer
import os

model_name = "distilgpt2"
save_dir = os.path.join("HFModels", model_name)
os.makedirs(save_dir, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(model_name)

# Save model and tokenizer locally
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### 5. Test Model Generation

In [55]:
# 5. Test Model Generation using locally saved model

from transformers import AutoModelForCausalLM, AutoTokenizer
import os

model_name = "distilgpt2"
save_dir = os.path.join("HFModels", model_name)

# Load model and tokenizer from local directory
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForCausalLM.from_pretrained(save_dir)

text = "ஒரு நாள் "
inputs = tokenizer(text, return_tensors="pt")
# Generate story
output = model.generate(inputs.input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ஒரு நாள் நாள் நாள் நாள் நாள் நாள் நாள் நாள் நாள


### 6. Load and Prepare Dataset

In [46]:
# 6. Load and Prepare Dataset

from datasets import load_dataset, Dataset, load_from_disk
import os

dataset_name = "tniranjan/aitamilnadu_tamil_stories_no_instruct"
local_dataset_dir = os.path.join("HFDataset", "aitamilnadu_tamil_stories_no_instruct")
os.makedirs(local_dataset_dir, exist_ok=True)
local_dataset_file = os.path.join(local_dataset_dir, "train-1000.arrow")

# Check if local dataset file exists
if os.path.exists(local_dataset_file):
    print(f"Loading dataset from local file: {local_dataset_file}")
    raw_data = load_from_disk(local_dataset_dir)
else:
    print(f"Downloading dataset: {dataset_name}")
    raw_data = load_dataset(dataset_name, split="train[:1000]")
    raw_data.save_to_disk(local_dataset_dir)

# Split the dataset
data = raw_data.train_test_split(train_size=0.95)


Downloading dataset: tniranjan/aitamilnadu_tamil_stories_no_instruct


Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 47576.58 examples/s]


### 7. Tokenize Dataset

In [47]:
# 7. Tokenize Dataset

tokenizer.pad_token = tokenizer.eos_token

def preprocess_batch(batch):
    return tokenizer(batch["text"], truncation=True, padding=True, max_length=200)

tokenized_dataset = data.map(
    preprocess_batch,
    batched=True,
    batch_size=4,
    remove_columns=data["train"].column_names
)
print(tokenized_dataset)

Map: 100%|██████████| 950/950 [00:07<00:00, 127.45 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 108.26 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 950
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 50
    })
})





### 8. Data Collator for Language Modeling


In [48]:
# 8. Data Collator for Language Modeling

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='HFModels\distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, mask_replace_prob=0.8, random_replace_prob=0.1, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt', seed=None)

### 9. Apply LoRA (Parameter-Efficient Fine-Tuning, Optional)

In [49]:
# 9. Apply LoRA (Parameter-Efficient Fine-Tuning, Optional)

from peft import get_peft_model, LoraConfig, TaskType
# Define LoRA Configuration
lora_config = LoraConfig(
                            r=8,  # Rank of LoRA matrices (adjust for speed/memory tradeoff)
                            lora_alpha=32,  # Scaling factor
                            lora_dropout=0.1,  # Dropout for stability
                            bias="none",  # No extra bias parameters
                            task_type=TaskType.CAUSAL_LM  # Since we're fine-tuning GPT-style models
                        )
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
# Print trainable parameters
model.print_trainable_parameters()

model.train() #This ensures layers like Dropout & BatchNorm are active during training.

trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.1797




PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
             

### 10. Set Up Training Arguments

In [50]:
# 10. Set Up Training Arguments

from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
                                    output_dir="./output",
                                    evaluation_strategy="epoch",
                                    #save_strategy="epoch",
                                    save_steps=500,
                                    learning_rate=1e-5,
                                    weight_decay=0.01,
                                    num_train_epochs=3,
                                    per_device_train_batch_size=2,    # Batch size (adjust for GPU memory)
                                    per_device_eval_batch_size=2,
                                    logging_steps=50,
                                    logging_dir="./logs",
                                    resume_from_checkpoint=True
                                )



### 11. Initialize Trainer and Start Training

In [51]:
# 11. Initialize Trainer and Start Training

from transformers import Trainer
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    optimizers=(optimizer, None),
    data_collator=data_collator
)
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.3281,1.238758
2,1.2771,1.186174
3,1.2654,1.173945


TrainOutput(global_step=1425, training_loss=1.3180925897966351, metrics={'train_runtime': 2315.1557, 'train_samples_per_second': 1.231, 'train_steps_per_second': 0.616, 'total_flos': 145952686080000.0, 'train_loss': 1.3180925897966351, 'epoch': 3.0})

### 12. Save the Fine-Tuned Model and Tokenizer Locally

In [52]:
# 12. Save the Fine-Tuned Model and Tokenizer Locally

from transformers import AutoTokenizer, AutoModelForCausalLM
finetuned_dir = os.path.join("HFFinetunedModel", "fine_tuned_distilgpt2_Tamil")
os.makedirs(finetuned_dir, exist_ok=True)
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

('HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\tokenizer_config.json',
 'HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\special_tokens_map.json',
 'HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\vocab.json',
 'HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\merges.txt',
 'HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\added_tokens.json',
 'HFFinetunedModel\\fine_tuned_distilgpt2_Tamil\\tokenizer.json')

### 13. Load and Use the Fine-Tuned Model

In [56]:
# 13. Load and Use the Fine-Tuned Model

from transformers import AutoModelForCausalLM, AutoTokenizer
import os

finetuned_dir = os.path.join("HFFinetunedModel", "fine_tuned_distilgpt2_Tamil")

# Load the fine-tuned model and tokenizer from local directory
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
model = AutoModelForCausalLM.from_pretrained(finetuned_dir)

text = "ஒரு நாள்"
inputs = tokenizer(text, return_tensors="pt")
# Generate story
output = model.generate(inputs.input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ஒரு நாள்ரு நாிரு நாிரு நாிரு நாிரு நாிரு நாிர�


### 14. Training the model once again

In [54]:
# 14. Fine-tune the model again and save with incremented version

import os
from transformers import TrainingArguments, Trainer
from torch.optim import AdamW

# Find the next available version number for saving
base_dir = os.path.join("HFFinetunedModel", "fine_tuned_distilgpt2_Tamil")
version = 1
while os.path.exists(f"{base_dir}_v{version}"):
    version += 1
finetuned_dir = f"{base_dir}_v{version}"
os.makedirs(finetuned_dir, exist_ok=True)

model.train()  # Ensure model is in training mode
optimizer = AdamW(model.parameters(), lr=1e-5)

training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    save_steps=500,
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=50,
    logging_dir="./logs",
    resume_from_checkpoint=True
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    optimizers=(optimizer, None),
    data_collator=data_collator
)
trainer.train()

# Save the new fine-tuned model and tokenizer with version
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)



RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn