In [11]:
import os 
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()
hf_token = os.getenv("HUGGING_FACE_API_KEY")
login(token=hf_token)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GPT2LMHeadModel

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

In [13]:
import pandas as pd

# https://huggingface.co/datasets/tniranjan/aitamilnadu_tamil_stories_no_instruct?library=datasets
# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/tniranjan/aitamilnadu_tamil_stories_no_instruct/" + splits["train"])

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, train_size=0.90, random_state=42)

In [14]:
def tokenize_function(example):
    tokens = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

In [15]:
from datasets import Dataset

tokenizer.pad_token = tokenizer.eos_token
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 972/972 [00:02<00:00, 354.07 examples/s]
Map: 100%|██████████| 109/109 [00:00<00:00, 353.70 examples/s]


In [16]:
for name, module in model.named_modules():
    if "proj" in name or "query" in name or "value" in name:
        print(name)


transformer.h.0.attn.c_proj
transformer.h.0.mlp.c_proj
transformer.h.1.attn.c_proj
transformer.h.1.mlp.c_proj
transformer.h.2.attn.c_proj
transformer.h.2.mlp.c_proj
transformer.h.3.attn.c_proj
transformer.h.3.mlp.c_proj
transformer.h.4.attn.c_proj
transformer.h.4.mlp.c_proj
transformer.h.5.attn.c_proj
transformer.h.5.mlp.c_proj


In [17]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_proj"]
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 258,048 || all params: 82,170,624 || trainable%: 0.3140




In [18]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    num_train_epochs=3,
    learning_rate=1e-5,
    logging_dir="./logs",
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),

)

model.tie_weights()
trainer.train()
trainer.save_model("CustomeModel(DistilGPT2)")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.5446,1.452523
2,1.5182,1.413943
3,1.4875,1.402632
