In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:

import os
import torch
import argparse
import numpy as np
from datetime import datetime
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import wandb
import json



In [3]:
# Parse arguments for customization
parser = argparse.ArgumentParser(description="Fine-tune a language model for text generation")
parser.add_argument("--base_model", type=str, default="gpt2", help="Base model to fine-tune (default: gpt2)")
parser.add_argument("--model_name", type=str, default="uzmi", help="Name for your fine-tuned model")
parser.add_argument("--dataset_name", type=str, default="wikitext", help="Dataset to use from Hugging Face")
parser.add_argument("--dataset_config", type=str, default="wikitext-2-raw-v1", help="Dataset configuration")
parser.add_argument("--dataset_split", type=str, default="train[:10%]", help="Dataset split percentage")
parser.add_argument("--custom_data_path", type=str, default=None, help="Path to custom text files (optional)")
parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
parser.add_argument("--batch_size", type=int, default=8, help="Training batch size")
parser.add_argument("--seq_length", type=int, default=256, help="Maximum sequence length")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate")
parser.add_argument("--use_wandb", action="store_true", help="Enable Weights & Biases logging")
parser.add_argument("--gradient_accumulation", type=int, default=1, help="Gradient accumulation steps")
parser.add_argument("--fp16", action="store_true", help="Enable mixed precision training")
parser.add_argument("--early_stopping", action="store_true", help="Enable early stopping")
parser.add_argument("--early_stopping_patience", type=int, default=3, help="Early stopping patience")
parser.add_argument("--warmup_steps", type=int, default=500, help="Warmup steps for learning rate scheduler")
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X steps")
parser.add_argument("--eval_steps", type=int, default=500, help="Evaluate every X steps")
parser.add_argument("--test_size", type=float, default=0.1, help="Test set size (0-1)")

if __name__ == "__main__":
    args, _ = parser.parse_known_args()
else:
    args, _ = parser.parse_known_args()

In [4]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = args.model_name
save_path = f"/content/drive/MyDrive/{model_name}_{timestamp}"
logs_path = f"{save_path}/logs"
checkpoints_path = f"{save_path}/checkpoints"
for path in [save_path, logs_path, checkpoints_path]:
    if not os.path.exists(path):
        os.makedirs(path)

print(f"Model will be saved to: {save_path}")

Model will be saved to: /content/drive/MyDrive/uzmi_20250513_165055


In [5]:

if args.use_wandb:
    print("Initializing Weights & Biases...")
    wandb.init(
        project=f"{model_name}-training",
        name=f"{model_name}_{timestamp}",
        config=vars(args)
    )


In [6]:
print(f"Loading {args.base_model} tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(args.base_model)
model = AutoModelForCausalLM.from_pretrained(args.base_model)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))


Loading gpt2 tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding(50257, 768)

In [7]:
pip install -U datasets huggingface_hub fsspec


Collecting fsspec
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [8]:
# Load dataset
print("Loading dataset...")
if args.custom_data_path:
    # Load custom data from text files
    text_files = [os.path.join(args.custom_data_path, f) for f in os.listdir(args.custom_data_path) if f.endswith('.txt')]
    texts = []
    for file_path in tqdm(text_files, desc="Reading text files"):
        with open(file_path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
    dataset = Dataset.from_dict({"text": texts})
    # Split dataset into train and validation
    train_texts, val_texts = train_test_split(texts, test_size=args.test_size, random_state=42)
    train_dataset = Dataset.from_dict({"text": train_texts})
    val_dataset = Dataset.from_dict({"text": val_texts})
else:
    try:

        config = args.dataset_config if args.dataset_config else None
        dataset = load_dataset(
            args.dataset_name,
            config,
            split=args.dataset_split
        )
        if isinstance(dataset, Dataset):
            train_val = dataset.train_test_split(test_size=args.test_size, seed=42)
            train_dataset = train_val["train"]
            val_dataset = train_val["test"]
        else:
            train_dataset = dataset["train"]
            val_dataset = dataset["validation"] if "validation" in dataset else dataset["test"]
    except ValueError as e:
        if "Invalid pattern: '**'" in str(e):
            print("ERROR: Dataset loading failed due to '**' pattern. "
                  "Please upgrade your packages with:\n"
                  "!pip install -U datasets huggingface_hub fsspec\n"
                  "Then restart your runtime and rerun this cell.")
            raise
        else:
            raise

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Loading dataset...


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Train dataset size: 3304
Validation dataset size: 368


In [9]:
def preprocess_function(examples):
    """Tokenize the texts and prepare them for language modeling"""
    outputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=args.seq_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == args.seq_length:
            input_batch.append(input_ids)
    result = {"input_ids": input_batch}
    result["attention_mask"] = [
        [1] * len(input_ids) for input_ids in result["input_ids"]
    ]
    result["labels"] = result["input_ids"].copy()
    return result


In [10]:
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train dataset",
    num_proc=4
)

tokenized_val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation dataset",
    num_proc=4
)


Tokenizing datasets...


Tokenizing train dataset (num_proc=4):   0%|          | 0/3304 [00:00<?, ? examples/s]

Tokenizing validation dataset (num_proc=4):   0%|          | 0/368 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [12]:
callbacks = []
if args.early_stopping:
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=args.early_stopping_patience
    )
    callbacks.append(early_stopping_callback)


In [13]:
training_args = TrainingArguments(
    output_dir=checkpoints_path,
    overwrite_output_dir=True,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    gradient_accumulation_steps=args.gradient_accumulation,
    eval_strategy="steps",
    eval_steps=args.eval_steps,
    logging_dir=logs_path,
    logging_steps=100,
    save_strategy="steps",
    save_steps=args.save_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=args.learning_rate,
    warmup_steps=args.warmup_steps,
    weight_decay=0.01,
    fp16=args.fp16 or torch.cuda.is_available(),
    report_to="wandb" if args.use_wandb else "none",
    run_name=f"{model_name}_{timestamp}" if args.use_wandb else None,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    callbacks=callbacks,
)


In [15]:

print("Starting training...")
trainer.train()


Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=63, training_loss=3.935485355437748, metrics={'train_runtime': 1909.5254, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.033, 'total_flos': 65845592064000.0, 'train_loss': 3.935485355437748, 'epoch': 3.0})

In [16]:
print("Evaluating final model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluating final model...


Evaluation results: {'eval_loss': 3.641033411026001, 'eval_runtime': 22.1064, 'eval_samples_per_second': 0.859, 'eval_steps_per_second': 0.136, 'epoch': 3.0}


In [17]:
print(f"Saving model to {save_path}...")
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

Saving model to /content/drive/MyDrive/uzmi_20250513_165055...


('/content/drive/MyDrive/uzmi_20250513_165055/tokenizer_config.json',
 '/content/drive/MyDrive/uzmi_20250513_165055/special_tokens_map.json',
 '/content/drive/MyDrive/uzmi_20250513_165055/vocab.json',
 '/content/drive/MyDrive/uzmi_20250513_165055/merges.txt',
 '/content/drive/MyDrive/uzmi_20250513_165055/added_tokens.json',
 '/content/drive/MyDrive/uzmi_20250513_165055/tokenizer.json')

In [21]:
def generate_samples(model, tokenizer, prompts, max_length=50):
    """Generate text samples from the model using given prompts"""
    model.eval()
    generated_texts = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            inputs.input_ids,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_texts.append(generated_text)
    return generated_texts

test_prompts = [
    "The future of artificial intelligence is",
    "In a world where technology dominates,",
    "The most important thing of love is",
    "Who is Prime Minister of India ?"
]

print("\nGenerating sample outputs...")
generated_samples = generate_samples(model, tokenizer, test_prompts)

print("\n===== SAMPLE GENERATIONS =====")
for prompt, generated in zip(test_prompts, generated_samples):
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {generated}")
    print("-" * 50)



Generating sample outputs...

===== SAMPLE GENERATIONS =====

Prompt: The future of artificial intelligence is
Generated: The future of artificial intelligence is in the hands of the next generation of experts at Google. The company is currently working on developing a new technology to improve its AI algorithms and machine learning algorithms. Google recently announced a partnership with Facebook to create artificial neural networks
--------------------------------------------------

Prompt: In a world where technology dominates,
Generated: In a world where technology dominates, it is unlikely that we will ever see the best of them. It is a challenge, however, to find a way to make our own devices work.

It has been said that Samsung's next generation of
--------------------------------------------------

Prompt: The most important thing of love is
Generated: The most important thing of love is the ability to love your loved one. Love your brother, and love the person in the way you 

In [22]:
inference_config = {
    "max_length": 100,
    "temperature": 0.8,
    "top_k": 50,
    "top_p": 0.95,
    "no_repeat_ngram_size": 2,
    "num_beams": 1,
    "do_sample": True
}

with open(f"{save_path}/inference_config.json", "w") as f:
    json.dump(inference_config, f, indent=2)


In [23]:
from huggingface_hub import login

login(token="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")


In [24]:
!pip install -U transformers




In [28]:
from huggingface_hub import create_repo, upload_folder
hf_token = "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"
repo_id = "rajan3208/uzmi-gpt"

create_repo(repo_id, token=hf_token, exist_ok=True)
upload_folder(
    folder_path="/content/drive/MyDrive/uzmi_20250513_165055",
    repo_id=repo_id,
    repo_type="model",
    token=hf_token
)

rng_state.pth:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rajan3208/uzmi-gpt/commit/a4eed9e8f3bd8f1a432379ee134c95dd4bad401c', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a4eed9e8f3bd8f1a432379ee134c95dd4bad401c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rajan3208/uzmi-gpt', endpoint='https://huggingface.co', repo_type='model', repo_id='rajan3208/uzmi-gpt'), pr_revision=None, pr_num=None)

In [29]:
# Iss code se hum test kar rahe hum apne model ko using its api
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "rajan3208/uzmi-gpt"

# Yahan par hum Load karenge tokenizer aur model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# to Generate text
inputs = tokenizer("Once upon a time", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, the world was filled with the sounds of the distant thunder of the moon, and the sounds of the distant thunder of the sun. The earth was filled with the sounds of the distant thunder of the moon, and the earth was filled with the sounds
