In [None]:
import os

from huggingface_hub import login

# Never hard-code the token in the notebook. Read it from the HF_TOKEN
# environment variable instead; an unset or empty value becomes None, in which
# case login() asks for the token interactively rather than trying to
# authenticate with an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
!pip install datasets transformers scikit-learn huggingface_hub --quiet

In [None]:
! pip install -U datasets huggingface_hub fsspec


Collecting fsspec
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)


In [None]:
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)


Using device: cpu


In [None]:
import json
from itertools import islice

from datasets import load_dataset
from google.colab import drive

# Number of rows to export. The file name below says "1000K", i.e. one million.
NUM_ROWS = 1_000_000
save_path = "/content/drive/MyDrive/CulturaX_Urdu_1000K.jsonl"

# The Urdu subset is selected through the `name` argument ("ur").
# streaming=True iterates over the data without downloading the full split first.
dataset = load_dataset("uonlp/CulturaX", name="ur", split="train", streaming=True)

# Mount Google Drive so the export survives the Colab session
drive.mount('/content/drive')

# Write the first NUM_ROWS examples as JSON Lines (one JSON object per line).
num_saved = 0
with open(save_path, "w", encoding="utf-8") as f:
    for example in islice(dataset, NUM_ROWS):
        # ensure_ascii=False keeps the Urdu text readable instead of \u escapes.
        json.dump(example, f, ensure_ascii=False)
        f.write("\n")
        num_saved += 1

# Report the number of rows actually written (the stream may be shorter than NUM_ROWS).
print(f"Saved {num_saved:,} Urdu examples to {save_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved 500K Urdu examples to /content/drive/MyDrive/CulturaX_Urdu_500K.jsonl


In [None]:
!pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=4811d8739adebd2806a86fc7c692b5449e2e3b9ce592abbe048aa84a385164a1
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
import json

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm

input_path = "/content/drive/MyDrive/CulturaX_Urdu_1000K.jsonl"
output_path = "/content/drive/MyDrive/CulturaX_Urdu_Cleaned.txt"

# Progress-bar hint only: the export step writes rows 0..999_999, i.e. one
# million lines. Adjust if the input file holds a different number of rows.
EXPECTED_ROWS = 1_000_000

# langdetect is randomized by default; a fixed seed makes the filter reproducible.
DetectorFactory.seed = 0

cleaned_lines = []

with open(input_path, "r", encoding="utf-8") as infile:
    for line in tqdm(infile, total=EXPECTED_ROWS):
        obj = json.loads(line)
        # Collapse each document onto one line so the output is one document per line.
        text = obj["text"].strip().replace("\n", " ")
        try:
            is_urdu = detect(text) == "ur"
        except LangDetectException:
            # Raised when detection fails for a text (e.g. nothing usable to
            # detect from); skip that document. Other errors are not swallowed.
            continue
        if is_urdu:
            cleaned_lines.append(text)

# Save cleaned data
with open(output_path, "w", encoding="utf-8") as outfile:
    for line in cleaned_lines:
        outfile.write(line + "\n")

print(f"Cleaned and saved {len(cleaned_lines)} lines to: {output_path}")


10000it [02:25, 68.94it/s]


Cleaned and saved 10000 lines to: /content/drive/MyDrive/CulturaX_Urdu_Cleaned.txt


In [None]:
!pip install tokenizers

In [None]:
import os

from tokenizers import ByteLevelBPETokenizer

TOKENIZER_DIR = "/content/urdu_tokenizer"

# save_model() writes vocab.json / merges.txt into an existing directory and
# does not create it, so make sure it exists on a fresh runtime.
os.makedirs(TOKENIZER_DIR, exist_ok=True)

# Train a byte-level BPE vocabulary on the cleaned text file.
# Merges seen fewer than `min_frequency` times are not added to the vocabulary.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=output_path, vocab_size=300_000, min_frequency=2, special_tokens=[
    "<s>", "<pad>", "</s>", "<unk>", "<mask>"
])

tokenizer.save_model(TOKENIZER_DIR)


['/content/urdu_tokenizer/vocab.json', '/content/urdu_tokenizer/merges.txt']

In [None]:
from transformers import GPT2TokenizerFast

# Wrap the trained vocab/merges files in a transformers-compatible tokenizer.
# GPT2TokenizerFast defaults bos_token and eos_token to "<|endoftext|>", which
# is not among the special tokens the BPE vocabulary was trained with. Map them
# to "<s>" / "</s>" explicitly so every special token has an id inside the
# trained vocabulary. cls/sep are kept for callers that read those attributes.
tokenizer = GPT2TokenizerFast(
    vocab_file="/content/urdu_tokenizer/vocab.json",
    merges_file="/content/urdu_tokenizer/merges.txt",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    cls_token="<s>",
    sep_token="</s>",
    mask_token="<mask>"
)


In [None]:
# Write the tokenizer config and special-token files next to vocab.json / merges.txt.
tokenizer.save_pretrained(save_directory="/content/urdu_tokenizer")

('/content/urdu_tokenizer/tokenizer_config.json',
 '/content/urdu_tokenizer/special_tokens_map.json',
 '/content/urdu_tokenizer/vocab.json',
 '/content/urdu_tokenizer/merges.txt',
 '/content/urdu_tokenizer/added_tokens.json',
 '/content/urdu_tokenizer/tokenizer.json')

In [None]:
from datasets import load_dataset, Dataset

# Label value that the causal-LM loss ignores.
IGNORE_INDEX = -100

# Convert cleaned text to Hugging Face dataset
urdu_lines = [{"text": line} for line in cleaned_lines]
ds = Dataset.from_list(urdu_lines)


def _mask_padding(ids, mask):
    """Return a copy of `ids` with positions where `mask` is 0 replaced by IGNORE_INDEX."""
    return [tok if keep else IGNORE_INDEX for tok, keep in zip(ids, mask)]


def tokenize(example):
    """Tokenize text to fixed-length inputs and build causal-LM labels.

    Args:
        example: a mapping with a "text" entry holding either a single string
            or a list of strings (the batched form passed by `ds.map`).

    Returns:
        The tokenizer output with an added "labels" entry. Labels mirror
        "input_ids", except that padded positions are set to IGNORE_INDEX so
        the training loss is not computed on `<pad>` tokens.
    """
    tokenized = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    if isinstance(example["text"], str):
        # Single example: input_ids is a flat list of token ids.
        tokenized["labels"] = _mask_padding(input_ids, attention_mask)
    else:
        # Batched call: one list of token ids per input text.
        tokenized["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized


tokenized_ds = ds.map(tokenize, batched=True)
tokenized_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments

# Small GPT-2 style model trained from scratch.
config = GPT2Config(
    # len(tokenizer) also counts special tokens added on top of the base
    # vocabulary; tokenizer.vocab_size does not, which would leave such ids
    # outside the embedding matrix.
    vocab_size=len(tokenizer),
    n_positions=128,  # matches the max_length=128 used when tokenizing
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id,
    # Record the pad id in the config so generation does not fall back to
    # using the EOS id for padding.
    pad_token_id=tokenizer.pad_token_id
)

model = GPT2LMHeadModel(config)

training_args = TrainingArguments(
    output_dir="/content/urdu_llm",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=100,
    logging_dir="/content/logs",
    report_to="none"  # no external experiment tracker
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_ds
)

trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,6.368
200,5.8673
300,5.6116
400,5.47
500,5.309
600,5.2269
700,5.1261
800,5.0447
900,4.9858
1000,4.9442


TrainOutput(global_step=3750, training_loss=4.822562963867187, metrics={'train_runtime': 2373.7524, 'train_samples_per_second': 12.638, 'train_steps_per_second': 1.58, 'total_flos': 72796078080000.0, 'train_loss': 4.822562963867187, 'epoch': 3.0})

In [None]:
# Store the trained weights and the tokenizer in the same directory so both
# can be reloaded from a single path.
model.save_pretrained(save_directory="/content/urdu_llm")
tokenizer.save_pretrained(save_directory="/content/urdu_llm")


('/content/urdu_llm/tokenizer_config.json',
 '/content/urdu_llm/special_tokens_map.json',
 '/content/urdu_llm/vocab.json',
 '/content/urdu_llm/merges.txt',
 '/content/urdu_llm/added_tokens.json',
 '/content/urdu_llm/tokenizer.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Reload the tokenizer and model from the saved model directory.
tokenizer = GPT2TokenizerFast.from_pretrained("/content/urdu_llm")

# eval() switches the model to inference mode and returns the model itself.
model = GPT2LMHeadModel.from_pretrained("/content/urdu_llm").eval()
model


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(1000, 256)
    (wpe): Embedding(128, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-3): 4 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=768, nx=256)
          (c_proj): Conv1D(nf=256, nx=256)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=1024, nx=256)
          (c_proj): Conv1D(nf=256, nx=1024)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=256, out_features=1000, bias=False)
)

In [None]:
import torch

prompt = "پاکستان کا دارالحکومت"  # e.g., "Capital of Pakistan"

# Sampling is stochastic; fix the seed so the generated text is reproducible.
torch.manual_seed(42)

# Tokenize the prompt and move the tensors to the device the model lives on.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(
        **inputs,
        max_length=100,  # total length in tokens, prompt included
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        # Pass the pad id explicitly instead of relying on the EOS fallback.
        pad_token_id=tokenizer.pad_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


پاکستان کا دارالحکومت کو گرزارالباریخریڈ وائس میں جینڈکشین، آزا کراڑنے والے وزیر اعتوکستانی کی باب کے بعد مین کا نبراتل پر لائٹوز کے لیے سندلاحلی پر وائرسمی سشاعت‮ Poned 09 Aptanni
