In [1]:
# Cell 1: Setup and Installation
!pip install sentence-transformers faiss-cpu transformers datasets peft bitsandbytes accelerate trl

print("✓ All dependencies installed!")

# Check GPU
import torch
print(f"\n🖥️  GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"   GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("   ⚠️  WARNING: No GPU detected! Please enable GPU:")
    print("   Runtime → Change runtime type → Hardware accelerator → GPU")

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata

In [2]:
from datasets import load_dataset

# Identifier of the LEGAL-UQA dataset repository; loading it yields the
# train/validation splits used throughout the notebook.
LEGAL_UQA_REPO = "nlp-anonymous-researcher/LEGAL-UQA"
data = load_dataset(LEGAL_UQA_REPO)

README.md:   0%|          | 0.00/683 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/480k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/222k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/495 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/124 [00:00<?, ? examples/s]

In [3]:
# Echo the loaded object to inspect its splits, feature names and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 124
    })
})

In [4]:
# Convert both dataset splits to pandas DataFrames in one pass.
df_train, df_val = (data[split_name].to_pandas() for split_name in ('train', 'validation'))

In [5]:
# Plain-text preview of the first rows of the training split.
print("TRAIN SET:", df_train.head(), sep="\n")

TRAIN SET:
                                        question_eng  \
0  Can the National Assembly discuss or vote on t...   
1  What happens if no candidate secures a majorit...   
2  Can the state make special rules for women and...   
3  Who is responsible for presenting the budget t...   
4  How are the seats reserved for women in the Na...   

                                       question_urdu  \
0  کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ ج...   
1  اگر کسی امیدوار کو پہلی رائے شماری میں اکثریت ...   
2  کیا مملکت عورتوں اور بچوں کے لیے خاص قوانین بن...   
3  ہر سال صوبائی اسمبلی کے سامنے بجٹ پیش کرنے کی ...   
4  قومی اسمبلی میں خواتین کے لیے مخصوص نشستیں کیس...   

                                         context_eng  \
0  Authentication of schedule of authorized expen...   
1  The Cabinet\n\n3[91. (1) There shall be a Cabi...   
2  25A.\tEquality of citizens\n(1) All citizens a...   
3  Annual Budget Statement\n(1) The Provincial Go...   
4  National Assembly\n3[51. (1) The

In [6]:
# Plain-text preview of the first rows of the validation split.
print("\nVALIDATION SET:", df_val.head(), sep="\n")


VALIDATION SET:
                                        question_eng  \
0  What is the term duration for a member of the ...   
1  What areas were considered Tribal Areas in Pak...   
2  Can a High Court Judge be transferred to anoth...   
3  Who has the authority to amend the existing ru...   
4  What must the Party Head do before declaring a...   

                                       question_urdu  \
0       اسلامی کونسل کے رکن کی مدت کار کتنی ہوتی ہے؟   
1  پاکستان میں یوم آغاز سے قبل کن علاقوں کو قبائل...   
2  کیا کسی عدالت عالیہ کے جج کا اس کی رضامندی کے ...   
3  موجودہ قواعد و احکام میں ترمیم کرنے کا اختیار ...   
4  پارٹی کے سربراہ کو کسی رکن کو منحرف قرار دینے ...   

                                         context_eng  \
0  Composition, etc., of Islamic Council\n(1)    ...   
1  Tribal Areas\nIn the Constitution,—\n\n"Tribal...   
2  Transfer of High Court Judges\n\n(1)       The...   
3  Existing rules, etc., to continue\n\nUntil the...   
4  Disqualification on ground

In [7]:
# Show the first training rows through IPython's display() instead of print().
from IPython.display import display
display(df_train.head())

Unnamed: 0,question_eng,question_urdu,context_eng,context_urdu,answer_eng,answer_urdu,context_index,__index_level_0__
0,Can the National Assembly discuss or vote on t...,کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ ج...,Authentication of schedule of authorized expen...,منظور شدہ مصارف کی توثیق\n\n(1) وزیر اعظم اپنے...,"No, the authenticated schedule of authorized e...",نہیں، منظور شدہ مصارف کی توثیق شدہ جدول کو قوم...,87,166
1,What happens if no candidate secures a majorit...,اگر کسی امیدوار کو پہلی رائے شماری میں اکثریت ...,The Cabinet\n\n3[91. (1) There shall be a Cabi...,صدر کو اس کے کارہائے منصبی کی انجام دہی میں مد...,If no candidate secures a majority in the firs...,اگر کسی امیدوار کو پہلی رائے شماری میں اکثریت ...,95,183
2,Can the state make special rules for women and...,کیا مملکت عورتوں اور بچوں کے لیے خاص قوانین بن...,25A.\tEquality of citizens\n(1) All citizens a...,۵۲ا۔ شہریوں سے مساوات\n(1) تمام شہری قانون کی ...,"Yes, the state can make special provisions for...",جی ہاں، مملکت عورتوں اور بچوں کے تحفظ کے لیے خ...,27,47
3,Who is responsible for presenting the budget t...,ہر سال صوبائی اسمبلی کے سامنے بجٹ پیش کرنے کی ...,Annual Budget Statement\n(1) The Provincial Go...,۰۰۱۔ صوبائی حکومت، ہر مالی سال کی بابت، صوبائی...,The Provincial Government is responsible for p...,ہر سال صوبائی اسمبلی کے سامنے بجٹ پیش کرنے کی ...,124,236
4,How are the seats reserved for women in the Na...,قومی اسمبلی میں خواتین کے لیے مخصوص نشستیں کیس...,National Assembly\n3[51. (1) There shall be 4[...,قومی اسمبلی\n\n51. (1) قومی اسمبلی میں اراکین ...,The seats reserved for women are allocated to ...,خواتین کے لیے مخصوص نشستیں ہر صوبے کے لیے قومی...,54,95


In [8]:
# Row counts of the two official splits and their sum.
n_train_rows, n_val_rows = len(df_train), len(df_val)
print(f"   Train samples: {n_train_rows}")
print(f"   Validation samples: {n_val_rows}")
print(f"   Total samples: {n_train_rows + n_val_rows}")

   Train samples: 495
   Validation samples: 124
   Total samples: 619


In [9]:
import pandas as pd
# Stack the official train and validation rows into a single frame with a fresh
# 0..n-1 index; the notebook makes its own train/val/test split from it later.
df = pd.concat((df_train, df_val), ignore_index=True)

In [10]:
# Size and column names of the combined frame.
print(f"   Combined dataset: {df.shape[0]} samples")
print(f"   Columns: {df.columns.tolist()}")

   Combined dataset: 619 samples
   Columns: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__']


In [11]:
print("     Standardizing column names...")
# Capitalise each field name and spell out the language suffix,
# e.g. 'question_eng' -> 'Question_english', 'answer_urdu' -> 'Answer_urdu'.
column_renames = {
    f"{field}_{short_lang}": f"{field.capitalize()}_{long_lang}"
    for short_lang, long_lang in (('urdu', 'urdu'), ('eng', 'english'))
    for field in ('question', 'context', 'answer')
}
column_renames['context_index'] = 'Context_index'
df = df.rename(columns=column_renames)

     Standardizing column names...


In [12]:
# Remove the '__index_level_0__' column when it is present; errors='ignore'
# makes the drop a no-op when the column is absent.
df = df.drop(columns=['__index_level_0__'], errors='ignore')

In [13]:
# Show the first three rows of the renamed frame
from IPython.display import display
print("Sample data:")
display(df.iloc[:3])

Sample data:


Unnamed: 0,Question_english,Question_urdu,Context_english,Context_urdu,Answer_english,Answer_urdu,Context_index
0,Can the National Assembly discuss or vote on t...,کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ ج...,Authentication of schedule of authorized expen...,منظور شدہ مصارف کی توثیق\n\n(1) وزیر اعظم اپنے...,"No, the authenticated schedule of authorized e...",نہیں، منظور شدہ مصارف کی توثیق شدہ جدول کو قوم...,87
1,What happens if no candidate secures a majorit...,اگر کسی امیدوار کو پہلی رائے شماری میں اکثریت ...,The Cabinet\n\n3[91. (1) There shall be a Cabi...,صدر کو اس کے کارہائے منصبی کی انجام دہی میں مد...,If no candidate secures a majority in the firs...,اگر کسی امیدوار کو پہلی رائے شماری میں اکثریت ...,95
2,Can the state make special rules for women and...,کیا مملکت عورتوں اور بچوں کے لیے خاص قوانین بن...,25A.\tEquality of citizens\n(1) All citizens a...,۵۲ا۔ شہریوں سے مساوات\n(1) تمام شہری قانون کی ...,"Yes, the state can make special provisions for...",جی ہاں، مملکت عورتوں اور بچوں کے تحفظ کے لیے خ...,27


In [14]:
# Print every Urdu/English field of the first row, one labelled section each
print("Detailed view of first example:")
print("=" * 34)
first_row = df.iloc[0]
example_sections = (
    ("Question (Urdu)", first_row['Question_urdu']),
    ("Question (English)", first_row['Question_english']),
    # the context is long, so only its first 200 characters are shown
    ("Context (Urdu) [first 200 chars]", f"{first_row['Context_urdu'][:200]}..."),
    ("Answer (Urdu)", first_row['Answer_urdu']),
)
for section_title, section_body in example_sections:
    print(f"\n{section_title}:")
    print(f"  {section_body}")
print(f"\nContext Index: {first_row['Context_index']}")


Detailed view of first example:

Question (Urdu):
  کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ جدول پر بحث یا رائے شماری کر سکتی ہے؟

Question (English):
  Can the National Assembly discuss or vote on the authenticated schedule of authorized expenditure?

Context (Urdu) [first 200 chars]:
  منظور شدہ مصارف کی توثیق

(1) وزیر اعظم اپنے دستخطوں سے ایک جدول کی توثیق کرے گا جس میں حسب ذیل کی تصریح ہوگی:

- ان رقوم کی جو قومی اسمبلی نے آرٹیکل 82 کے تحت منظور کی ہوں یا جس کا منظور کیا جانا متص...

Answer (Urdu):
  نہیں، منظور شدہ مصارف کی توثیق شدہ جدول کو قومی اسمبلی میں پیش کیا جائے گا لیکن اس پر بحث یا رائے شماری نہیں ہوگی۔

Context Index: 87


In [15]:
# Count nulls per column and list only the columns that have any
print(" Checking data quality")
print(f"   Missing values:")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("   No missing values!")

 Checking data quality
   Missing values:
   No missing values!


In [16]:
import os

# Location of the CSV backup of the combined dataset
backup_csv = 'data/legal_uqa.csv'

# Ensure the target folder exists, then write the frame without its index column
os.makedirs(os.path.dirname(backup_csv), exist_ok=True)
df.to_csv(backup_csv, index=False)
print(f"✓ Dataset saved to: {backup_csv}")

✓ Dataset saved to: data/legal_uqa.csv


In [17]:
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.model_selection import train_test_split
import json

# **STEP 2: DATA PREPROCESSING**

In [18]:
def clean_urdu_text(text):
    """Clean and normalize Urdu text.

    Steps, in order:
      1. Missing values (None / NaN, as judged by ``pd.isna``) become "".
      2. The value is converted to ``str`` and NFKC-normalised.
      3. Zero-width non-joiner (U+200C) and zero-width space (U+200B) are removed.
      4. Every run of whitespace is collapsed to one space and the ends are trimmed.

    Normalisation and zero-width removal happen *before* whitespace collapsing:
    NFKC can introduce spaces of its own, and deleting a zero-width character that
    sits between two spaces would otherwise leave a double space behind.

    Args:
        text: A scalar value (typically a string or NaN) from a text column.

    Returns:
        The cleaned string; "" for missing input.
    """
    if pd.isna(text):
        return ""

    text = unicodedata.normalize('NFKC', str(text))
    text = text.replace('\u200c', '').replace('\u200b', '')
    # Collapse only after the steps above so no stray or doubled spaces survive
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [19]:
# Read the combined dataset back from its CSV backup
saved_csv = 'data/legal_uqa.csv'
df = pd.read_csv(saved_csv)
print(f"   Total samples: {df.shape[0]}")

   Total samples: 619


In [20]:
# Add a '<column>_clean' companion for each Urdu text column
print("Cleaning Urdu text...")
for urdu_column in ('Question_urdu', 'Context_urdu', 'Answer_urdu'):
    df[f'{urdu_column}_clean'] = df[urdu_column].apply(clean_urdu_text)

Cleaning Urdu text...


In [21]:
# Keep a row only if none of its three cleaned Urdu fields is the empty string
cleaned_columns = ['Question_urdu_clean', 'Context_urdu_clean', 'Answer_urdu_clean']
df = df[(df[cleaned_columns] != '').all(axis=1)]
print(f"   Samples after cleaning: {len(df)}")

   Samples after cleaning: 619


In [22]:
# One row per Context_index (the first occurrence wins); the English context is
# carried along for reference only.
print("Extracting unique contexts...")
context_columns = ['Context_index', 'Context_urdu_clean', 'Context_english']
unique_contexts = df.drop_duplicates(subset=['Context_index'])[context_columns]
print(f"   Unique constitutional articles: {len(unique_contexts)}")

Extracting unique contexts...
   Unique constitutional articles: 305


In [23]:
import os

# Folder that receives every preprocessed artefact
processed_dir = 'processed_data'
os.makedirs(processed_dir, exist_ok=True)

# Write the de-duplicated contexts without the index column
unique_contexts.to_csv(f'{processed_dir}/unique_contexts.csv', index=False)

In [24]:
# Split data: 70% train, 15% validation, remainder test
print("Splitting data...")

np.random.seed(42)

# Reproducibly permute the rows and renumber them from zero
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Boundaries of the three consecutive slices of the shuffled frame
n_total = len(df_shuffled)
n_train = int(0.7 * n_total)
n_val = int(0.15 * n_total)
val_end = n_train + n_val

# NOTE(review): the split is by row, so questions that share a Context_index can
# end up in different splits — confirm this is acceptable for the evaluation.
train_df = df_shuffled.iloc[:n_train]
val_df = df_shuffled.iloc[n_train:val_end]
test_df = df_shuffled.iloc[val_end:]

for split_label, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"   {split_label}: {len(split_frame)} samples ({len(split_frame)/n_total*100:.1f}%)")

Splitting data...
   Train: 433 samples (70.0%)
   Validation: 92 samples (14.9%)
   Test: 94 samples (15.2%)


In [25]:
# Number of distinct Context_index values in each split and in the full frame
print(f"   Context distribution:")
context_report = (
    ("Train contexts", train_df),
    ("Val contexts", val_df),
    ("Test contexts", test_df),
    ("Total unique contexts", df),
)
for report_label, report_frame in context_report:
    print(f"   - {report_label}: {report_frame['Context_index'].nunique()}")

   Context distribution:
   - Train contexts: 270
   - Val contexts: 82
   - Test contexts: 82
   - Total unique contexts: 305


In [26]:
# Create fine-tuning data
print(" Creating fine-tuning data (JSONL format)...")

def create_finetuning_data(df, unique_contexts, output_file):
    """Write instruction-tuning examples for ``df`` to a JSONL file.

    For every row of ``df`` the Urdu context belonging to its ``Context_index`` is
    looked up in ``unique_contexts`` and combined with the question into an Urdu
    prompt; the cleaned answer becomes the target. Rows whose ``Context_index`` has
    no entry in ``unique_contexts`` are skipped.

    Args:
        df: Frame with 'Question_urdu_clean', 'Answer_urdu_clean' and 'Context_index'.
        unique_contexts: Frame with 'Context_index' and 'Context_urdu_clean'.
        output_file: Destination path; missing parent directories are created.

    Returns:
        The number of examples written.
    """
    # Build the id -> context table once instead of filtering the frame per row.
    # drop_duplicates keeps the first row for each id, which is the row the
    # per-row filter + `.values[0]` used to pick.
    context_by_id = (
        unique_contexts
        .drop_duplicates(subset=['Context_index'])
        .set_index('Context_index')['Context_urdu_clean']
        .to_dict()
    )

    training_data = []

    for _, row in df.iterrows():
        question = row['Question_urdu_clean']
        answer = row['Answer_urdu_clean']
        context_id = row['Context_index']

        if context_id not in context_by_id:
            continue
        context = context_by_id[context_id]

        instruction = f"""آپ ایک پاکستانی آئینی قانون کے ماہر ہیں۔ دیے گئے آئینی سیاق و سباق کی بنیاد پر سوال کا جواب دیں۔

سیاق و سباق:
{context}

سوال:
{question}

جواب:"""

        training_data.append({
            "instruction": instruction,
            "output": answer,
            "input": ""
        })

    # Make the function usable even when the target folder has not been created yet
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False keeps the Urdu text readable in the file instead of \u escapes
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(example, ensure_ascii=False) + '\n' for example in training_data)

    return len(training_data)

# Write the three splits to CSV so that every file named in the summary below
# actually exists; test.csv is the only on-disk copy of the held-out split that
# this notebook produces.
train_df.to_csv('processed_data/train.csv', index=False)
val_df.to_csv('processed_data/val.csv', index=False)
test_df.to_csv('processed_data/test.csv', index=False)

# JSONL instruction data is generated for the train and validation splits only
train_count = create_finetuning_data(train_df, unique_contexts, 'processed_data/train.jsonl')
val_count = create_finetuning_data(val_df, unique_contexts, 'processed_data/val.jsonl')

print(f"   Created {train_count} training examples")
print(f"   Created {val_count} validation examples")

print("\n" + "="*60)
print("✓ PREPROCESSING COMPLETE!")
print("="*60)
print("\nGenerated files:")
print("  - processed_data/unique_contexts.csv")
print("  - processed_data/train.csv")
print("  - processed_data/val.csv")
print("  - processed_data/test.csv")
print("  - processed_data/train.jsonl")
print("  - processed_data/val.jsonl")

 Creating fine-tuning data (JSONL format)...
   Created 433 training examples
   Created 92 validation examples

✓ PREPROCESSING COMPLETE!

Generated files:
  - processed_data/unique_contexts.csv
  - processed_data/train.csv
  - processed_data/val.csv
  - processed_data/test.csv
  - processed_data/train.jsonl
  - processed_data/val.jsonl


# **EMBEDDINGS**

In [27]:
from sentence_transformers import SentenceTransformer
import faiss

2025-12-02 08:41:21.925479: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764664882.137056      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764664882.186660      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [28]:
# Read the de-duplicated contexts back and keep texts and ids as parallel lists
print(" Loading knowledge base...")
knowledge_base_csv = 'processed_data/unique_contexts.csv'
unique_contexts = pd.read_csv(knowledge_base_csv)
contexts_list, context_ids = (
    unique_contexts[kb_column].to_list()
    for kb_column in ('Context_urdu_clean', 'Context_index')
)
print(f" Total contexts: {len(unique_contexts)}")

 Loading knowledge base...
 Total contexts: 305


In [29]:
# Instantiate the sentence-embedding model used for both contexts and questions
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-mpnet-base-v2'
print("Loading embedding model...")
print(f"   Model: {EMBEDDING_MODEL_NAME}")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("   ✓ Model loaded!")

Loading embedding model...
   Model: paraphrase-multilingual-mpnet-base-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   ✓ Model loaded!


In [30]:
# Encode every context; normalize_embeddings=True is requested so that the inner
# product used by the index built later acts as cosine similarity.
print("Creating embeddings (this may take a few minutes)...")
context_embeddings = embedding_model.encode(
    contexts_list,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

print(f"   ✓ Embedding shape: {context_embeddings.shape}")

Creating embeddings (this may take a few minutes)...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

   ✓ Embedding shape: (305, 768)


In [31]:
import os

# Store the embedding matrix and the matching context ids side by side in models/
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)
for array_name, array_value in (
    ('context_embeddings', context_embeddings),
    ('context_ids', np.array(context_ids)),
):
    np.save(f'{models_dir}/{array_name}.npy', array_value)
print("   ✓ Embeddings saved!")

   ✓ Embeddings saved!


In [32]:
# Exact inner-product index over the context embeddings; with the normalised
# vectors requested at encoding time the score is the cosine similarity.
print("Building FAISS index...")
embedding_dim = context_embeddings.shape[-1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(context_embeddings)

print(f"   ✓ Total vectors in index: {index.ntotal}")

Building FAISS index...
   ✓ Total vectors in index: 305


In [33]:
# Write the FAISS index to disk next to the saved embeddings in models/
faiss.write_index(index, "models/constitutional_law_index.faiss")
print("   ✓ Index saved!")

   ✓ Index saved!


In [34]:
# Smoke-test retrieval with one question and print the three best matches.
# NOTE(review): this is the first dataset question, whose Context_index is 87; in
# the recorded run the top 3 were contexts 86, 80 and 121 — retrieval quality
# should be measured on a held-out split before relying on it.
print("Testing retrieval...")
test_question = "کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ جدول پر بحث یا رائے شماری کر سکتی ہے؟"
test_embedding = embedding_model.encode([test_question], normalize_embeddings=True)
scores, indices = index.search(test_embedding, 3)

print(f"\n   Test Question: {test_question}")
print(f"\n   Top 3 Retrieved Contexts:")
for rank, (hit_position, similarity) in enumerate(zip(indices[0], scores[0]), start=1):
    print(f"\n   Rank {rank}:")
    print(f"   - Context ID: {context_ids[hit_position]}")
    print(f"   - Similarity: {similarity:.4f}")
    print(f"   - Text: {contexts_list[hit_position][:150]}...")

print("\n✓ Embeddings and index creation complete!")

Testing retrieval...

   Test Question: کیا قومی اسمبلی منظور شدہ مصارف کی توثیق شدہ جدول پر بحث یا رائے شماری کر سکتی ہے؟

   Top 3 Retrieved Contexts:

   Rank 1:
   - Context ID: 86
   - Similarity: 0.7804
   - Text: سالانہ کیفیت نامہ میزانیہ کے اس حصہ پر جو وفاقی مجموعی فنڈ سے واجب الادا مصارف سے تعلق رکھتا ہو قومی اسمبلی میں بحث ہو سکے گی، لیکن اسے قومی اسمبلی کی...

   Rank 2:
   - Context ID: 80
   - Similarity: 0.7783
   - Text: کسی بھی ایوان میں زیر غور کوئی بل اس ایوان کی برخاستگی کی بناء پر ساقط نہیں ہوگا۔ سینیٹ میں زیر غور کوئی بل جسے قومی اسمبلی نے منظور نہ کیا ہو، قومی ا...

   Rank 3:
   - Context ID: 121
   - Similarity: 0.7616
   - Text: بل اجلاس کی برخاستگی وغیرہ (1) کسی صوبائی اسمبلی میں زیر غور کوئی بل، اسمبلی کی برخاستگی کی بنا پر ساقط نہیں ہوگا۔ (2) کسی صوبائی اسمبلی میں زیر غور ک...

✓ Embeddings and index creation complete!


# **Fine-tuning using Low-Rank Adaptation (LoRA)**

In [35]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig
)

In [36]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset



In [37]:
# Model configuration: Hugging Face repository id of the base model to fine-tune
model_name = "muhammadnoman76/Lughaat-1.0-8B-Instruct"
# Alternative: "CohereForAI/aya-23-8B" (multilingual; NOTE(review): confirm that Urdu is among its supported languages before switching)

In [38]:
# Report which base model was selected.
print(f" Model: {model_name}")
# NOTE(review): the two disabled lines below point at the meta-llama licence page,
# which is not the repository selected in model_name — update or delete them.
# print("   Note: You need to accept the license on HuggingFace first")
# print("   Visit: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct")

 Model: muhammadnoman76/Lughaat-1.0-8B-Instruct


In [39]:
# Log in to the Hugging Face Hub. login() is called without arguments, so no
# token is stored in the notebook source; it asks for one interactively, which
# means this cell needs manual input on every fresh run.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
# 4-bit quantisation settings passed to the model loader
print(" Setting up 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # NOTE(review): compute dtype is float16 here while the training arguments
    # later in this notebook set bf16=True — confirm the two are meant to differ.
    bnb_4bit_compute_dtype=torch.float16,
)

 Setting up 4-bit quantization...


In [41]:
# Load the base model with the 4-bit quantisation config; device_map="auto" lets
# the loader decide where the weights are placed.
print(" Loading base model (this may take a few minutes)...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    # NOTE(review): trust_remote_code=True executes Python shipped in the model
    # repository — keep it only if the repository actually requires it.
    trust_remote_code=True,
    # `dtype` replaces the deprecated `torch_dtype` keyword (the recorded run
    # printed "`torch_dtype` is deprecated! Use `dtype` instead!").
    dtype=torch.float16,
)

 Loading base model (this may take a few minutes)...


config.json:   0%|          | 0.00/989 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

In [42]:
# Tokenizer for the same checkpoint; padding goes on the right and reuses the
# end-of-sequence token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
print("   ✓ Model and tokenizer loaded!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

   ✓ Model and tokenizer loaded!


In [43]:
# Run PEFT's k-bit training preparation on the quantised model before the LoRA
# adapters are attached in the next step.
print("Preparing model for LoRA...")
model = prepare_model_for_kbit_training(model)

Preparing model for LoRA...


In [44]:
# Names of the modules that receive LoRA adapters
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Adapter hyper-parameters: rank 16, scaling alpha 32, 5% dropout, no bias terms
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters and report how many parameters are trainable.
# `model` is rebound here, so re-running this cell wraps an already wrapped model.
model = get_peft_model(model, lora_config)
print(" Trainable parameters:")
model.print_trainable_parameters()

 Trainable parameters:
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196


In [45]:
# Load the two JSONL files; split='train' is requested for both, including the
# validation file.
print(" Loading training data...")
train_dataset, val_dataset = (
    load_dataset('json', data_files=f'processed_data/{file_stem}.jsonl', split='train')
    for file_stem in ('train', 'val')
)

print(f"   Training samples: {len(train_dataset)}")
print(f"   Validation samples: {len(val_dataset)}")

 Loading training data...


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

   Training samples: 433
   Validation samples: 92


In [46]:
def format_instruction(example):
    """Render one training example as a single chat-formatted string.

    The result is ``<|begin_of_text|>`` followed by three turns — a fixed Urdu
    system prompt, the example's 'instruction' as the user turn and its 'output'
    as the assistant turn — each written as
    ``<|start_header_id|>ROLE<|end_header_id|>\\n\\nCONTENT<|eot_id|>``.

    Args:
        example: Mapping with the keys 'instruction' and 'output'.

    Returns:
        The formatted training text.
    """
    # NOTE(review): the text already starts with <|begin_of_text|>; if the
    # tokenizer also prepends a BOS token the sequence would carry two — confirm.
    system_prompt = "آپ ایک پاکستانی آئینی قانون کے ماہر ہیں۔ دیے گئے سیاق و سباق کی بنیاد پر درست اور جامع جواب دیں۔"
    turns = (
        ("system", system_prompt),
        ("user", example['instruction']),
        ("assistant", example['output']),
    )
    rendered_turns = "".join(
        f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        for role, content in turns
    )
    return "<|begin_of_text|>" + rendered_turns

# print(" Formatting datasets...")
# train_dataset = train_dataset.map(format_instruction)
# val_dataset = val_dataset.map(format_instruction)

In [47]:
# Training arguments
print(" Setting up training configuration...")
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./urdu_constitutional_llm",
    logging_dir="./logs",
    report_to="none",

    # Schedule: 3 epochs, one sample per device per step, gradients accumulated
    # over 16 steps (1 x 16 = 16 samples per optimizer update per device)
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,

    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    max_grad_norm=0.3,

    # Precision and memory
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,

    # Log every 10 steps; evaluate and checkpoint every 50 steps, keeping at most 3
    # checkpoints. Evaluation and saving share the same cadence, and the best
    # checkpoint is reloaded once training ends.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,

    # Keep dataset columns that the model's forward() does not consume
    remove_unused_columns=False,
)

 Setting up training configuration...


In [48]:
# Create trainer: supervised fine-tuning on the JSONL datasets, with each example
# turned into text by format_instruction.
print(" Creating trainer...")
trainer = SFTTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    formatting_func=format_instruction,
    # NOTE(review): `model` was already wrapped by get_peft_model earlier in this
    # notebook, so passing the same LoRA config again looks redundant — confirm
    # how the installed TRL version treats this combination.
    peft_config=lora_config,
)

 Creating trainer...




Applying formatting function to train dataset:   0%|          | 0/433 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/433 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/433 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/433 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

In [49]:
# Start training. trainer.train() is the last expression of the cell, so the
# notebook displays the value it returns.
print(" Starting fine-tuning...")
print("="*60)

trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


 Starting fine-tuning...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
50,0.6003,0.631786,0.663643,534988.0,0.85424


  return fn(*args, **kwargs)


TrainOutput(global_step=84, training_loss=0.6798869967460632, metrics={'train_runtime': 10195.4169, 'train_samples_per_second': 0.127, 'train_steps_per_second': 0.008, 'total_flos': 4.039934886140314e+16, 'train_loss': 0.6798869967460632, 'epoch': 3.0})

In [50]:
# Save the trained model and the tokenizer into the same folder
final_model_dir = "./final_urdu_constitutional_model"
print(" Saving fine-tuned model...")
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(final_model_dir)

banner = "=" * 60
print("\n" + banner)
print("✓ FINE-TUNING COMPLETE!")
print(banner)
print(f"\nModel saved to: {final_model_dir}")


10. Saving fine-tuned model...

✓ FINE-TUNING COMPLETE!

Model saved to: ./final_urdu_constitutional_model


In [53]:
from shutil import make_archive
# Zip the saved model folder; the archive path returned by make_archive is the cell output.
make_archive(
    base_name="final_urdu_constitutional_model",
    format='zip',
    root_dir="final_urdu_constitutional_model",
)

'/kaggle/working/final_urdu_constitutional_model.zip'

In [54]:
from shutil import make_archive
# Zip the trainer's output directory; the archive path returned by make_archive is the cell output.
make_archive(
    base_name="urdu_constitutional_llm",
    format='zip',
    root_dir="urdu_constitutional_llm",
)

'/kaggle/working/urdu_constitutional_llm.zip'