# Fine-tuning Llama-2 for English-to-Hindi Translation
* Base Model: meta-llama/Llama-2-7b-hf (https://huggingface.co/meta-llama/Llama-2-7b-hf)
* Dataset: https://huggingface.co/datasets/cfilt/iitb-english-hindi

## Setup

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kaggle-environments 1.14.15 requires transformers>=4.33.1, but you have transformers 4.31.0 which is incompatible.[0m[31m
[0m

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2024-08-03 17:45:29.504038: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-03 17:45:29.504147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-03 17:45:29.637199: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Set the W&B opt-out variable up front, before any trainer object exists.
os.environ["WANDB_DISABLED"] = "true"

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Load Dataset

In [4]:
# Only the first 150,000 rows of the train split are requested.
dataset = load_dataset(path='cfilt/iitb-english-hindi', split='train[:150000]')

Downloading readme:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [5]:
# Sanity check: row count first, then one raw example as the cell's displayed value.
print(dataset.num_rows)
dataset[1]

150000


{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Process Data for Model

In [18]:
# system_prompt = "Translate English to Hindi"

# def gen():
#   for i in range(len(dataset)):
#     yield {"text": f"<s>[INST] <<SYS>> {system_prompt} <</SYS>> {dataset[i]['translation']['en']} [/INST] {dataset[i]['translation']['hi']} </s>"}


# dataset = Dataset.from_generator(gen)

In [6]:
system_prompt = "Translate the following English text to Hindi accurately:"

def generate_prompt():
    """Yield one ``{"text": ...}`` training record per row of the global ``dataset``.

    Each row is expected to look like ``{"translation": {"en": ..., "hi": ...}}``.
    The record text is the system prompt followed by an ``### English:`` section
    and an ``### Hindi:`` section, wrapped in ``<s>[INST] ... [/INST]</s>``.

    NOTE(review): the Hindi target is placed *before* ``[/INST]``, so an inference
    prompt has to stop right after ``### Hindi:\\n`` for the model to continue
    with the translation.

    Yields:
        dict: a single-key mapping ``{"text": <formatted prompt string>}``.
    """
    # Iterate rows directly: each row is fetched once instead of indexing
    # `dataset[i]` twice per example.
    for row in dataset:
        pair = row['translation']
        yield {
            "text": (
                f"<s>[INST] <<SYS>> {system_prompt} <</SYS>>\n"
                f"### English:\n{pair['en']}\n"
                f"### Hindi:\n{pair['hi']} [/INST]</s>"
            )
        }

# Build a new Dataset from the formatted records; this rebinds `dataset` to the
# text-only version that the following cells work with.
dataset = Dataset.from_generator(generator=generate_prompt)

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Convert the formatted dataset to a pandas DataFrame for the pandas-based steps below.
df = dataset.to_pandas()

In [8]:
# Remove repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [11]:
# Fixed seed so the 90/10 train/validation split is the same on every run.
SPLIT_SEED = 42
train_dataset, validation_dataset = train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)

# preserve_index=False: after de-duplication and shuffling the pandas index is no longer
# a plain range, and from_pandas would otherwise carry it along as an extra
# `__index_level_0__` column next to "text".
train_dataset = Dataset.from_pandas(train_dataset, preserve_index=False)
val_dataset = Dataset.from_pandas(validation_dataset, preserve_index=False)

# Optionally, you can wrap them in a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})


In [12]:
# Rebind `dataset` to the train/validation DatasetDict; the trainer cell indexes it
# by split name ('train' / 'validation').
dataset = dataset_dict

## Load Configurations

In [13]:
# bitsandbytes parameters

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# bitsandbytes configuration
# The dtype is given as a string above and resolved to the torch dtype object here.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# The capability query is only made when CUDA is actually available: without a GPU
# torch.cuda.get_device_capability() raises, which would abort this purely
# informational hint.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


In [14]:
# LoRA adapter settings for causal-LM fine-tuning: rank 64, alpha 16, dropout 0.1.
# target_modules is left unset (see the commented-out list below).
# NOTE(review): presumably PEFT then falls back to its per-architecture default — confirm.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    #target_modules= ["q_proj","up_proj","o_proj","k_proj","down_proj","gate_proj","v_proj"]
)


## Load Model and Tokenizer

In [15]:
model_name = "meta-llama/Llama-2-7b-hf"

# Fine-tuned model name
new_model = "finetuned-llama2-7b-en-hi-pt"

# The checkpoint download is authenticated. Read the token from the HF_TOKEN environment
# variable instead of pasting it into the notebook, where it would be saved and shared
# together with the file.
# NOTE(review): when HF_TOKEN is unset this is None; presumably the hub client then falls
# back to a cached `huggingface-cli login` token — confirm.
hf_auth_token = os.environ.get("HF_TOKEN")

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth_token,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_auth_token=hf_auth_token,
)
# No separate padding token is configured here: EOS is reused for padding, applied on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [35]:
# NOTE(review): no other cell visible in this notebook reads this flag (it is not passed
# to TrainingArguments), so as written it has no effect — confirm whether it should be
# wired into the training arguments.
gradient_checkpointing = True

## Train and Save Model

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    # NOTE(review): no evaluation strategy is set, so eval_steps is presumably never
    # used during training; evaluation only happens via an explicit trainer.evaluate().
    eval_steps=100,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): with lr_scheduler_type="constant" the warmup ratio is presumably
    # ignored ("constant_with_warmup" would honour it) — confirm the intent.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    # Switch off logging integrations through the supported argument; the
    # WANDB_DISABLED environment variable is deprecated and triggers a warning.
    # This disables every reporting integration, not only W&B.
    report_to="none",
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/29654 [00:00<?, ? examples/s]

Map:   0%|          | 0/3295 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,0.9955
200,0.6934
300,0.6634
400,0.639
500,0.6073
600,0.6256
700,0.5895
800,0.6341
900,0.6056
1000,0.5912




In [None]:
# Evaluate on the eval_dataset handed to the trainer (the 'validation' split); the
# returned value is displayed as the cell output.
trainer.evaluate()

## Generate Translation

In [None]:
import re
def translate_to_hindi(query, max_length= 128):
    """Translate an English string to Hindi with the fine-tuned model.

    The prompt reproduces the text layout used for the training records
    (system prompt, ``### English:`` section, ``### Hindi:`` header) and stops
    right after the Hindi header, so the model's continuation is the translation.

    Args:
        query: English text to translate.
        max_length: forwarded to the text-generation pipeline as ``max_length``.

    Returns:
        str: the generated text up to the first ``[/INST]`` marker, stripped of
        surrounding whitespace. (Previously a raw list of string fragments.)
    """
    system_prompt = "Translate the following English text to Hindi accurately:"
    prompt = (
        f"<s>[INST] <<SYS>> {system_prompt} <</SYS>>\n"
        f"### English:\n{query}\n"
        "### Hindi:\n"
    )
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
    # return_full_text=False: only the continuation is returned, not the echoed prompt.
    generated = pipe(prompt, return_full_text=False)[0]['generated_text']
    # Training targets are followed by " [/INST]"; anything from that marker on is not
    # part of the translation.
    return generated.split('[/INST]')[0].strip()

In [None]:
# Smoke test: run one short English sentence through the translation helper.
translate_to_hindi('Hi, How are you?')