In [1]:
!pip install -q torch                                  # Pytorch
!pip install -q transformers datasets                  # Comes from HuggingFace
!pip install -q bitsandbytes                           # For quantization from HuggingFace
!pip install -q peft                                   # Parameter-efficient Fine-tuning from HuggingFace
!pip install -q trl                                    # For supervised fine-tuning for LLMs from HuggingFace
!pip install -q accelerate        

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv()
my_hf_key= os.getenv('HF_KEY')

login(token= my_hf_key)

In [3]:
import pandas as pd
dataset_B_complete = pd.read_csv("syn_data_3000.csv", encoding='utf-8')
print(dataset_B_complete.shape)

(3015, 2)


In [4]:
df_B = dataset_B_complete.sample(n=2000, random_state=42)  #we will select randomly 2000 examples. 

In [5]:
df_B.head()

Unnamed: 0,German,French
63,"Ich bin froh, dass ich einen guten Arbeitsplat...",Je suis heureux d'avoir trouvé un bon emploi a...
2683,Der Lehrer erklärt die Materie sehr gut und ma...,L'enseignant explique la matière très bien et ...
102,Meine Freunde und ich unterstützen uns gegense...,Mes amis et moi nous soutenons mutuellement da...
2691,"Wenn ich auf Reisen bin, probiere ich immer di...","Quand je suis en voyage, j'essaie toujours la ..."
416,"Am Ende zählt nicht nur der Erfolg, sondern au...","À la fin, ce ne sont pas seulement les succès ..."


In [6]:
df_B.shape

(2000, 2)

In [7]:
df_B.to_csv('df_B.csv', encoding='utf-8', index=False)

We will use all these examples (from dataset B) for training. But we will split 10% of them for validation.

In [8]:
from sklearn.model_selection import train_test_split

train_data, validation_data = train_test_split(df_B, test_size=0.1, random_state=42)
print(f"items in training and validation  are: {len(train_data)} and  {len(validation_data)}")



items in training and validation  are: 1800 and  200


In [9]:
from datasets import Dataset

# Convert the DataFrames into a Dataset
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)

In [10]:
train_dataset

Dataset({
    features: ['German', 'French', '__index_level_0__'],
    num_rows: 1800
})

In [11]:
def format_instruction(german_input, french_trans):
    return f"""### Instruction:
Translate to French. Do not add anything else.

### Input German Sentence:
{german_input.strip()}

### French Translation:
{french_trans.strip()}<|endoftext|>""" 
#EOS token for Qwen 

In [12]:
# Define a function that converts a dataset row into the corresponding prompt format
def convert_to_instruction_format(data_point):
    return {
        "text": format_instruction(data_point["German"], data_point["French"])
        }

In [13]:
# Apply the function to each row in the dataset
def process_dataset(data):
    return data.map(
        convert_to_instruction_format
        ).remove_columns(['German', 'French', '__index_level_0__']) #removing unnecessary columns

In [14]:
train_data = process_dataset(train_dataset)
validation_data = process_dataset(validation_dataset)

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [15]:
train_data

Dataset({
    features: ['text'],
    num_rows: 1800
})

#### Load the model and finetune

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct",device_map = "auto")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [17]:
# Set the padding token to be the same as the end-of-sequence token.

# Padding ensures all sequences in a batch are of equal length.
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be applied to the right side of the sequences.
# This is the standard behavior for causal language models. Causal models, predict the next token in a sequence using only the preceding tokens.
tokenizer.padding_side = "right"

In [18]:
# Import necessary PEFT objects for preparing the model for LoRA training
from peft import  LoraConfig, get_peft_model


model.enable_input_require_grads()


#model = prepare_model_for_kbit_training(model)  
#Use only if loading a quantized model-includes float conversions to help stabilize training

# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,                                                       # The rank (dimensions) of the LoRA matrices A and B
    lora_alpha=64,                                              # Scales the product of matrices AB [W_new = W_old + (A * B) * α]
    target_modules=["q_proj", "v_proj"],    # Apply LoRA to the attention matrices
    lora_dropout=0.1,                                           # Dropout rate to reduce overfitting
    bias="none",                                                # Do not train the bias parameter
    task_type="CAUSAL_LM"                                       # Task type for autoregressive text generation
)

# Get the model with unfrozen LoRA layers applied
model = get_peft_model(model, lora_config)

In [19]:
from trl import SFTConfig

# Set up the training hyperparameters
training_arguments = SFTConfig(
    fp16=True,                           # Use 16-bit precision for training computations (optimizer states, gradients)
    dataset_text_field="text",           # Specify the text field in the dataset for training
    max_seq_length=128,                 # Set the maximum sequence length for the training data

    # Batch-related parameters
    per_device_train_batch_size=4,       # Batch size per device during training

    # Optimizer-related parameters
    optim="paged_adamw_32bit",           # Use the paged AdamW optimizer, optimized for 32-bit GPUs
    learning_rate=1e-4,                  # Set the learning rate for training

    # Epochs and saving configuration
    num_train_epochs=4,                  # Number of training epochs (more epochs generally lead to better results)
    save_strategy="epoch",               # Save the model after each epoch
    output_dir="./epoch-finetuned",      # Directory to save the fine-tuned model

    # Validation-related parameters
    eval_strategy="steps",               # Evaluation strategy, performed at specified steps
    eval_steps=0.2,                      # Evaluate after 20% of the training steps

    # Logging-related parameters
    report_to="none",                    # Disable reporting to external tools
    logging_dir="./logs",                # Directory to save the training logs
    logging_steps=20,                    # Number of steps between each log entry
    seed=42,                             # Set a random seed for reproducibility
)

# Enable gradient checkpointing to save memory and recompute during backpropagation
model.gradient_checkpointing_enable()

# Disable attention cache during training; it should be enabled during inference
model.config.use_cache = False

In [20]:
# Import the SFTTrainer from HuggingFace TRL library
from trl import SFTTrainer

# Initialize the trainer
trainer = SFTTrainer(
    # Assign the model and tokenizer
    model=model,
    tokenizer=tokenizer,

    # Provide the training and validation datasets
    train_dataset= train_data,
    eval_dataset=validation_data,

    # Pass the LoRA configuration
    peft_config=lora_config,

    # Set the training hyperparameters
    args=training_arguments,
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/1800 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1800 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
# Start the training process 

trainer.train()

Step,Training Loss,Validation Loss
360,0.6,0.589053
720,0.5138,0.557075
1080,0.4846,0.537337
1440,0.4511,0.530814
1800,0.4716,0.527444


TrainOutput(global_step=1800, training_loss=0.5252584155400594, metrics={'train_runtime': 514.2178, 'train_samples_per_second': 14.002, 'train_steps_per_second': 3.5, 'total_flos': 3777190135418880.0, 'train_loss': 0.5252584155400594})

In [22]:
# Define the save path for the fine-tuned model on Colab
peft_model_path = "./fine-tuned-qwen2.5-1.5b-instruct-step7"

# Save the trained model
trainer.model.save_pretrained(peft_model_path)

# Save the tokenizer
tokenizer.save_pretrained(peft_model_path)

# List the saved files
!ls -lh {peft_model_path}

total 24M
-rwxr--r-- 1 prasanna99h prasanna99h 5.0K Mar 12 15:12 README.md
-rwxr--r-- 1 prasanna99h prasanna99h  722 Mar 12 15:12 adapter_config.json
-rwxr--r-- 1 prasanna99h prasanna99h 8.4M Mar 12 15:12 adapter_model.safetensors
-rwxr--r-- 1 prasanna99h prasanna99h  605 Mar 12 15:12 added_tokens.json
-rwxr--r-- 1 prasanna99h prasanna99h 1.6M Mar 12 15:12 merges.txt
-rwxr--r-- 1 prasanna99h prasanna99h  496 Mar 12 15:12 special_tokens_map.json
-rwxr--r-- 1 prasanna99h prasanna99h  11M Mar 12 15:12 tokenizer.json
-rwxr--r-- 1 prasanna99h prasanna99h 7.2K Mar 12 15:12 tokenizer_config.json
-rwxr--r-- 1 prasanna99h prasanna99h 2.7M Mar 12 15:12 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Evaluation

#### Before loading the tuned model, lets move back the original model to cpu. We can also delete the model

In [23]:
import torch

model.to("cpu")  # Move the existing model to CPU

torch.cuda.empty_cache()  # Clear GPU cache

In [24]:
# For loading a PEFT model, we need to use a special object for CausalLM from PEFT
# instead of the regular HuggingFace object.
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

# Load the fine-tuned model
peft_model_path = "./fine-tuned-qwen2.5-1.5b-instruct-step7"
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
      
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tuned_model.to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be added to the right side of the sequences
tokenizer.padding_side = "right"

# Enable attention cache during inference
tuned_model.config.use_cache = True

Now we load dataset A and generate the test set using same random seed as before.

In [25]:
test_A = pd.read_csv("deliverables/df_csv/test_A.csv", encoding='utf-8')
print(f'num of items: {len(test_A)}')

num of items: 200


In [26]:
def get_output_tuned_model(prompt):
  inputs = tokenizer (prompt, return_tensors="pt")
  inputs = inputs.to(tuned_model.device)

  output_tokens = tuned_model.generate(
    inputs.input_ids,
    max_new_tokens=40,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
)[0]
  output = tokenizer.decode(output_tokens, skip_special_tokens=True)
  return output

In [27]:
from tqdm import tqdm

translation_list=[]
for row in tqdm(range(len(test_A)),desc="Translating"):

  my_prompt = f"""### Input German Sentence:
{test_A.iloc[row, 0].strip()}

### French Translation:

"""
  translation = get_output_tuned_model(my_prompt)
  translation = translation.replace(my_prompt, "")
  translation_list.append(translation)
print(f'collected translations for {len(translation_list)} items.')

Translating: 100%|██████████| 200/200 [05:19<00:00,  1.60s/it]

collected translations for 200 items.





In [28]:
with open('translations_test_C_qwen.txt', 'w') as f:
    for item in translation_list:
        f.write(f"{item}\n")

In [29]:
import evaluate         ##metric for evaulation, HF library
def calculate_bleu_score(prediction, reference):


  #reference = [[reference.lower().split()]]
  #prediction = [prediction.lower().split()]
  prediction = [prediction]
  reference = [[reference]]

  bleu = evaluate.load("bleu")
  score = bleu.compute(predictions= prediction,references=reference)

  return score

In [30]:
from tqdm import tqdm

def get_evaluation_df(translation_list, actual_df):
  """
  translation_list is a list of translations (has just the target language sentences) from the llm to be evaluated
  actual_df : a dataframe containing the actual sentence and its translation
  """
  evaluation =[]
  evaluation_df = pd.DataFrame(columns=[
    'Actual', 
    'Reference', 
    'Prediction', 
    'BLEU Score', 
    'Precisions', 
    'Brevity Penalty', 
    'Length Ratio', 
    'Translation Length', 
    'Reference Length'
])
  for i in tqdm(range(len(translation_list))):
    actual = actual_df.iloc[i,0]
    reference = actual_df.iloc[i,1]
    prediction = translation_list[i]
    score = calculate_bleu_score(prediction, reference)

    evaluation_data = {
        'Actual': actual,
        'Reference': reference,
        'Prediction': prediction,
        'BLEU Score': score['bleu'],
        'Precisions': score['precisions'],
        'Brevity Penalty': score['brevity_penalty'],
        'Length Ratio': score['length_ratio'],
        'Translation Length': score['translation_length'],
        'Reference Length': score['reference_length']
    }
    evaluation.append(evaluation_data)
    
    # Concatenate all evaluation data into the DataFrame
    evaluation_df_concat = pd.concat([evaluation_df, pd.DataFrame(evaluation)], ignore_index=True)

  return evaluation_df_concat


In [31]:
result_df = get_evaluation_df(translation_list, test_A )

  evaluation_df_concat = pd.concat([evaluation_df, pd.DataFrame(evaluation)], ignore_index=True)
100%|██████████| 200/200 [01:17<00:00,  2.58it/s]


In [32]:
result_df.shape

(200, 9)

In [33]:
print(f'average bleu score is {result_df["BLEU Score"].mean()}')

average bleu score is 0.013786942341864698


In [34]:
result_df.to_csv('results_df_C_step8.csv', encoding='utf-8')