In [1]:
!pip install -q torch
!pip install -q transformers datasets
!pip install -q bitsandbytes                           # For quantization from HuggingFace(HF)
!pip install -q peft                                   # Parameter-efficient Fine-tuning from HF
!pip install -q trl                                    # For supervised fine-tuning for LLMs from HF
!pip install -q accelerate                             # For distributed training from HF

In [2]:
!nvidia-smi

Wed Mar 12 14:38:36 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:1E.0 Off |                    0 |
| N/A   27C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

from google.colab import userdata
my_api_key = userdata.get('HF_KEY')




Note: Since I exhausted colab GPU credits, I used lightning Ai that also provides 15 GPU units per month for students and researchers.
https://lightning.ai/

In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv()
my_hf_key= os.getenv('HF_KEY')

login(token= my_hf_key)

  from .autonotebook import tqdm as notebook_tqdm


#### Load the train and test dataset A

In [4]:
import pandas as pd
train_A = pd.read_csv("deliverables/df_csv/train_A.csv", encoding='utf-8')
test_A = pd.read_csv("deliverables/df_csv/test_A.csv", encoding='utf-8')
print(f"items in training and test test are: {len(train_A)} and  {len(test_A)}")

items in training and test test are: 800 and  200


In [5]:
train_A.head()

Unnamed: 0,German,French
0,Der schärfste Geist in der Stadt!,Tu sais que je suis le meilleur fantôme.
1,Die Kraeturen sind über neun Fuss gross.,"Ces lézards nouveaux- nés mesurent plus de 2, ..."
2,"Magere kämpfen, bis sie Hamburger sind.",Les maigres sont coriaces.
3,"Hey, wie funktioniert 'n so was hier?",Comment ça marche?
4,"Scheiße, jetzt geht er zu weit.",ll est allé trop loin.


#### Load the pretrained LLM model

The selected model Qwen/Qwen2.5-1.5B-Instruct is a general purpose LLM with 1.5B parameters

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map = "auto")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


The model takes around 6.1 GB of memory!!

In [7]:
#Testing the model using piepeline

# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model=model, tokenizer= tokenizer)
pipe(messages)

Device set to use cuda:0


[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': "I am Qwen, an AI language model developed by Alibaba Cloud. I'm here to help you"}]}]

In [8]:
#Test 2

from tqdm import tqdm

translations=[]
for i in tqdm(range(4)):

  my_prompt = f"""
  Translate the following sentence to French. Provide only the translation and nothing else.
  Sentence:{test_A.iloc[i,0].strip()}
  Translation:
  
  """
  messages = [
      {"role": "user", "content": my_prompt},
  ]
  outputs = pipe(
    messages,
    max_new_tokens=512,
)
  translations.append(outputs[0]["generated_text"][-1]['content'])
translations

100%|██████████| 4/4 [00:01<00:00,  2.12it/s]


['Translation: Nous avons également cherché à votre intention.',
 "Je veux juste être sûr que c'est bien toi.",
 "Translation: Il n'y a aucun Hitler dans lui.",
 "Aujourd'hui, amis!"]

The test results seem good. 

But lets try to use it without using chat template

In [9]:
# Set the padding token to be the same as the end-of-sequence token.

# Padding ensures all sequences in a batch are of equal length.
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be applied to the right side of the sequences.
# This is the standard behavior for causal language models. Causal models, predict the next token in a sequence using only the preceding tokens.
tokenizer.padding_side = "right"

In [10]:
def get_output(prompt):
  inputs = tokenizer (prompt, return_tensors="pt")
  inputs = inputs.to(model.device)

  output_tokens = model.generate(
    inputs.input_ids,
    max_new_tokens=40,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
)[0]
  output = tokenizer.decode(output_tokens, skip_special_tokens=True)
  return output

In [11]:
#testing our function

my_prompt = f"""Translate the following sentence to French. Provide only the translation and nothing else.
  Sentence:{test_A.iloc[4,0].strip()}.
  Translation: """
print(my_prompt)
x = get_output(my_prompt)
x = x.replace(my_prompt, "").strip()
x

Translate the following sentence to French. Provide only the translation and nothing else.
  Sentence:Sie nehmen keine Rücksicht auf Kinder..
  Translation: 


"They do not take any consideration of children.. \n\nTranslation in English:\nThey don't give a damn about kids.\n\n  I'm sorry, but your request is unclear. Could you please provide more information"

The model seems to generate some irrelevant transaltions to english when prompted using inputs and outputs format unlike the chat-template format.   
The chat template format might be applying some kind of post processing to stop the model at the right time.

#### Let's evaluate the model on test dataset- test_A using a metric called bleu score

In [12]:
from tqdm import tqdm
import warnings


translation_list=[]
for row in tqdm(range(len(test_A)),desc="Translating"):

  my_prompt = f"""
  Translate the following sentence to French, provide only the translation and nothing else.
  Sentence: {test_A.iloc[row,0].strip()}
  Translation:

  """

  translation = get_output(my_prompt)
  translation = translation.replace(my_prompt, "")
  translation_list.append(translation)
print(f'collected translations for {len(translation_list)} items.')

Translating: 100%|██████████| 200/200 [04:55<00:00,  1.48s/it]

collected translations for 200 items.





In [13]:
translation_list[0:2]

[' "Et nous avons cherché aussi à votre service." \n\nThis is a direct translation of the German phrase into French. The meaning stays true as it\'s asking for someone\'s help or assistance.\n\nThe',
 ' Rabbi, je veux seulement être sûr que c\'est bien toi. \n\n  Explanation:\n  - "Rabbi" is translated as "Rabbi".\n  - "Ich will" translates']

In [14]:
with open('translations_test_A_qwen.txt', 'w') as f:
    for item in translation_list:
        f.write(f"{item}\n")

In [15]:
import evaluate         ##metric for evaulation, HF library


def calculate_bleu_score(prediction, reference):

  #reference = [[reference.lower().split()]]
  #prediction = [prediction.lower().split()]
  prediction = [prediction]
  reference = [[reference]]
  #print(f'reference:{reference}, prediction:{prediction}')
  bleu = evaluate.load("bleu")
  score = bleu.compute(predictions= prediction,references=reference)

  return score


In [16]:
##test of our function


reference = test_A.iloc[0,1]
prediction = translation_list[0]

score = calculate_bleu_score(prediction, reference)
print(f"score: {score}")




score: {'bleu': 0.0, 'precisions': [0.02702702702702703, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 4.111111111111111, 'translation_length': 37, 'reference_length': 9}


In [17]:
from tqdm import tqdm

def get_evaluation_df(translation_list, actual_df):
  """
  function returns a evaluation dataframe that has parametrs of bleu score, the actual sentence(German), reference sentence(French)
  and predicted sentence.
  translation_list is a list of translations (has just the target language sentences) from the llm to be evaluated
  actual_df : a dataframe containing the actual sentence and its translation
  """
  evaluation =[]
  evaluation_df = pd.DataFrame(columns=[
    'Actual', 
    'Reference', 
    'Prediction', 
    'BLEU Score', 
    'Precisions', 
    'Brevity Penalty', 
    'Length Ratio', 
    'Translation Length', 
    'Reference Length'
])
  for i in tqdm(range(len(translation_list))):
    actual = actual_df.iloc[i,0]
    reference = actual_df.iloc[i,1]
    prediction = translation_list[i]
    score = calculate_bleu_score(prediction, reference)

    evaluation_data = {
        'Actual': actual,
        'Reference': reference,
        'Prediction': prediction,
        'BLEU Score': score['bleu'],
        'Precisions': score['precisions'],
        'Brevity Penalty': score['brevity_penalty'],
        'Length Ratio': score['length_ratio'],
        'Translation Length': score['translation_length'],
        'Reference Length': score['reference_length']
    }
    evaluation.append(evaluation_data)
    
    # Concatenate all evaluation data into the DataFrame
    evaluation_df_concat = pd.concat([evaluation_df, pd.DataFrame(evaluation)], ignore_index=True)

  return evaluation_df_concat


In [18]:
result_df = get_evaluation_df(translation_list, test_A )

  evaluation_df_concat = pd.concat([evaluation_df, pd.DataFrame(evaluation)], ignore_index=True)
100%|██████████| 200/200 [01:53<00:00,  1.77it/s]


In [19]:
result_df.shape

(200, 9)

In [20]:
print(f'average bleu score is {result_df["BLEU Score"].mean()}')

average bleu score is 0.01386372551217847


In [21]:
result_df.to_csv('result_df_step3.csv', encoding='utf-8')

### Fine-Tuning

Now, let's finetune the model.
First, configure the training hyperparameters and set up the training loop.

Huggingface TRL (Transformers Reinforcement Learning) library, which simplifies the process of supervised fine-tuning by providing pre-built functionality.

This allows us to focus primarily on specifying our desired hyperparameters, while the library takes care of the underlying training process and setup.

For finetuing, we follow LoRA (Low-Rank Adaptation) fine-tuning.
LoRA is an efficient fine-tuning technique for large language models. It works by freezing the original model weights and introducing small, trainable matrices into each Transformer layer. These low-rank matrices allow us to modify the model’s behavior during training without significantly increasing the number of trainable parameters, making the process computationally efficient. This approach is particularly useful for adapting large models to specific tasks while keeping resource usage manageable.

Note: we won't be doing quantization (qLoRA), as the model is not that big.

In [22]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

For fine-tuning, it's sufficient to add LoRA layers only to the self-attention layers, specifically the components: ["q_proj", "k_proj", "v_proj", "o_proj"]. These are the key components responsible for the attention mechanism.

q: query, k: key, v:value, o: output

Here we will be applying the finetuning only to "q_proj" query projections and "v_proj" value projections

According to (https://www.datacamp.com/blog/attention-mechanism-in-llms-intuition)
query_vector: "This is a vector representing the current focus or question the model has about a specific word in the sequence. It's like a flashlight the model shines on a particular word to understand its meaning in context."

value_vector: "This vector holds the actual information associated with each word. Once the model identifies relevant words through the key comparisons, it retrieves the corresponding value vectors to get the actual details needed for understanding."

key: Represents the tokens or positions being attended to.

In [23]:
# Import necessary PEFT objects for preparing the model for LoRA training
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model.enable_input_require_grads()


#model = prepare_model_for_kbit_training(model)  
#Use only if loading a quantized model-includes float conversions to help stabilize training

# Set up LoRA configuration
lora_config = LoraConfig(
    r=8,                                                       # The rank (dimensions) of the LoRA matrices A and B
    lora_alpha=32,                                              # Scales the product of matrices AB [W_new = W_old + (A * B) * α]
    target_modules=["q_proj","v_proj"],                         # Apply LoRA to the attention matrices
    lora_dropout=0.1,                                           # Dropout rate to reduce overfitting
    bias="none",                                                # Do not train the bias parameter
    task_type="CAUSAL_LM"                                       # Task type for autoregressive text generation
)

# Get the model with unfrozen LoRA layers applied
model = get_peft_model(model, lora_config)

In [25]:
from trl import SFTConfig

# Set up the training hyperparameters
training_arguments = SFTConfig(
    fp16=True,                           # Use 16-bit precision for training computations (optimizer states, gradients)
    dataset_text_field="text",           # Specify the text field in the dataset for training
    max_seq_length=128,                   # Set the maximum sequence length for the training data

    # Batch-related parameters
    per_device_train_batch_size=4,       # Batch size per device during training

    # Optimizer-related parameters
    optim="paged_adamw_32bit",           # Use the paged AdamW optimizer, optimized for 32-bit GPUs, paged_adamw_32bit
    learning_rate=1e-4,                  # Set the learning rate for training

    # Epochs and saving configuration
    num_train_epochs=4,                  # Number of training epochs (more epochs generally lead to better results)
    save_strategy="epoch",               # Save the model after each epoch
    output_dir="./epoch-finetuned",      # Directory to save the fine-tuned model

    # Validation-related parameters
    eval_strategy="steps",               # Evaluation strategy, performed at specified steps
    eval_steps=0.2,                      # Evaluate after 20% of the training steps

    # Logging-related parameters
    report_to="none",                    # Disable reporting to external tools
    logging_dir="./logs",                # Directory to save the training logs
    logging_steps=20,                    # Number of steps between each log entry
    seed=42,                             # Set a random seed for reproducibility
)

# Enable gradient checkpointing to save memory and recompute during backpropagation
model.gradient_checkpointing_enable()

# Disable attention cache during training; it should be enabled during inference
model.config.use_cache = False

Before, finetuning, we need to convert our dataset(dataframe format) to hugging face Dataset format.
Also, we will use 10% of training data for validation. 

In [26]:
from sklearn.model_selection import train_test_split

train_data, validation_data = train_test_split(train_A, test_size=0.1, random_state=42)
print(f"items in training and validation  are: {len(train_data)} and  {len(validation_data)}")

items in training and validation  are: 720 and  80


In [27]:
from datasets import Dataset

# Convert the DataFrames into a Dataset
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)

In [28]:
train_dataset

Dataset({
    features: ['German', 'French', '__index_level_0__'],
    num_rows: 720
})

We will write some functions to convert our dataset into instruction format so that it makes sense to the LLM during training

In [29]:
def format_instruction(german_input, french_trans):
    return f"""### Instruction:Translate to French.
### Input German Sentence:
{german_input.strip()}

### French Translation:
{french_trans.strip()}<|endoftext|>"""

In [30]:
# Define a function that converts a dataset row into the corresponding prompt format
def convert_to_instruction_format(data_point):
    return {
        "text": format_instruction(data_point["German"], data_point["French"])
        }

In [31]:
# Apply the function to each row in the dataset
def process_dataset(data):
    return data.map(
        convert_to_instruction_format
        ).remove_columns(['German', 'French', '__index_level_0__']) #removing unnecessary columns

In [32]:
train_data = process_dataset(train_dataset)
validation_data = process_dataset(validation_dataset)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [33]:
train_data

Dataset({
    features: ['text'],
    num_rows: 720
})

In [34]:
# Import the SFTTrainer from HuggingFace TRL library
from trl import SFTTrainer

# Initialize the trainer
trainer = SFTTrainer(
    # Assign the model and tokenizer
    model=model,
    tokenizer=tokenizer,

    # Provide the training and validation datasets
    train_dataset= train_data,
    eval_dataset=validation_data,

    # Pass the LoRA configuration
    peft_config=lora_config,

    # Set the training hyperparameters
    args=training_arguments,
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/720 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/80 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/80 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [35]:
# Start the training process 

trainer.train()

Step,Training Loss,Validation Loss
144,1.686,1.762383
288,1.5925,1.732659
432,1.6419,1.726233
576,1.5757,1.725743
720,1.6843,1.724211


TrainOutput(global_step=720, training_loss=1.6857305341296727, metrics={'train_runtime': 186.6587, 'train_samples_per_second': 15.429, 'train_steps_per_second': 3.857, 'total_flos': 1281445605187584.0, 'train_loss': 1.6857305341296727})

In [36]:
# Define the save path for the fine-tuned model on Colab
peft_model_path = "./fine-tuned-qwen2.5-1.5b-instruct-step4"

# Save the trained model
trainer.model.save_pretrained(peft_model_path)

# Save the tokenizer
tokenizer.save_pretrained(peft_model_path)

# List the saved files
!ls -lh {peft_model_path}

total 20M
-rwxr--r-- 1 prasanna99h prasanna99h 5.0K Mar 12 14:49 README.md
-rwxr--r-- 1 prasanna99h prasanna99h  721 Mar 12 14:49 adapter_config.json
-rwxr--r-- 1 prasanna99h prasanna99h 4.2M Mar 12 14:49 adapter_model.safetensors
-rwxr--r-- 1 prasanna99h prasanna99h  605 Mar 12 14:49 added_tokens.json
-rwxr--r-- 1 prasanna99h prasanna99h 1.6M Mar 12 14:49 merges.txt
-rwxr--r-- 1 prasanna99h prasanna99h  496 Mar 12 14:49 special_tokens_map.json
-rwxr--r-- 1 prasanna99h prasanna99h  11M Mar 12 14:49 tokenizer.json
-rwxr--r-- 1 prasanna99h prasanna99h 7.2K Mar 12 14:49 tokenizer_config.json
-rwxr--r-- 1 prasanna99h prasanna99h 2.7M Mar 12 14:49 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Evaluation
#### Before loading the tuned model, lets move back the original model to cpu. We can also delete the model

In [37]:
import torch

model.to("cpu")  # Move the existing model to CPU

torch.cuda.empty_cache()  # Clear GPU cache

In [38]:
# For loading a PEFT model, we need to use a special object for CausalLM from PEFT
# instead of the regular HuggingFace object.
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

# Load the fine-tuned model
peft_model_path = "./fine-tuned-qwen2.5-1.5b-instruct-step4"
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,  
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tuned_model.to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be added to the right side of the sequences
tokenizer.padding_side = "right"

# Enable attention cache during inference
tuned_model.config.use_cache = True

In [39]:
print(f"Tuned Model is on device: {next(tuned_model.parameters()).device}")

Tuned Model is on device: cuda:0


In [40]:
# Test the model
prompt = f"Translate the following sentence to French, provide only the translation and nothing else.Sentence:{test_A.iloc[2,0].strip()}"
inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to("cuda")

# Generate response

output_tokens = tuned_model.generate(inputs["input_ids"],
                               max_new_tokens=30,
                               pad_token_id=tuned_model.config.eos_token_id)[0]
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
res = output.replace(prompt,"")

# print output
import textwrap
print('TRAINED MODEL GENERATED TEXT :')
print(textwrap.fill(res, width=80))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TRAINED MODEL GENERATED TEXT :
  French: Il n'a rien d'Hitler dans son odeur.


In [41]:
!nvidia-smi

Wed Mar 12 14:50:08 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:1E.0 Off |                    0 |
| N/A   40C    P0              33W /  70W |   6185MiB / 15360MiB |     14%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
def get_output_tuned_model(prompt):
  inputs = tokenizer (prompt, return_tensors="pt")
  inputs = inputs.to(tuned_model.device)

  output_tokens = tuned_model.generate(
    inputs.input_ids,
    max_new_tokens=40,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
)[0]
  output = tokenizer.decode(output_tokens, skip_special_tokens=True)
  return output

In [43]:
from tqdm import tqdm

translation_list=[]
for row in tqdm(range(len(test_A)),desc="Translating"):

  my_prompt = my_prompt = f"""German Sentence:
{test_A.iloc[row, 0].strip()}.
French Translation:
"""

  translation = get_output_tuned_model(my_prompt)
  translation = translation.replace(my_prompt, "")
  translation_list.append(translation)
print(f'collected translations for {len(translation_list)} items.')

Translating: 100%|██████████| 200/200 [05:21<00:00,  1.61s/it]

collected translations for 200 items.





In [44]:
with open('translations_test_B_qwen.txt', 'w') as f:
    for item in translation_list:
        f.write(f"{item}\n")

In [45]:
result_df = get_evaluation_df(translation_list, test_A )

  evaluation_df_concat = pd.concat([evaluation_df, pd.DataFrame(evaluation)], ignore_index=True)
100%|██████████| 200/200 [01:18<00:00,  2.55it/s]


In [46]:
result_df.shape

(200, 9)

In [47]:
print(f'average bleu score is {result_df["BLEU Score"].mean()}')

average bleu score is 0.008322255640905463


the average bleu score of 0.0083 is  lower than the previous score of 0.0138

In [48]:
result_df.to_csv('results_df_B_step5.csv', encoding='utf-8')