In [1]:
# Importing the required libraries
import os
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from sentence_transformers.quantization import quantize_embeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np





In [2]:
# Pull variables from a .env file into the process environment, then read
# the Hugging Face token from it (the value is None when HF_TOKEN is unset)
load_dotenv()
HF_Token = os.environ.get("HF_TOKEN")

In [3]:
# Load the app output CSV and turn it into a pandas DataFrame
# Location of the CSV file to evaluate
dataset_path = "../8_Testing_Input_and_Output/App_Output_1a.csv"
print("Dataset Path:", dataset_path)

if os.path.isfile(dataset_path):
    # Read the CSV through the `datasets` CSV loader
    testing_output_1a = load_dataset('csv', data_files=dataset_path)
else:
    # Stop early with a clear message when the CSV is missing
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The rows are read from the 'train' split; hand them over to pandas
df_1a_testing_output = testing_output_1a['train'].to_pandas()

# Preview the first rows as a sanity check
print(df_1a_testing_output.head())

Dataset Path: ../8_Testing_Input_and_Output/App_Output_1a.csv


Generating train split: 0 examples [00:00, ? examples/s]

            DB_ID                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  \
0  Show the stadium name and capacity with most n...   
1  Find the first name and age of students who ha...   
2  Which countries in europe have at least 3 car ...   
3  Among the cars with more than lowest horsepowe...   
4  Which are the car makers which produce at leas...   

                                              Output  \
0  The query aims to find the name of the stadium...   
1  The query aims to find the first name and age ...   
2  The query aims to identify the countries that ...   
3  The

**Version 1**

In [4]:
# Load the pre-trained Jina v3 model used to generate sentence embeddings
# Model card: https://huggingface.co/jinaai/jina-embeddings-v3
# NOTE(review): trust_remote_code=True runs modeling code fetched from the Hub and
# no revision is pinned here, so that code can change between runs - consider pinning.
embedding_model_1 = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v3',
    trust_remote_code=True,
)

# Function to compute embeddings and similarity
def Translation_assessment_1(df_1a_testing_output):
    """Score every row's Question against its Translation using `embedding_model_1`.

    The 'Question' and 'Translation' columns are cleaned in place (missing values
    become empty strings, everything is cast to str), both columns are embedded
    with the "text-matching" task, and the cosine similarity of each row's pair
    is written to a new 'Similarity_1' column.

    Args:
        df_1a_testing_output: DataFrame holding 'Question' and 'Translation' columns.

    Returns:
        The same DataFrame object that was passed in (modified in place), now
        carrying the 'Similarity_1' column.

    Raises:
        KeyError: if the 'Question' or 'Translation' column is missing.
    """
    df_1a_testing_output['Question'] = df_1a_testing_output['Question'].fillna('').astype(str)
    df_1a_testing_output['Translation'] = df_1a_testing_output['Translation'].fillna('').astype(str)

    # Nothing to embed: add an empty score column instead of calling the model
    if len(df_1a_testing_output) == 0:
        df_1a_testing_output['Similarity_1'] = np.empty(0, dtype=np.float32)
        return df_1a_testing_output

    # Ask for NumPy output so the maths below does not depend on which device
    # the model's tensors live on
    question_embeddings = np.asarray(
        embedding_model_1.encode(
            df_1a_testing_output['Question'].tolist(),
            task="text-matching",
            convert_to_numpy=True,
        )
    )
    translation_embeddings = np.asarray(
        embedding_model_1.encode(
            df_1a_testing_output['Translation'].tolist(),
            task="text-matching",
            convert_to_numpy=True,
        )
    )

    # Row-wise cosine similarity: only pair (i, i) is needed, so the full
    # N x N similarity matrix is never built
    dot_products = (question_embeddings * translation_embeddings).sum(axis=1)
    norm_products = (
        np.linalg.norm(question_embeddings, axis=1)
        * np.linalg.norm(translation_embeddings, axis=1)
    )
    norm_products[norm_products == 0] = 1  # a zero vector scores 0 instead of dividing by zero

    df_1a_testing_output['Similarity_1'] = dot_products / norm_products

    return df_1a_testing_output

# Run the Version 1 scoring; the function returns the frame it was given,
# so the result refers to the same object as df_1a_testing_output
df_translation_assessment_1 = Translation_assessment_1(
    df_1a_testing_output=df_1a_testing_output
)

configuration_xlm_roberta.py:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_lora.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

mha.py:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

rotary.py:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


xlm_padding.py:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- xlm_padding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_xlm_roberta.py:   0%|          | 0.00/50.0k [00:00<?, ?B/s]

mlp.py:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


embedding.py:   0%|          | 0.00/3.88k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


block.py:   0%|          | 0.00/17.8k [00:00<?, ?B/s]

stochastic_depth.py:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- block.py
- stochastic_depth.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- modeling_xlm_roberta.py
- mlp.py
- embedding.py
- block.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggin

**Version 2**

In [6]:
# 1. Embedding size to keep (passed to the model as its truncation dimension)
dimensions = 512

# 2. Load the model with that truncation applied
# Model card: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
embedding_model_2 = SentenceTransformer(
    "mixedbread-ai/mxbai-embed-large-v1",
    truncate_dim=dimensions,
)

# Function to generate a detailed instruction for the query
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build a two-line prompt: an 'Instruct:' line followed by a 'Query:' line.

    Args:
        task_description: Text placed after 'Instruct: ' on the first line.
        query: Text placed after 'Query: ' on the second line.

    Returns:
        The assembled prompt string.
    """
    prompt = f'Instruct: {task_description}\nQuery: {query}'
    return prompt

# Function to compute embeddings and similarity
def Translation_assessment_2(df_1a_testing_output):
    """Score every row's Question against its Translation using `embedding_model_2`.

    Each question is wrapped with a task instruction via `get_detailed_instruct`,
    questions and translations are embedded in a single `encode` call, and the
    cosine similarity of each row's pair is written to a new 'Similarity_V2' column.
    Missing values are treated as empty strings for embedding purposes; the
    'Question' and 'Translation' columns themselves are left untouched.

    NOTE(review): only the question side receives the instruction prefix, so the
    two sides are embedded asymmetrically - confirm this is intended for a
    similarity comparison.

    Args:
        df_1a_testing_output: DataFrame holding 'Question' and 'Translation' columns.

    Returns:
        The same DataFrame object that was passed in (modified in place), now
        carrying the 'Similarity_V2' column.

    Raises:
        KeyError: if the 'Question' or 'Translation' column is missing.
    """
    # Define task instruction for the queries
    task = 'Compare the question and translation to assess the quality of the translation.'

    # Clean local copies so this function does not rely on another cell having
    # already replaced missing values
    questions = df_1a_testing_output['Question'].fillna('').astype(str).tolist()
    translations = df_1a_testing_output['Translation'].fillna('').astype(str).tolist()

    # Nothing to embed: add an empty score column instead of calling the model
    if not questions:
        df_1a_testing_output['Similarity_V2'] = np.empty(0, dtype=np.float32)
        return df_1a_testing_output

    # Add instruction to every question
    questions_with_instructions = [
        get_detailed_instruct(task, question) for question in questions
    ]

    # Encode questions and translations together, then split the result back apart
    docs = questions_with_instructions + translations
    embeddings = np.asarray(embedding_model_2.encode(docs))

    question_count = len(questions_with_instructions)
    question_embeddings = embeddings[:question_count]
    translation_embeddings = embeddings[question_count:]

    # Row-wise cosine similarity: only pair (i, i) is needed, so the full
    # N x N similarity matrix is never built
    dot_products = (question_embeddings * translation_embeddings).sum(axis=1)
    norm_products = (
        np.linalg.norm(question_embeddings, axis=1)
        * np.linalg.norm(translation_embeddings, axis=1)
    )
    norm_products[norm_products == 0] = 1  # a zero vector scores 0 instead of dividing by zero

    df_1a_testing_output['Similarity_V2'] = dot_products / norm_products

    return df_1a_testing_output

# Run the Version 2 scoring; the function returns the frame it was given,
# so the result refers to the same object as df_1a_testing_output
df_translation_assessment_2 = Translation_assessment_2(
    df_1a_testing_output=df_1a_testing_output
)


**Version 3**

In [7]:
# Load the pre-trained GTE-large model used to generate sentence embeddings
# NOTE(review): the other two models are named embedding_model_1 / embedding_model_2;
# this one has no numeric suffix.
embedding_model = SentenceTransformer(
    "thenlper/gte-large",
)

# Function to compute embeddings and similarity
def Translation_assessment_3(df_1a_testing_output):
    """Score every row's Question against its Translation using `embedding_model`.

    Both columns are embedded and the cosine similarity of each row's pair is
    written to a new 'Similarity_V3' column. Missing values are treated as empty
    strings for embedding purposes; the 'Question' and 'Translation' columns
    themselves are left untouched.

    Args:
        df_1a_testing_output: DataFrame holding 'Question' and 'Translation' columns.

    Returns:
        The same DataFrame object that was passed in (modified in place), now
        carrying the 'Similarity_V3' column.

    Raises:
        KeyError: if the 'Question' or 'Translation' column is missing.
    """
    # Clean local copies so this function does not rely on another cell having
    # already replaced missing values
    questions = df_1a_testing_output['Question'].fillna('').astype(str).tolist()
    translations = df_1a_testing_output['Translation'].fillna('').astype(str).tolist()

    # Nothing to embed: add an empty score column instead of calling the model
    if not questions:
        df_1a_testing_output['Similarity_V3'] = np.empty(0, dtype=np.float32)
        return df_1a_testing_output

    # Ask for NumPy output so the maths below does not depend on which device
    # the model's tensors live on
    question_embeddings = np.asarray(embedding_model.encode(questions, convert_to_numpy=True))
    translation_embeddings = np.asarray(embedding_model.encode(translations, convert_to_numpy=True))

    # Row-wise cosine similarity: only pair (i, i) is needed, so the full
    # N x N similarity matrix is never built
    dot_products = (question_embeddings * translation_embeddings).sum(axis=1)
    norm_products = (
        np.linalg.norm(question_embeddings, axis=1)
        * np.linalg.norm(translation_embeddings, axis=1)
    )
    norm_products[norm_products == 0] = 1  # a zero vector scores 0 instead of dividing by zero

    df_1a_testing_output['Similarity_V3'] = dot_products / norm_products

    return df_1a_testing_output

# Run the Version 3 scoring; the function returns the frame it was given
df_translation_assessment_3 = Translation_assessment_3(
    df_1a_testing_output=df_1a_testing_output
)

# Write the scored frame to CSV, leaving the index out of the file
df_translation_assessment_3.to_csv(
    '../8_Testing_Input_and_Output/Translation_assessment_1a.csv',
    index=False,
)