In [1]:
# Execute the shared setup notebook inside this kernel's namespace.
# NOTE(review): later cells use `os`, `load_dataset` and `OpenAIEmbeddings` without
# importing them here — presumably this setup notebook provides them; confirm.
%run ../Setup.ipynb


Note: you may need to restart the kernel to use updated packages.



In [2]:
# Load the Testing_Output_3b results and convert them to a pandas DataFrame.
# NOTE(review): `os` and `load_dataset` are not imported in this cell — presumably
# they come from the setup notebook run above; confirm.
dataset_path = "Testing_Output_3b.csv"
print("Dataset Path:", dataset_path)

# Fail fast with a clear message if the file is missing (the path is relative,
# so it is resolved against the kernel's current working directory).
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# Load the CSV through the `datasets` library; the rows are read back from
# the 'train' split below.
testing_output_3b = load_dataset('csv', data_files=dataset_path)

# Convert the dataset to a pandas dataframe
df_3b_testing_output = testing_output_3b['train'].to_pandas()

# Every assessment cell below reads the 'Question' and 'Translation' columns.
# Check for them here so a schema problem surfaces at load time rather than
# after an embedding model has been downloaded or API calls have been made.
required_columns = ('Question', 'Translation')
missing_columns = [
    column for column in required_columns
    if column not in df_3b_testing_output.columns
]
if missing_columns:
    raise KeyError(
        f"{dataset_path} is missing required column(s): {missing_columns}"
    )

# Print a few rows to verify
print(df_3b_testing_output.head())

Dataset Path: Testing_Output_3b.csv


Generating train split: 0 examples [00:00, ? examples/s]

            DB_ID                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  \
0  Show the stadium name and capacity with most n...   
1  Find the first name and age of students who ha...   
2  Which countries in europe have at least 3 car ...   
3  Among the cars with more than lowest horsepowe...   
4  Which are the car makers which produce at leas...   

                                              Output  \
0  Translation: Show the name and capacity of the...   
1  Translation: Retrieve the first name and age o...   
2  Translation: Show the names of countries in Eu...   
3  Tra

**Version 1: cosine similarity with Sentence-Transformers embeddings (`all-MiniLM-L6-v2`)**

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained sentence-embedding model used by Version 1.
# It is kept under its own name (embedding_model_1) so that later cells, which
# bind `embedding_model`, do not replace it.
embedding_model_1 = SentenceTransformer('all-MiniLM-L6-v2')  # Any other SentenceTransformer checkpoint name can be substituted

# Function to compute embeddings and row-wise similarity
def Translation_assessment_1(df_3b_testing_output, model=None):
    """Score each row's 'Question' against its 'Translation' by cosine similarity.

    Both columns are embedded in one batch each, and the cosine similarity of
    row i's question with row i's translation is written to a new
    'Similarity_1' column. The frame is modified in place and also returned;
    other cells in this notebook rely on that to accumulate score columns on
    a single DataFrame.

    Only the N row-wise similarities are computed. The previous version built
    the full N x N pairwise matrix and kept just its diagonal, and it handed
    torch tensors to scikit-learn, which cannot convert tensors that live on a
    GPU.

    Args:
        df_3b_testing_output: DataFrame with 'Question' and 'Translation'
            columns holding text.
        model: Optional embedding model exposing
            ``encode(list_of_str, convert_to_numpy=True)`` and returning one
            vector per input. Defaults to the notebook-level
            ``embedding_model_1``, looked up when the function is called.

    Returns:
        The same DataFrame object, with the 'Similarity_1' column added.

    Raises:
        KeyError: If 'Question' or 'Translation' is not a column of the frame.
    """
    missing = [
        column for column in ('Question', 'Translation')
        if column not in df_3b_testing_output.columns
    ]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    if model is None:
        model = embedding_model_1

    # Nothing to embed: add an empty score column instead of calling the model.
    if len(df_3b_testing_output) == 0:
        df_3b_testing_output['Similarity_1'] = np.array([], dtype=float)
        return df_3b_testing_output

    # NumPy output keeps the arithmetic below independent of the device the
    # model runs on.
    question_embeddings = np.asarray(
        model.encode(df_3b_testing_output['Question'].tolist(), convert_to_numpy=True)
    )
    translation_embeddings = np.asarray(
        model.encode(df_3b_testing_output['Translation'].tolist(), convert_to_numpy=True)
    )

    # A zero vector has no direction; treating its norm as 1 yields a
    # similarity of 0 instead of NaN (the same convention scikit-learn uses).
    question_norms = np.linalg.norm(question_embeddings, axis=1)
    translation_norms = np.linalg.norm(translation_embeddings, axis=1)
    question_norms = np.where(question_norms == 0, 1, question_norms)
    translation_norms = np.where(translation_norms == 0, 1, translation_norms)

    # Dot product of each question vector with the translation vector of the
    # same row, i.e. the diagonal of the pairwise matrix without building it.
    row_dots = np.einsum('ij,ij->i', question_embeddings, translation_embeddings)

    df_3b_testing_output['Similarity_1'] = row_dots / (question_norms * translation_norms)

    return df_3b_testing_output

# Run Version 1. The function writes 'Similarity_1' into the frame it is given
# and returns that same object, so df_translation_assessment_1 is another name
# for df_3b_testing_output rather than a copy.
df_translation_assessment_1 = Translation_assessment_1(df_3b_testing_output)



**Version 2: cosine similarity with OpenAI embeddings (LangChain `OpenAIEmbeddings`)**

In [4]:
# Embedding client used by get_embeddings below (Version 2).
# NOTE(review): OpenAIEmbeddings is not imported in this cell — presumably it
# comes from ../Setup.ipynb, and presumably needs OpenAI credentials; confirm.
# NOTE: the Version 3 cell rebinds the name `embedding_model` to a different
# model. get_embeddings looks the name up at call time, so re-run this cell
# before re-running Version 2 after Version 3 has executed.
embedding_model = OpenAIEmbeddings()

# Embed a single piece of text with the notebook-level embedding client
def get_embeddings(text):
    """Return the embedding of ``text``, or ``None`` when embedding fails.

    The work is delegated to ``embedding_model.embed_query``, where
    ``embedding_model`` is whatever the notebook-level name refers to at call
    time. This is a best-effort helper: any exception is reported on stdout
    and swallowed, so one bad row does not abort a whole DataFrame pass.

    Args:
        text: The text to embed.

    Returns:
        The value returned by ``embed_query``, or ``None`` on any error.
    """
    try:
        vector = embedding_model.embed_query(text)
    except Exception as e:
        # Report and fall through with None; callers treat None as "no score".
        print(f"Error generating embeddings for text: {text}, Error: {e}")
        vector = None
    return vector

# Cosine similarity between two single embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings, or ``None`` if either is missing.

    Args:
        embedding1: First embedding vector, or ``None``.
        embedding2: Second embedding vector, or ``None``.

    Returns:
        The single similarity value for the pair, or ``None`` when at least
        one of the inputs is ``None``.
    """
    if embedding1 is None or embedding2 is None:
        return None

    # cosine_similarity works on 2D inputs, so each vector is wrapped as a
    # one-row matrix; the result is a 1x1 matrix whose only entry is returned.
    pairwise = cosine_similarity([embedding1], [embedding2])
    return pairwise[0][0]

# Score every row of the dataframe (Version 2)
def compare_question_translation(df_3b_testing_output):
    """Add a 'SimilarityV2' column comparing each 'Question' with its 'Translation'.

    Each row's 'Question' and 'Translation' texts are embedded with
    ``get_embeddings`` and compared with ``calculate_similarity``. The
    embeddings themselves are not stored. The frame is modified in place and
    also returned.

    Args:
        df_3b_testing_output: DataFrame with 'Question' and 'Translation' columns.

    Returns:
        The same DataFrame object, with the 'SimilarityV2' column added.
    """
    def _row_similarity(row):
        # The question is embedded first, then the translation.
        question_vector = get_embeddings(row['Question'])
        translation_vector = get_embeddings(row['Translation'])
        return calculate_similarity(question_vector, translation_vector)

    df_3b_testing_output['SimilarityV2'] = df_3b_testing_output.apply(_row_similarity, axis=1)

    return df_3b_testing_output

# Run Version 2. The function adds 'SimilarityV2' to the frame it is given and
# returns that same object, so df_translation_assessment_2 is another name for
# df_3b_testing_output rather than a copy.
df_translation_assessment_2 = compare_question_translation(df_3b_testing_output)

**Version 3: cosine similarity with Sentence-Transformers embeddings (`Alibaba-NLP/gte-large-en-v1.5`)**

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained sentence-embedding model used by Version 3.
# NOTE: this rebinds the notebook-level name `embedding_model`, which the
# Version 2 cell bound to OpenAIEmbeddings() and which get_embeddings reads at
# call time; re-run the Version 2 cell before calling get_embeddings again.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally — confirm this checkpoint is trusted.
embedding_model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

# Function to compute embeddings and row-wise similarity
def Translation_assessment_3(df_3b_testing_output, model=None):
    """Score each row's 'Question' against its 'Translation' by cosine similarity.

    Both columns are embedded in one batch each, and the cosine similarity of
    row i's question with row i's translation is written to a new
    'Similarity_V3' column. The frame is modified in place and also returned;
    other cells in this notebook rely on that to accumulate score columns on
    a single DataFrame.

    Only the N row-wise similarities are computed. The previous version built
    the full N x N pairwise matrix and kept just its diagonal, and it handed
    torch tensors to scikit-learn, which cannot convert tensors that live on a
    GPU.

    Args:
        df_3b_testing_output: DataFrame with 'Question' and 'Translation'
            columns holding text.
        model: Optional embedding model exposing
            ``encode(list_of_str, convert_to_numpy=True)`` and returning one
            vector per input. Defaults to the notebook-level
            ``embedding_model``, looked up when the function is called. That
            name is bound by more than one cell in this notebook, so pass the
            model explicitly when execution order is in doubt.

    Returns:
        The same DataFrame object, with the 'Similarity_V3' column added.

    Raises:
        KeyError: If 'Question' or 'Translation' is not a column of the frame.
    """
    missing = [
        column for column in ('Question', 'Translation')
        if column not in df_3b_testing_output.columns
    ]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    if model is None:
        model = embedding_model

    # Nothing to embed: add an empty score column instead of calling the model.
    if len(df_3b_testing_output) == 0:
        df_3b_testing_output['Similarity_V3'] = np.array([], dtype=float)
        return df_3b_testing_output

    # NumPy output keeps the arithmetic below independent of the device the
    # model runs on.
    question_embeddings = np.asarray(
        model.encode(df_3b_testing_output['Question'].tolist(), convert_to_numpy=True)
    )
    translation_embeddings = np.asarray(
        model.encode(df_3b_testing_output['Translation'].tolist(), convert_to_numpy=True)
    )

    # A zero vector has no direction; treating its norm as 1 yields a
    # similarity of 0 instead of NaN (the same convention scikit-learn uses).
    question_norms = np.linalg.norm(question_embeddings, axis=1)
    translation_norms = np.linalg.norm(translation_embeddings, axis=1)
    question_norms = np.where(question_norms == 0, 1, question_norms)
    translation_norms = np.where(translation_norms == 0, 1, translation_norms)

    # Dot product of each question vector with the translation vector of the
    # same row, i.e. the diagonal of the pairwise matrix without building it.
    row_dots = np.einsum('ij,ij->i', question_embeddings, translation_embeddings)

    df_3b_testing_output['Similarity_V3'] = row_dots / (question_norms * translation_norms)

    return df_3b_testing_output

# Run Version 3. The function adds 'Similarity_V3' to the frame it is given and
# returns that same object.
df_translation_assessment_3 = Translation_assessment_3(df_3b_testing_output)

# Save the scored frame to CSV without the index. Versions 1-3 all write into
# the same DataFrame object, so the file also carries 'Similarity_1' and
# 'SimilarityV2' when those cells were run earlier in the session.
df_translation_assessment_3.to_csv('Translation_assessment_3b.csv', index=False)