In [1]:
# Importing the required libraries
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import os
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
import re
import json

In [2]:
# Accessing the secrets from the environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast: os.getenv returns None for an unset variable, which would otherwise
# only surface later as an authentication error when the model is built or called.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file"
    )

In [3]:
# Upload the dataset and transform to dataframe
# Define the dataset path
dataset_path = "LLM_assessment_3a_CG.csv"
print("Dataset Path:", dataset_path)

# Check if the file exists at the specified path
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# Load the dataset.
# It gets its own name on purpose: the name LLM_score_extraction_3a_CG is used
# further down for the score-extraction function, and binding the dataset to it
# as well made one identifier mean two different things depending on which
# cell ran last.
dataset_3a_CG = load_dataset('csv', data_files=dataset_path)

# Convert the dataset to a pandas dataframe (a single CSV ends up in the 'train' split)
df_LLM_score_extraction_3a_CG = dataset_3a_CG['train'].to_pandas()

# Print a few rows to verify
print(df_LLM_score_extraction_3a_CG.head())

Dataset Path: LLM_assessment_3a_CG.csv


Generating train split: 0 examples [00:00, ? examples/s]

          DB_name                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  \
0  Show the stadium name and capacity with most n...   
1  Find the first name and age of students who ha...   
2  Which countries in europe have at least 3 car ...   
3  Among the cars with more than lowest horsepowe...   
4  Which are the car makers which produce at leas...   

                                              Output  \
0  Translation: The query aims to identify the st...   
1  Translation: The query aims to retrieve the fi...   
2  Translation: The Query aims to retrieve the na...   
3  Tra

In [4]:
# Initialize model
# temperature=0: score extraction is a deterministic copy-out task, so sampling
# randomness only makes re-runs disagree with each other.
model = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini", temperature=0)
parser = StrOutputParser()

# Prompt template. {A1}..{A6} are the six assessment texts of one row; they are
# listed in the same order as the A1..A6 answer keys requested above them.
testing_template_score = """
Extract the scores from each assessment column. Within these columns, extract the scores for "Understandability," "Accuracy," and "Overall Score." 
Please output the scores in the following structured format:

A1_Understandability: <score>
A1_Accuracy: <score>
A1_Overall: <score>
A2_Understandability: <score>
A2_Accuracy: <score>
A2_Overall: <score>
...
A6_Overall: <score>

Assessment_OAI_Explanation: {A1}
Assessment_OAI_Translation: {A2}
Assessment_Gemini_Explanation: {A3}
Assessment_Gemini_Translation: {A4}
Assessment_Claude_Explanation: {A5}
Assessment_Claude_Translation: {A6}
"""

prompt_testing_score = ChatPromptTemplate.from_template(testing_template_score)

# Chain setup for score extraction.
# The chain is invoked with a dict {"A1": ..., ..., "A6": ...}, which is exactly
# what the prompt needs, so it is piped straight in. The previous
# {"A1": RunnablePassthrough(), ...} step was coerced to a RunnableParallel that
# handed the WHOLE input dict to every key, so each placeholder was filled with
# the stringified dict of all six assessments (six copies per prompt).
chain_testing_LLM_score = prompt_testing_score | model | parser

# Function to process the DataFrame with score extraction using regex

# Answer-key prefix used in the prompt / LLM reply (A1..A6) -> label shared by
# the input column ("Assessment <label>") and the output columns
# ("<label> - <kind> Score").
_ASSESSMENT_LABELS = {
    "A1": "OAI Explanation",
    "A2": "OAI Translation",
    "A3": "Gemini Explanation",
    "A4": "Gemini Translation",
    "A5": "Claude Explanation",
    "A6": "Claude Translation",
}

# Score names as they appear after the "A<n>_" prefix in the LLM reply.
_SCORE_KINDS = ("Understandability", "Accuracy", "Overall")


def _build_score_patterns():
    """Return {output column name: compiled regex} for all 18 scores, in column order."""
    return {
        # Optional fractional part so a score such as "3.5" is not cut down to "3".
        f"{label} - {kind} Score": re.compile(rf"{key}_{kind}:\s*(\d+(?:\.\d+)?)")
        for key, label in _ASSESSMENT_LABELS.items()
        for kind in _SCORE_KINDS
    }


def _parse_scores(response, patterns):
    """Return {column: captured score text, or "N/A" when the key is absent} for one reply."""
    parsed = {}
    for column, pattern in patterns.items():
        match = pattern.search(response)
        parsed[column] = match.group(1) if match else "N/A"
    return parsed


def LLM_score_extraction_3a_CG(df_LLM_score_extraction_3a_CG, verbose=True):
    """Extract the 18 assessment scores of every row via the LLM chain.

    For each row the six "Assessment ..." columns are sent to the module-level
    ``chain_testing_LLM_score`` and the reply is scanned for lines of the form
    ``A<n>_<Understandability|Accuracy|Overall>: <number>``.

    Args:
        df_LLM_score_extraction_3a_CG: DataFrame holding the six columns
            "Assessment OAI Explanation", "Assessment OAI Translation",
            "Assessment Gemini Explanation", "Assessment Gemini Translation",
            "Assessment Claude Explanation" and "Assessment Claude Translation".
        verbose: When True (the default) print the raw LLM reply of every row.

    Returns:
        The same DataFrame object, modified in place, with 18 added columns
        named "<model> <Explanation|Translation> - <kind> Score". Values are
        strings: the captured number, "N/A" when the reply lacks that key, or
        "Error: <message>" in every score column when the chain call or the
        parsing of that row raised.

    Raises:
        KeyError: If one of the six "Assessment ..." columns is missing.
    """
    patterns = _build_score_patterns()
    extracted_scores = {col: [] for col in patterns}

    for i, row in df_LLM_score_extraction_3a_CG.iterrows():
        # Prepare inputs for each assessment (outside the try: a missing column
        # is a setup error and should stop the run, not be recorded per row).
        inputs = {
            key: row[f"Assessment {label}"]
            for key, label in _ASSESSMENT_LABELS.items()
        }

        try:
            response = chain_testing_LLM_score.invoke(inputs)
            if verbose:
                print(f"Row {i} - LLM response:\n{response}\n")  # Debug: Print the raw response
            # Parse the whole reply before appending anything, so a failure
            # cannot leave the per-column lists with different lengths.
            row_scores = _parse_scores(response, patterns)
        except Exception as e:
            # Best effort: keep going and make the failure visible in the output.
            print(f"Error in row {i}: {str(e)}")
            row_scores = {col: f"Error: {str(e)}" for col in patterns}

        for col, value in row_scores.items():
            extracted_scores[col].append(value)

    # Append extracted columns to the DataFrame (in place; one value per row, in row order)
    for col, values in extracted_scores.items():
        df_LLM_score_extraction_3a_CG[col] = values

    return df_LLM_score_extraction_3a_CG

# Run the extraction over every row of the loaded frame; the returned frame
# carries the original columns plus the extracted score columns.
df_score_extraction_3a_CG = LLM_score_extraction_3a_CG(
    df_LLM_score_extraction_3a_CG
)

# Persist the result next to the notebook, without the pandas index column.
df_score_extraction_3a_CG.to_csv(
    path_or_buf="LLM_score_extraction_3a_CG.csv",
    index=False,
)

Row 0 - LLM response:
Here are the extracted scores for "Understandability," "Accuracy," and "Overall Score" for each assessment:

```
A1_Understandability: 4
A1_Accuracy: 2
A1_Overall: 2
A2_Understandability: 3
A2_Accuracy: 2
A2_Overall: 2
A3_Understandability: 4
A3_Accuracy: 4
A3_Overall: 4
A4_Understandability: 4
A4_Accuracy: 4
A4_Overall: 4
A5_Understandability: 4
A5_Accuracy: 3
A5_Overall: 3
A6_Understandability: 4
A6_Accuracy: 3
A6_Overall: 3
```

Row 1 - LLM response:
Here are the extracted scores from each assessment column:

```
A1_Understandability: 3
A1_Accuracy: 3
A1_Overall: 3
A2_Understandability: 4
A2_Accuracy: 4
A2_Overall: 4
A3_Understandability: 4
A3_Accuracy: 4
A3_Overall: 4
A4_Understandability: 4
A4_Accuracy: 4
A4_Overall: 4
A5_Understandability: 3
A5_Accuracy: 3
A5_Overall: 3
A6_Understandability: 4
A6_Accuracy: 4
A6_Overall: 4
```

Row 2 - LLM response:
Here are the extracted scores for "Understandability," "Accuracy," and "Overall Score" from each assessment col