***App testing run: generating one App Output file per app***

**Loading packages, libraries and secrets into notebook**

In [1]:
# Importing the required libraries
import os
from dotenv import load_dotenv
from datasets import load_dataset
import pandas as pd

In [2]:
# Read the service credentials from the process environment.
# load_dotenv() runs first so the lookups below see whatever it loaded.
load_dotenv()
MONGO_URI_SQL, MONGO_URI_schema, OPENAI_API_KEY, HF_Token = (
    os.environ.get(env_name)  # yields None when the variable is not set
    for env_name in ("MONGO_URI_SQL", "MONGO_URI_Schema", "OPENAI_API_KEY", "HF_TOKEN")
)

**Testing App 1b**

In [3]:
# Execute the app 1b notebook from inside this notebook.
# NOTE(review): the processing cell further down calls `chain_1b`, which is not defined
# anywhere in this notebook - presumably this %run is what defines it; confirm.
%run ../../A_Apps/1b_Openai_RAG_Schema.ipynb


Retrieved Documents:
page_content='SELECT city FROM business WHERE name  =  "Taj Mahal";' metadata={'_id': '66cf12c13c2173e47d7b0793', 'Question': 'Find all cities which has a " Taj Mahal " .'}
page_content='SELECT homepage FROM organization WHERE name  =  "University of Michigan";' metadata={'_id': '66cf12c13c2173e47d7b0806', 'Question': 'return me the homepage of " University of Michigan " .'}
page_content='SELECT state FROM business WHERE name  =  "Whataburger";' metadata={'_id': '66cf12c13c2173e47d7b0790', 'Question': 'Find all states in which there is a Whataburger'}
page_content='SELECT name FROM business WHERE rating  >  4.5;' metadata={'_id': '66cf12c13c2173e47d7b078d', 'Question': 'List all the businesses with more than 4.5 stars'}
Retrieved Schema:
page_content='cre_Doc_Template_Mgt DOCUMENTS DOCUMENT_DESCRIPTION' metadata={'_id': '66f699c316868673a7a59ee7', 'Column_name': 'DOCUMENT_DESCRIPTION', 'Table_name': 'DOCUMENTS', 'DB_name': 'cre_Doc_Template_Mgt'}
page_content='cre

In [4]:
# Read the Spider test selection CSV for app 1b and expose it as a pandas DataFrame
dataset_path = "../8_Testing_Input_and_Output/Spider_Testing_Selection.csv"
print("Dataset Path:", dataset_path)

# Only load when the CSV is really there; otherwise stop with an explicit error
if os.path.isfile(dataset_path):
    testing_1b = load_dataset('csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The rows are read from the "train" split of the loaded dataset
df_1b_testing = testing_1b["train"].to_pandas()

# Quick visual check of the first rows
print(df_1b_testing.head())

Dataset Path: ../8_Testing_Input_and_Output/Spider_Testing_Selection.csv
            DB_ID                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  
0  Show the stadium name and capacity with most n...  
1  Find the first name and age of students who ha...  
2  Which countries in europe have at least 3 car ...  
3  Among the cars with more than lowest horsepowe...  
4  Which are the car makers which produce at leas...  


In [5]:
# Function to run the chain for each query
def process_queries(df_1b_testing, chain=None):
    """Run a chain over every row's ``Query`` value and record the results.

    Three columns are added to ``df_1b_testing`` in place:

    * ``Output``      - the raw value returned by ``chain.invoke(query)``, or an
      error message string when the call raised.
    * ``Translation`` - the part of the output before the first "Explanation"
      marker (the whole output when the marker is absent).
    * ``Explanation`` - the part after the marker, or ``None`` when absent.

    Args:
        df_1b_testing: DataFrame with a ``Query`` column. May be empty.
        chain: Object exposing ``invoke(query)``. When omitted, the
            notebook-level ``chain_1b`` is used.

    Returns:
        The same DataFrame object that was passed in, with the columns added.
    """
    if chain is None:
        # Fall back to the chain expected to exist in the notebook namespace
        chain = chain_1b

    # Collect one result per row, keeping the row order
    output = []
    for i, query in df_1b_testing["Query"].items():
        try:
            result = chain.invoke(query)
        except Exception as e:
            # Best effort: keep going and store the failure text for this row
            result = f"Error processing query {i}: {str(e)}"
        output.append(result)

    # Add the results to a new column in the dataframe
    df_1b_testing["Output"] = output

    def split_output(text):
        """Split one chain output into a (translation, explanation) pair."""
        if text is None:
            return None, None
        if not isinstance(text, str):
            # Non-string results would make the substring test below raise
            text = str(text)
        if 'Explanation' in text:
            parts = text.split('Explanation', 1)
            # Drop the colon left behind when the marker is written "Explanation:"
            explanation = parts[1].strip().lstrip(':').strip()
            return parts[0].strip(), explanation
        # No marker: the whole text is the translation
        return text, None

    # Build both columns from plain lists so an empty dataframe works too
    pairs = [split_output(text) for text in output]
    df_1b_testing["Translation"] = [translation for translation, _ in pairs]
    df_1b_testing["Explanation"] = [explanation for _, explanation in pairs]

    return df_1b_testing

# Call the function and process the dataframe
df_1b_testing_output = process_queries(df_1b_testing)

# 'df_1b_testing_output' now holds the input rows plus the Output, Translation and Explanation columns added by process_queries
print(df_1b_testing_output)
# Save the results as CSV; index=False keeps the row index out of the file
df_1b_testing_output.to_csv("../8_Testing_Input_and_Output/App_Output_1b.csv", index=False)

Retrieved Schema:
page_content='concert_singer CONCERT  STADIUM_ID' metadata={'_id': '66f699c316868673a7a59fa9', 'Column_name': 'STADIUM_ID', 'Table_name': 'CONCERT ', 'DB_name': 'concert_singer'}
page_content='concert_singer STADIUM  STADIUM_ID' metadata={'_id': '66f699c316868673a7a59f96', 'Column_name': 'STADIUM_ID', 'Table_name': 'STADIUM ', 'DB_name': 'concert_singer'}
page_content='concert_singer SINGER  SINGER_ID' metadata={'_id': '66f699c316868673a7a59f9e', 'Column_name': 'SINGER_ID', 'Table_name': 'SINGER ', 'DB_name': 'concert_singer'}
page_content='concert_singer SINGER_IN_CONCERT  SINGER_ID' metadata={'_id': '66f699c316868673a7a59fae', 'Column_name': 'SINGER_ID', 'Table_name': 'SINGER_IN_CONCERT ', 'DB_name': 'concert_singer'}
page_content='concert_singer CONCERT  CONCERT_ID' metadata={'_id': '66f699c316868673a7a59fa6', 'Column_name': 'CONCERT_ID', 'Table_name': 'CONCERT ', 'DB_name': 'concert_singer'}
Retrieved Documents:
page_content='SELECT t1.name FROM stadium AS t1 JOIN

**Testing App 2b**

In [6]:
# Execute the app 2b notebook from inside this notebook.
# NOTE(review): the processing cell further down calls `chain_2b`, which is not defined
# anywhere in this notebook - presumably this %run is what defines it; confirm.
%run ../../A_Apps/2b_Openai_RAG.ipynb

Retrieved Documents:
page_content='SELECT t2.firstname FROM Performance AS t1 JOIN Band AS t2 ON t1.bandmate  =  t2.id JOIN Songs AS T3 ON T3.SongId  =  T1.SongId GROUP BY firstname ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7afc52', 'Question': 'Find the first name of the band mate that has performed in most songs.'}
page_content='SELECT t2.firstname FROM Performance AS t1 JOIN Band AS t2 ON t1.bandmate  =  t2.id JOIN Songs AS T3 ON T3.SongId  =  T1.SongId GROUP BY firstname ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7afc53', 'Question': 'What is the first name of the band mate who perfomed in the most songs?'}
page_content='SELECT T2.party_name FROM Member AS T1 JOIN party AS T2 ON T1.party_id  =  T2.party_id GROUP BY T1.party_id ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7aefd3', 'Question': 'What is the name of party with most number of members?'}
page_content='SELECT T2.party_name FROM Member AS T1 JOIN par

In [7]:
# Read the Spider test selection CSV for app 2b and expose it as a pandas DataFrame
dataset_path = "../8_Testing_Input_and_Output/Spider_Testing_Selection.csv"
print("Dataset Path:", dataset_path)

# Only load when the CSV is really there; otherwise stop with an explicit error
if os.path.isfile(dataset_path):
    testing_2b = load_dataset('csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The rows are read from the "train" split of the loaded dataset
df_2b_testing = testing_2b["train"].to_pandas()

# Quick visual check of the first rows
print(df_2b_testing.head())

Dataset Path: ../8_Testing_Input_and_Output/Spider_Testing_Selection.csv
            DB_ID                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  
0  Show the stadium name and capacity with most n...  
1  Find the first name and age of students who ha...  
2  Which countries in europe have at least 3 car ...  
3  Among the cars with more than lowest horsepowe...  
4  Which are the car makers which produce at leas...  


In [8]:
# Function to run the chain for each query
def process_queries(df_2b_testing, chain=None):
    """Run a chain over every row's ``Query`` value and record the results.

    Three columns are added to ``df_2b_testing`` in place:

    * ``Output``      - the raw value returned by ``chain.invoke(query)``, or an
      error message string when the call raised.
    * ``Translation`` - the part of the output before the first "Explanation"
      marker (the whole output when the marker is absent).
    * ``Explanation`` - the part after the marker, or ``None`` when absent.

    Args:
        df_2b_testing: DataFrame with a ``Query`` column. May be empty.
        chain: Object exposing ``invoke(query)``. When omitted, the
            notebook-level ``chain_2b`` is used.

    Returns:
        The same DataFrame object that was passed in, with the columns added.
    """
    if chain is None:
        # Fall back to the chain expected to exist in the notebook namespace
        chain = chain_2b

    # Collect one result per row, keeping the row order
    output = []
    for i, query in df_2b_testing["Query"].items():
        try:
            result = chain.invoke(query)
        except Exception as e:
            # Best effort: keep going and store the failure text for this row
            result = f"Error processing query {i}: {str(e)}"
        output.append(result)

    # Add the results to a new column in the dataframe
    df_2b_testing["Output"] = output

    def split_output(text):
        """Split one chain output into a (translation, explanation) pair."""
        if text is None:
            return None, None
        if not isinstance(text, str):
            # Non-string results would make the substring test below raise
            text = str(text)
        if 'Explanation' in text:
            parts = text.split('Explanation', 1)
            # Drop the colon left behind when the marker is written "Explanation:"
            explanation = parts[1].strip().lstrip(':').strip()
            return parts[0].strip(), explanation
        # No marker: the whole text is the translation
        return text, None

    # Build both columns from plain lists so an empty dataframe works too
    pairs = [split_output(text) for text in output]
    df_2b_testing["Translation"] = [translation for translation, _ in pairs]
    df_2b_testing["Explanation"] = [explanation for _, explanation in pairs]

    return df_2b_testing

# Call the function and process the dataframe
df_2b_testing_output = process_queries(df_2b_testing)

# 'df_2b_testing_output' now holds the input rows plus the Output, Translation and Explanation columns added by process_queries
print(df_2b_testing_output)
# Save the results as CSV; index=False keeps the row index out of the file
df_2b_testing_output.to_csv("../8_Testing_Input_and_Output/App_Output_2b.csv", index=False)

Retrieved Documents:
page_content='SELECT t1.name FROM stadium AS t1 JOIN event AS t2 ON t1.id  =  t2.stadium_id GROUP BY t2.stadium_id ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7afdbc', 'Question': 'What is the name of the stadium which held the most events?'}
page_content='SELECT t3.name FROM record AS t1 JOIN event AS t2 ON t1.event_id  =  t2.id JOIN stadium AS t3 ON t3.id  =  t2.stadium_id GROUP BY t2.stadium_id ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7afdc5', 'Question': 'Find the names of stadiums that the most swimmers have been to.'}
page_content='SELECT t2.firstname FROM Performance AS t1 JOIN Band AS t2 ON t1.bandmate  =  t2.id JOIN Songs AS T3 ON T3.SongId  =  T1.SongId GROUP BY firstname ORDER BY count(*) DESC LIMIT 1' metadata={'_id': '66cf12c13c2173e47d7afc52', 'Question': 'Find the first name of the band mate that has performed in most songs.'}
page_content='SELECT t2.firstname FROM Performance AS t1 JOIN Band AS t2 O

**Testing App 3b**

In [9]:
# Execute the app 3b notebook from inside this notebook.
# NOTE(review): the processing cell further down calls `chain_3b`, which is not defined
# anywhere in this notebook - presumably this %run is what defines it; confirm.
%run ../../A_Apps/3b_Openai.ipynb

In [10]:
# Read the Spider test selection CSV for app 3b and expose it as a pandas DataFrame
dataset_path = "../8_Testing_Input_and_Output/Spider_Testing_Selection.csv"
print("Dataset Path:", dataset_path)

# Only load when the CSV is really there; otherwise stop with an explicit error
if os.path.isfile(dataset_path):
    testing_3b = load_dataset('csv', data_files=dataset_path)
else:
    raise FileNotFoundError(f"Unable to find the file at {dataset_path}")

# The rows are read from the "train" split of the loaded dataset
df_3b_testing = testing_3b["train"].to_pandas()

# Quick visual check of the first rows
print(df_3b_testing.head())

Dataset Path: ../8_Testing_Input_and_Output/Spider_Testing_Selection.csv
            DB_ID                                              Query  \
0  concert_singer  SELECT T2.name ,  T2.capacity FROM concert AS ...   
1          pets_1  SELECT T1.fname ,  T1.age FROM student AS T1 J...   
2           car_1  SELECT T1.CountryName FROM COUNTRIES AS T1 JOI...   
3           car_1  SELECT T2.MakeId ,  T2.Make FROM CARS_DATA AS ...   
4           car_1  select t1.id ,  t1.maker from car_makers as t1...   

                                            Question  
0  Show the stadium name and capacity with most n...  
1  Find the first name and age of students who ha...  
2  Which countries in europe have at least 3 car ...  
3  Among the cars with more than lowest horsepowe...  
4  Which are the car makers which produce at leas...  


In [11]:
# Function to run the chain for each query
def process_queries(df_3b_testing, chain=None):
    """Run a chain over every row's ``Query`` value and record the results.

    Three columns are added to ``df_3b_testing`` in place:

    * ``Output``      - the raw value returned by ``chain.invoke(query)``, or an
      error message string when the call raised.
    * ``Translation`` - the part of the output before the first "Explanation"
      marker (the whole output when the marker is absent).
    * ``Explanation`` - the part after the marker, or ``None`` when absent.

    Args:
        df_3b_testing: DataFrame with a ``Query`` column. May be empty.
        chain: Object exposing ``invoke(query)``. When omitted, the
            notebook-level ``chain_3b`` is used.

    Returns:
        The same DataFrame object that was passed in, with the columns added.
    """
    if chain is None:
        # Fall back to the chain expected to exist in the notebook namespace
        chain = chain_3b

    # Collect one result per row, keeping the row order
    output = []
    for i, query in df_3b_testing["Query"].items():
        try:
            result = chain.invoke(query)
        except Exception as e:
            # Best effort: keep going and store the failure text for this row
            result = f"Error processing query {i}: {str(e)}"
        output.append(result)

    # Add the results to a new column in the dataframe
    df_3b_testing["Output"] = output

    def split_output(text):
        """Split one chain output into a (translation, explanation) pair."""
        if text is None:
            return None, None
        if not isinstance(text, str):
            # Non-string results would make the substring test below raise
            text = str(text)
        if 'Explanation' in text:
            parts = text.split('Explanation', 1)
            # Drop the colon left behind when the marker is written "Explanation:"
            explanation = parts[1].strip().lstrip(':').strip()
            return parts[0].strip(), explanation
        # No marker: the whole text is the translation
        return text, None

    # Build both columns from plain lists so an empty dataframe works too
    pairs = [split_output(text) for text in output]
    df_3b_testing["Translation"] = [translation for translation, _ in pairs]
    df_3b_testing["Explanation"] = [explanation for _, explanation in pairs]

    return df_3b_testing

# Call the function and process the dataframe
df_3b_testing_output = process_queries(df_3b_testing)

# 'df_3b_testing_output' now holds the input rows plus the Output, Translation and Explanation columns added by process_queries
print(df_3b_testing_output)
# Save the results as CSV; index=False keeps the row index out of the file
df_3b_testing_output.to_csv("../8_Testing_Input_and_Output/App_Output_3b.csv", index=False)

                           DB_ID  \
0                 concert_singer   
1                         pets_1   
2                          car_1   
3                          car_1   
4                          car_1   
5                          car_1   
6                          car_1   
7                       flight_2   
8                       flight_2   
9       employee_hire_evaluation   
10      employee_hire_evaluation   
11          cre_Doc_Template_Mgt   
12                  course_teach   
13                  museum_visit   
14                  museum_visit   
15                         wta_1   
16  student_transcripts_tracking   
17  student_transcripts_tracking   
18                       voter_1   
19                       world_1   
20                       world_1   
21                       world_1   
22                       world_1   
23                       world_1   
24                       world_1   
25                     orchestra   
26                   dog_ken