In [2]:
import pandas as pd

df = pd.read_csv(
    "../data/All-Purpose Cleaners.csv"
)
df.fillna("")


Unnamed: 0,L0 Domain,L1 Category,L2 Category,L3 Sub-Category,L4 Product Enum-Code,Product Name,Product Quantity,UOM,MRP,Price,...,Images 9,Baby Weight,Absorption Duration (in Hrs),Features,Images 10,Care Instruction,Ingredients,Specification,Package Contains,About
0,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Harpic Disinfectant Toilet Cleaner Liquid, Ori...",500 ml,ml,99.0,94.00,...,https://datalabs.siva3.io/image/Eunimart_groce...,-,-,-,-,-,-,-,-,-
1,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Harpic Disinfectant Toilet Cleaner Liquid, Ori...",200 ml,ml,40.0,39.00,...,https://datalabs.siva3.io/image/Eunimart_groce...,-,-,-,-,-,-,-,-,-
2,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Harpic Disinfectant Toilet Cleaner Liquid, Ori...",1 L,L,215.0,201.00,...,https://datalabs.siva3.io/image/Eunimart_groce...,-,-,-,-,-,-,-,-,-
3,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,Harpic Disinfectant Toilet Cleaner Liquid - Or...,1 L each (Pack of 3),L,645.0,598.90,...,https://datalabs.siva3.io/image/Eunimart_groce...,-,-,-,-,-,-,-,-,-
4,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,Lizol Disinfectant Surface & Floor Cleaner Liq...,1L,L,220.0,191.00,...,,-,-,India's No. 1 Floor Cleaner with 99.9% Germ Ki...,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Emami Emasol All Surface Sanitizer, 3 x 25 ml ...",3 x 25 ml Multipack,ml,120.0,120.00,...,,-,-,New Advanced Formulation with a powerful comb...,-,-,-,-,-,-
576,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,Cif Perfect Finish Multi-Purpose Cleaner Spray...,435 ml,ml,289.0,216.75,...,,-,-,-,-,-,-,-,-,-
577,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Domestos Power 5 Ocean Toilet Rim Block, 55 g",55 g,g,199.0,149.25,...,,-,-,-,-,-,-,-,-,-
578,RET,RET-10,RET-10-14,RET-10-14-10,RET-10-14-10-11,"Astonish Fabric Stain Remover -Tough & Gentle,...",750 ml,ml,299.0,299.00,...,,-,-,-,-,-,"<5% Non-ionic Surfactants, Anionic Surfactants...",-,-,-


In [11]:
import os
import dotenv

dotenv.load_dotenv(
    "../ops/.env"
)

OPEN_AI_API_KEY = os.getenv(
    "OPEN_AI_API_KEY"
)

In [12]:
from llama_index.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent
)
from llama_index.query_engine.pandas import PandasInstructionParser
from llama_index.llms import OpenAI
from llama_index.prompts import PromptTemplate

In [13]:

instruction_str = (
    "1. Convert the query to executable Python code using Pandas, including a preliminary check to ensure all columns required for the query are present in the dataframe. Use `df.columns.isin(['required_column1', 'required_column2']).all()` to verify presence.\n"
    "2. If any required column is missing, the code should print a message indicating the missing columns. This step ensures the code does not attempt to execute a query with non-existent columns.\n"
    "3. Assuming all required columns are present, proceed to formulate the query using Pandas.\n"
    "4. The final line of code should be a Python expression suitable for execution with the `eval()` function, representing the query's solution.\n"
    "5. PRINT ONLY THE EXPRESSION or the message about missing columns.\n"
    "6. Do not quote the expression or the missing columns message.\n"
)

pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:"
)



response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results in a structured JSON format.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: Generate a JSON response that includes:\n"
    "- A `pass` key indicating if any rows meet the condition (true or false).\n"
)


In [14]:
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str, df_str=df.head(5)
)
pandas_output_parser = PandasInstructionParser(df)
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)
llm = OpenAI(model="gpt-3.5-turbo")

In [15]:
qp = QP(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm1": llm,
        "pandas_output_parser": pandas_output_parser,
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llm,
    },
    verbose=True,
)
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])
qp.add_links(
    [
        Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link(
            "llm1", "response_synthesis_prompt", dest_key="pandas_instructions"
        ),
        Link(
            "pandas_output_parser",
            "response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ]
)
# add link from response synthesis prompt to llm2
qp.add_link("response_synthesis_prompt", "llm2")

In [20]:
response = qp.run(
    query_str="Does the dataframe contain short descriptions and long descriptions columns ?",
)


[1;3;38;2;155;135;227m> Running module input with input: 
query_str: Does the dataframe contain short descriptions and long descriptions columns ?

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: Does the dataframe contain short descriptions and long descriptions columns ?

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
  L0 Domain L1 Category L2 Category L3 Sub-Category L4 Product Enum-Code...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df.columns.isin(['Short Description', 'Long Description']).all()

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: Does the dataframe contain short descriptions and long descriptions columns ?
pandas_instructions: assistant: df.columns.isin(['Short Description', 'Long Des

In [21]:
print(response.message.content)

{
  "pass": false
}
