In [1]:
# Sqlalchemy functions
from sqlalchemy import *
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *


# LangChain, CrewAI and Vertex AI imports
from langchain import hub
from langchain.agents import AgentExecutor, tool, load_tools
from crewai import Agent, Task, Crew, Process
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai import HarmBlockThreshold, HarmCategory
from vertexai.language_models import TextGenerationModel

#other necessary functions
import os
import json
import pandas as pd
import datetime
# Notebook-wide pandas display settings (attribute form of pd.set_option).
pd.options.display.max_colwidth = None  # None = unlimited: never truncate long cell text such as product titles
pd.options.display.expand_frame_repr = True  # let wide DataFrame reprs wrap across multiple lines


In [2]:
from modules.multiagent import ProductInfoExtractor
from modules.dq_engine import DqEngine

In [3]:
# Chat model used by the agents defined later in the notebook.
# temperature=0 and top_k=1 request the least-random decoding the API offers.
llm = ChatVertexAI(
    model_name="gemini-pro",
    temperature=0,
    top_k=1,
    # Per-category safety thresholds passed straight through to Vertex AI.
    safety_settings={
        HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
    },
)

# LangChain tools handed to agents that need arithmetic; "llm-math" is backed
# by the same model, hence the llm argument.
tools_name = ["llm-math"]
tools = load_tools(tools_name, llm=llm)

In [4]:
# Agent 1 - "Data Manager": reads a product title and returns the five
# quantity-related fields as a JSON object. It is given no tools; everything
# it knows about the task comes from the goal prompt below.
# Prompt fixes relative to the earlier version:
#   * the "no unit found" rule now sets net_content_uom (not
#     net_content_quantity) to Each, so it no longer contradicts the rule
#     that defaults net_content_quantity to 1;
#   * the Twix example quotes "Each" so every few-shot response is valid JSON.
manager = Agent(
    role='Data Manager',
    goal="""As the Data Manager, your objective is to analyze product titles provided by the user and extract following fields in JSON format:
      - "net_content_quantity"
      - "net_content_uom"
      - "count_per_pack"
      - "multi_quantity"
      - "total_quantity"

       Follow the guidelines:
        - If count_per_pack is not found, set count_per_pack to 1.
        - If multi_quantity is not found, set multi_quantity to 1.
        - If neither count_per_pack nor multi_quantity is found, set both to 1.
        - total_quantity equals count_per_pack multiplied by multi_quantity.
        - For net_content_uom look for unit of measure words like ounce, oz, fl.oz, pound, liter, etc. If none is found, set net_content_uom to Each.
        - For net_content_quantity look for numbers preceding unit of measure words like ounce, oz, fl.oz, etc. If none is found, set net_content_quantity to 1.

       For example, given the title "V8 +Energy, Juice Drink with Green Tea, Peach Mango, 8 oz. Can (4 packs of 6, Total of 24)," your response should be:
         {"net_content_quantity":8 ,"net_content_uom": "oz", "count_per_pack": 6, "multi_quantity": 4, "total_quantity": 24}

       Few More examples:
        - Title: V8, Strawberry Banana, 8 Ounce, 6 Count (Pack of 4)
          Response: {"net_content_quantity":8 ,"net_content_uom": "Ounce", "count_per_pack": 4, "multi_quantity": 6, "total_quantity": 24}
        - Title: V8 100% Vegetable Juice, Spicy Hot, 5.5 oz. Can, 6 Count
          Response: {"net_content_quantity":5.5 ,"net_content_uom": "oz", "count_per_pack": 1, "multi_quantity": 6, "total_quantity": 6}
        - Title: Twix Chocolate Cookie Bar Candy, 3 Pack
          Response: {"net_content_quantity":1 ,"net_content_uom": "Each", "count_per_pack": 1, "multi_quantity": 3, "total_quantity": 3}
        - Title: EO Botanical Sulfate Free Conditioner with Keratin, Coconut and Hibiscus
          Response: {"net_content_quantity":1,"net_content_uom": "Each", "count_per_pack": 1, "multi_quantity": 1, "total_quantity": 1}
       """,
    backstory="""You are a leading expert in product data for a top retail organization. You are an expert at extracting and filling the correct product data.""",
    verbose=True,  # echo the agent's chain of thought into the cell output
    allow_delegation=False,  # this agent must answer itself, not hand off
    llm=llm,  # ChatVertexAI instance created in an earlier cell
)

# Agent 2 - "Validation Supervisor": reviews the Data Manager's JSON and uses
# the Calculator tool (from `tools`) to recompute total_quantity.
# Prompt fixes relative to the earlier version:
#   * required key names no longer carry stray punctuation inside the quotes
#     (e.g. "net_content_uom," / "total_quantity."), which named keys that do
#     not exist;
#   * "multi_quanity" typo corrected to "multi_quantity";
#   * example responses are wrapped in braces so they are valid JSON, as the
#     first guideline requires.
supervisor = Agent(
    role='Validation Supervisor',
    goal="""As the Validation Supervisor, your objective is to review the Response generated by the Data Manager.
    Follow the guidelines:
        - the response must be valid JSON with the keys:
          - "net_content_quantity"
          - "net_content_uom"
          - "count_per_pack"
          - "multi_quantity"
          - "total_quantity"
        - the JSON values should be reflected in the product Title.
        - If count_per_pack is not given and multi_quantity is given, set count_per_pack to 1.
        - If multi_quantity is not given and count_per_pack is given, set multi_quantity to 1.
        - If neither count_per_pack nor multi_quantity is given, set both to 1.
    
    Use the Calculator tool to confirm that "count_per_pack" * "multi_quantity" = "total_quantity".
    
    For example:
    - Response:
      {"net_content_quantity":4 ,"net_content_uom": "Ounce", "count_per_pack": 6, "multi_quantity": 1, "total_quantity": 4}
      THEN
      6 * 1
      Answer:
      6
      Final Answer:
      {"net_content_quantity":4 ,"net_content_uom": "Ounce", "count_per_pack": 6, "multi_quantity": 1, "total_quantity": 6}
    
    Another example:
    - Response:
      {"net_content_quantity":2.5 ,"net_content_uom": "Ounce", "count_per_pack": 4, "multi_quantity": 3, "total_quantity": 2.5}
      THEN
      4 * 3
      Answer:
      12
      Final Answer:
      {"net_content_quantity":2.5 ,"net_content_uom": "Ounce", "count_per_pack": 4, "multi_quantity": 3, "total_quantity": 12}
    """,
    backstory="""You are a leading expert in product data for a top retail organization. You are an expert at reviewing product data and confirming it is correct.""",
    verbose=True,  # echo the agent's chain of thought into the cell output
    allow_delegation=False,  # this agent must answer itself, not hand off
    llm=llm,  # ChatVertexAI instance created in an earlier cell
    tools=tools,  # LangChain tools loaded in an earlier cell (llm-math)
)

In [5]:
# Agents in execution order: the manager extracts, then the supervisor reviews.
agents = [manager, supervisor]
# One task prompt per agent, in the same order as `agents`.
# `{product_title}` is a placeholder left unformatted here; it is presumably
# filled in per row by the extractor - confirm against ProductInfoExtractor.
# Typos fixed ("Its", "dont", "Reponse") so the supervisor is asked to review
# the "Response", matching the wording used in the agents' goals.
task_descriptions = [
    """
    It's your turn to extract the fields. You don't need to use a tool and REMEMBER only return the JSON, NOTHING ELSE.
    Title: {product_title}""",
    """Review the Response provided by the Data Manager for the Title: {product_title}.
    Use the Calculator tool to check "count_per_pack" * "multi_quantity"
    THIS PART IS CRITICAL:  
    - NEVER use "net_content_quantity" in the Calculator tool.
    - ALWAYS update the Response with the correct answer for "total_quantity" based on the results from the Calculator tool."""
]


In [6]:
# Establish a connection to the BigQuery database.
# The GCP project defaults to the sandbox project this notebook was written
# against, but can be overridden by exporting BIGQUERY_PROJECT before starting
# the kernel, so the notebook can be re-run elsewhere without editing code.
BIGQUERY_PROJECT = os.environ.get('BIGQUERY_PROJECT', 'us-gcp-ame-con-52dbb-sbx-1')
engine = create_engine(f'bigquery://{BIGQUERY_PROJECT}')

  engine = create_engine('bigquery://us-gcp-ame-con-52dbb-sbx-1')


In [7]:
# Project helpers, both constructed from the shared SQLAlchemy engine:
#   extractor - ProductInfoExtractor, used below for load_data / extract_attributes
#   dqe       - DqEngine (not used in the cells visible here)
extractor = ProductInfoExtractor(engine)
dqe = DqEngine(engine)

In [8]:
# Load the sample table through the extractor; `title` is the source column
# and the five quantity fields are the target attributes.
df = extractor.load_data(
    schema_name='test_datasets',
    table_name='sample_data_gdrive',
    uids=['asin', 'category', 'title'],
    source_attribute='title',
    target_attribute=[
        'net_content_quantity',
        'net_content_uom',
        'count_per_pack',
        'multi_quantity',
        'total_quantity',
    ],
)
# Show (rows, columns) of the loaded frame as the cell output.
df.shape

(22, 9)

In [9]:
# Inspect the column dtypes of the loaded frame (the recorded output shows
# every column, including the numeric-looking quantity fields, as `object`).
df.dtypes

asin                                  object
category                              object
title                                 object
net_content_quantity                  object
net_content_uom                       object
count_per_pack                        object
multi_quantity                        object
total_quantity                        object
TARGET_ATTRIBUTE_source_attributes    object
dtype: object

In [12]:
# Peek at the first record, kept as a one-row DataFrame (list indexer) so it
# renders as a table rather than a Series.
df.iloc[[0]]

Unnamed: 0,asin,category,title,net_content_quantity,net_content_uom,count_per_pack,multi_quantity,total_quantity,TARGET_ATTRIBUTE_source_attributes
0,B00WS3YLQ6,Prime_Pantry,"V8, Strawberry Banana, 8 Ounce, 6 Count (Pack of 4)",8,oz,4,6,24,"V8, Strawberry Banana, 8 Ounce, 6 Count (Pack of 4)"


In [11]:
#test_results = extractor.extract_attributes(df.iloc[[2,3,4],:],agents,task_descriptions)

In [12]:
#test_results

In [13]:
# Run both agents over every row of df; per the recorded log output this also
# writes one results table per target attribute to BigQuery.
extractor.extract_attributes(df, agents, task_descriptions)


        ----------------------------------------------
        product title: V8, Strawberry Banana, 8 Ounce, 6 Count (Pack of 4)
        ----------------------------------------------


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
Final Answer: {"net_content_quantity":8 ,"net_content_uom": "Ounce", "count_per_pack": 4, "multi_quantity": 6, "total_quantity": 24}[0m

[1m> Finished chain.[0m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Calculator
Action Input: 4 * 6[0m[36;1m[1;3mAnswer: 24[0m[32;1m[1;3mDo I need to use a tool? No
Final Answer: {"net_content_quantity":8 ,"net_content_uom": "Ounce", "count_per_pack": 4, "multi_quantity": 6, "total_quantity": 24}[0m

[1m> Finished chain.[0m

        ----------------------------------------------
        product title: Diet Pepsi, Bottles 16.9 Fl Oz, 6 Count
        --------------------------------------------

INFO:root:  **Writing results to BigQuery table: net_content_quantity_attr_extr_20242814455**


['net_content_quantity', 'net_content_uom', 'count_per_pack', 'multi_quantity', 'total_quantity']
net_content_quantity


2 out of 2 rows loaded.0:00<?, ?it/s]INFO:pandas_gbq.gbq:
100%|██████████| 1/1 [00:00<00:00, 1302.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['modified_original_col'] = df[original].apply(lambda x: ''.join(e for e in str(x) if e.isalnum()).strip().lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['modified_new_col'] = df[new].apply(lambda x: ''.join(e for e in str(x) if e.isalnum()).strip().lower())
INFO:root:  **Writing results to BigQuery table: net_content_uom_attr_extr_2024281450**


net_content_uom


2 out of 2 rows loaded.0:00<?, ?it/s]INFO:pandas_gbq.gbq:
100%|██████████| 1/1 [00:00<00:00, 708.14it/s]
INFO:root:  **Writing results to BigQuery table: count_per_pack_attr_extr_2024281453**


count_per_pack


8 out of 8 rows loaded.0:00<?, ?it/s]INFO:pandas_gbq.gbq:
100%|██████████| 1/1 [00:00<00:00, 529.78it/s]
INFO:root:  **Writing results to BigQuery table: multi_quantity_attr_extr_2024281457**


multi_quantity


7 out of 7 rows loaded.0:00<?, ?it/s]INFO:pandas_gbq.gbq:
100%|██████████| 1/1 [00:00<00:00, 910.02it/s]
INFO:root:  **Writing results to BigQuery table: total_quantity_attr_extr_20242814511**


total_quantity


4 out of 4 rows loaded.0:00<?, ?it/s]INFO:pandas_gbq.gbq:
100%|██████████| 1/1 [00:00<00:00, 1202.50it/s]
