In [11]:
import pandas as pd
parquet_file_directory = "processed"
parquet_file_path = os.path.join(parquet_file_directory, "grainger_products.parquet")

    # "modules/vector_index/processed/grainger_products.parquet"

print("Attempting to load file from:", parquet_file_path)

# Now attempt to load the file
try:
    df = pd.read_parquet(parquet_file_path)
    print("File loaded successfully!")
except FileNotFoundError as e:
    print("Error loading file:", e)

print(df.head())

Attempting to load file from: processed/grainger_products.parquet
File loaded successfully!
                          Brand    Code  \
0                       TSUBAKI   1A912   
1  LION FIRE BOOTS BY THOROGOOD   3XRG7   
2           GLOWEAR BY ERGODYNE   1CXK5   
3                      PEERLESS  39R838   
4                     TOUGH GUY   4KN42   

                                                Name  \
0                  TSUBAKI Chain Detacher: 60 to 100   
1  Insulated Firefighter Boots: Insulated, Steel,...   
2  GLOWEAR BY ERGODYNE Baseball Cap: Orange, Univ...   
3                          Tire Chain: Passenger, Pr   
4  TOUGH GUY Trash Bags: 56 gal Capacity, 43 in W...   

                                       PictureUrl600    Price  \
0  https://static.grainger.com/rp/s/is/image/Grai...  $137.57   
1  https://static.grainger.com/rp/s/is/image/Grai...  $197.55   
2  https://static.grainger.com/rp/s/is/image/Grai...   $13.93   
3  https://static.grainger.com/rp/s/is/image/Grai...

In [14]:
import time
import os
import pandas as pd
import logging
from datetime import datetime
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import FAISS
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import S3FileLoader
from bedrock_initializer import LLMInitializer
from data_frame_initializer import DataFrameSingleton
# from .bedrock_initializer import LLMInitializer
# from .data_frame_initializer import DataFrameSingleton



logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def log_creation_time(file_path):
    ctime = os.path.getctime(file_path)
    creation_time = datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M:%S')
    logging.info(f"File '{file_path}' was created on {creation_time}")


class Document:
    _instance = None
    _vector_index = None
    _df = None
    _llm = None
    _bedrock_embeddings = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super(Document, cls).__new__(cls)
        return cls._instance

    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    @classmethod
    def get_instance(cls, **kwargs):
        """Static access method to get the singleton instance, enforcing required arguments."""
        logging.info("Entering get_instance method")

        if cls._vector_index is None or cls._df is None:
            cls._llm = cls.initialize_llm()
            cls._bedrock_embeddings = cls.initialize_bedrock()
            documents = []
            # data_frame_singleton = DataFrameSingleton.get_instance()
            df.head()
            cls._df = df

            logging.info(f"DataFrame contains {cls._df.shape[0]} rows")

            for idx, (_, row) in enumerate(cls._df.iterrows()):
                logging.info(f"Processing row {idx + 1}/{cls._df.shape[0]} with code: {row['Code']}")
                page_content = f"{row['Code']} {row['Name']} {row['Brand']} {row['Description'] if pd.notna(row['Description']) else ''}"
                metadata = {
                    'Brand': row['Brand'],
                    'Code': row['Code'],
                    'Name': row['Name'],
                    'Description': row['Description'],
                    'Price': row['Price']
                }

                logging.debug(f"Page content for document {idx + 1}: {page_content}")
                logging.debug(f"Metadata for document {idx + 1}: {metadata}")

                # Check if the document is unique before appending
                if not any(doc.page_content == page_content for doc in documents):
                    documents.append(Document(page_content, metadata))
                else:
                    logging.warning(f"Duplicate document found for code: {row['Code']}")

            # Print the structured documents
            logging.info("Structured documents created:")
            for idx, doc in enumerate(documents[:5], 1):
                logging.info(f"Document {idx} of {len(documents)}:")
                logging.info(doc.page_content[:200])

            # Create FAISS vector store from structured documents
            logging.info("Creating FAISS vector store from structured documents...:", documents.pop().page_content[:200])
            start_time = time.time()
            cls._vector_index = FAISS.from_documents(documents=documents, embedding=cls._bedrock_embeddings)
            end_time = time.time()
            time_taken = end_time - start_time
            logging.info(f"Created FAISS vector store from structured documents in {time_taken} seconds.")

        return cls._vector_index, cls._llm, cls._bedrock_embeddings, cls._df

    @classmethod
    def recreate_index(cls, **kwargs):
        """Method to force the recreation of the vector index."""
        logging.info("Entering recreate_index method")
        cls._vector_index = None
        return cls.get_instance(**kwargs)

    @classmethod
    def initialize_llm(cls):
        logging.info("Setting up LLM")
        llm_initializer = LLMInitializer()
        llm, bedrock_runtime = llm_initializer.check_and_initialize_llm()
        if llm is None:
            logging.warning("Failed to initialize LLM")
            raise ValueError("Failed to initialize LLM")
        cls._llm = llm
        cls._bedrock_runtime = bedrock_runtime
        logging.info("LLM initialized")
        return cls._llm

    @classmethod
    def initialize_bedrock(cls):
        logging.info("Initializing Titan Embeddings Model...")
        bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=cls._bedrock_runtime)
        logging.info("Titan Embeddings Model initialized.")
        return bedrock_embeddings


In [15]:
document, llm, bedrock_embeddings, df = Document.get_instance()
print("Here")

2024-07-04 00:38:19,219 - INFO - Entering get_instance method
2024-07-04 00:38:19,221 - INFO - Setting up LLM
2024-07-04 00:38:19,223 - INFO - Checking if 'llm' is defined...
2024-07-04 00:38:19,224 - INFO - 'llm' initialized successfully.
2024-07-04 00:38:19,225 - INFO - LLM initialized
2024-07-04 00:38:19,226 - INFO - Initializing Titan Embeddings Model...
2024-07-04 00:38:19,227 - INFO - Titan Embeddings Model initialized.
2024-07-04 00:38:19,229 - INFO - DataFrame contains 3237 rows
2024-07-04 00:38:19,231 - INFO - Processing row 1/3237 with code: 1A912
2024-07-04 00:38:19,232 - INFO - Processing row 2/3237 with code: 3XRG7
2024-07-04 00:38:19,234 - INFO - Processing row 3/3237 with code: 1CXK5
2024-07-04 00:38:19,235 - INFO - Processing row 4/3237 with code: 39R838
2024-07-04 00:38:19,237 - INFO - Processing row 5/3237 with code: 4KN42
2024-07-04 00:38:19,239 - INFO - Processing row 6/3237 with code: 447Y46
2024-07-04 00:38:19,241 - INFO - Processing row 7/3237 with code: 404K04
2

Here


In [9]:
customer_input = "I am looking for waterproof insulated boots for my men working on my commercial deep sea fishing boat in the arctic. Must have large sizes."

In [10]:
## GET LIST OF PRODUCTS AND CODES
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
prompt_template2 = """Human: Extract list of 5 products and their respective physical IDs from catalog that matches the style given below. 
The catalog of products is provided under <catalog></catalog> tags below.
<catalog>
{context}
</catalog>
Style: {question}

The output should be a json of the form <products>[{{"product": <description of the product from the catalog>, "code":<code of the product from the catalog>}}, ...]</products>
Skip the preamble and always return valid json.
Assistant: """
PROMPT = PromptTemplate(
    template=prompt_template2, input_variables=["context", "question"]
)

# Use RetrievalQA customizations for improving Q&A experience
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=document.as_retriever(
        search_type="similarity", search_kwargs={"k": 6}
    ),
    return_source_documents=False,
    chain_type_kwargs={"prompt": PROMPT},
)

recs_response = qa({"query": customer_input})['result']
recs_response

  warn_deprecated(


' <products>\n[\n  {"product": "2UFU9 WESTWARD Mechanics Length Drill Bit: 1/8 in Drill Bit Size, 1 7/16 in Flute Lg, 1/8 in Shank Dia. WESTWARD <p>The black-and-gold-oxide finish on these drill bits lubricates the drill bit and provides some resistance to wear and rust. The bits are made from high-speed steel, which provides some flexibility and absorbs some shock and vibration.</p>", "code": "2UFU9"},\n  {"product": "2UFU9 WESTWARD Mechanics Length Drill Bit: 1/8 in Drill Bit Size, 1 7/16 in Flute Lg, 1/8 in Shank Dia. WESTWARD <p>The black-and-gold-oxide finish on these drill bits lubricates the drill bit and provides some resistance to wear and rust. The bits are made from high-speed steel, which provides some flexibility and absorbs some shock and vibration.</p>", "code": "2UFU9"},\n  {"product": "2UFU9 WESTWARD Mechanics Length Drill Bit: 1/8 in Drill Bit Size, 1 7/16 in Flute Lg, 1/8 in Shank Dia. WESTWARD <p>The black-and-gold-oxide finish on these drill bits lubricates the dri

In [3]:
import pandas as pd
import re
import html

# Example DataFrame
data = {
    'Code': ['1VCE8 ALTERNATIVE VENDOR', '2BZL6 PRIMARY VENDOR', '3CDE9 OTHER VENDOR']
}
df = pd.DataFrame(data)

def clean_code(code):
    # Extract the part before any space or other characters
    return re.split(r'\s|[-_()]+', code, 1)[0]

# Apply clean_code to 'Code' column
df['Code'] = df['Code'].apply(clean_code)

# Remove HTML characters from all columns (if any)
df = df.applymap(lambda x: html.unescape(x) if isinstance(x, str) else x)

print(df)


    Code
0  1VCE8
1  2BZL6
2  3CDE9
