In [None]:

!pip install openai

In [None]:
import os
from getpass import getpass

# Never paste real keys into the notebook source. Keys that are already set in
# the environment are kept as-is; any missing key is requested interactively
# so nothing secret ends up saved in (or committed with) the notebook.
for _env_name, _prompt in (
    ("OPENAI_API_KEY", "OpenAI API key: "),
    ("PINECONE_KEY", "Pinecone API key: "),
):
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(_prompt)

In [None]:
!pip install beautifulsoup4 requests



# **Scraping PartSelect HTML**

We collect the category links for Dishwasher and Refrigerator parts.

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


url = "https://www.partselect.com/Dishwasher-Parts.htm"
base_url = "https://www.partselect.com"

fridge_url = "https://www.partselect.com/Refrigerator-Parts.htm"

# Fetch both category landing pages. The timeout keeps the cell from hanging
# forever, and raise_for_status() makes a blocked or failed request fail loudly
# instead of silently producing an empty link list further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
f_response = requests.get(fridge_url, timeout=30)
f_response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
f_soup = BeautifulSoup(f_response.text, 'html.parser')

# Anchors whose visible text starts with the appliance name.
links = soup.find_all('a', string=lambda text: text and text.startswith('Dishwasher'))
fridge_links = f_soup.find_all('a', string=lambda text: text and text.startswith('Refrigerator'))

# Absolute category URLs, de-duplicated while keeping first-seen order.
overall_links = []
_seen_urls = set()

for link in links + fridge_links:
    href = link.get('href')
    if not href:
        # The anchors were matched on their text only, so one may lack an href.
        continue
    # urljoin also copes with hrefs that are already absolute or lack a leading "/".
    full_url = urljoin(base_url, href)
    if full_url not in _seen_urls:
        _seen_urls.add(full_url)
        overall_links.append(full_url)

In [None]:
# Preview only the first few anchors instead of dumping the whole list.
fridge_links[:5]

[<a href="/Refrigerator-Parts.htm">Refrigerator</a>,
 <a href="/Refrigerator-Parts.htm">Refrigerator</a>,
 <a href="/Refrigerator-Trays-and-Shelves.htm">Refrigerator Trays and Shelves</a>,
 <a href="/Refrigerator-Drawers-and-Glides.htm">Refrigerator Drawers and Glides</a>,
 <a href="/Refrigerator-Filters.htm">Refrigerator Filters</a>,
 <a href="/Refrigerator-Ice-Makers.htm">Refrigerator Ice Makers</a>,
 <a href="/Refrigerator-Seals-and-Gaskets.htm">Refrigerator Seals and Gaskets</a>,
 <a href="/Refrigerator-Hardware.htm">Refrigerator Hardware</a>,
 <a href="/Refrigerator-Door-Shelves.htm">Refrigerator Door Shelves</a>,
 <a href="/Refrigerator-Hinges.htm">Refrigerator Hinges</a>,
 <a href="/Refrigerator-Lights-and-Bulbs.htm">Refrigerator Lights and Bulbs</a>,
 <a href="/Refrigerator-Switches.htm">Refrigerator Switches</a>,
 <a href="/Refrigerator-Thermostats.htm">Refrigerator Thermostats</a>,
 <a href="/Refrigerator-Motors.htm">Refrigerator Motors</a>,
 <a href="/Refrigerator-Caps-and-L

# Getting information from DOM elements

We scrape individual DOM elements and temporarily store them in a text file.

In [None]:
from bs4 import BeautifulSoup
import requests

def _text_or_default(node, tag_name, css_class, default):
    """Return the stripped text of the first matching descendant of ``node``, else ``default``."""
    match = node.find(tag_name, class_=css_class)
    return match.get_text(strip=True) if match is not None else default


def parse_link(link, timeout=30):
    """Scrape the part cards found on one page.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the HTTP response (passed to
            ``requests.get``) so a stalled connection cannot hang the notebook.

    Yields:
        One 8-tuple of strings per ``<div>`` with class ``nf__part mb-3``:
        ``(title, part_select_number, manufacturer_part_number, detail_text,
        symptoms, instruction, instruction_quote, price)``. A field whose
        element is missing is replaced by its ``'No ...'`` placeholder.
    """
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    for item in soup.find_all('div', class_='nf__part mb-3'):
        title = _text_or_default(item, 'a', 'nf__part__detail__title', 'No Title')
        part_select_number = _text_or_default(
            item, 'div', 'nf__part__detail__part-number', 'No PartSelect Number')
        manufacturer_part_number = _text_or_default(
            item, 'div', 'nf__part__detail__part-number mb-2', 'No Manufacturer Part Number')
        # Full text of the card, with pieces joined by single spaces.
        detail_text = item.get_text(separator=' ', strip=True)
        symptoms = _text_or_default(
            item, 'div', 'nf__part__detail__symptoms mt-3', 'No Symptoms')
        instruction = _text_or_default(
            item, 'div', 'nf__part__detail__instruction mt-3', 'No Instruction')
        instruction_quote = _text_or_default(
            item, 'div', 'nf__part__detail__instruction__quote mt-1', 'No Instruction Quote')
        price = _text_or_default(item, 'div', 'mt-sm-2 price', 'No Price')

        yield (title, part_select_number, manufacturer_part_number, detail_text,
               symptoms, instruction, instruction_quote, price)


# Dump every scraped part as a text record. UTF-8 is requested explicitly
# because the scraped text contains non-ASCII characters (e.g. the "★" rating
# glyphs) that a platform-default encoding cannot always represent.
with open('product_info.txt', 'w', encoding='utf-8') as file:
    for link in overall_links:
        for (title, part_select_number, manufacturer_part_number, detail_text,
             symptoms, instruction, instruction_quote, price) in parse_link(link):
            file.write(
                f"Title: {title}\n"
                f"{part_select_number}\n"
                f"{manufacturer_part_number}\n"
                f"Part Detail/Instructions: {detail_text}\n"
                f"Symptoms: {symptoms}\n"
                f"Instruction: {instruction}\n"
                f"Instruction Quote: {instruction_quote}\n"
                f"Price: {price}\n\n"
                "END OF LINE\n\n"
            )
        # NOTE(review): this line is written once per page, after the last
        # "END OF LINE" marker, so chunk_text() never includes it in a chunk.
        # Confirm whether the source URL was meant to be part of each record.
        file.write(f"Source: {link}\n\n")

In [None]:
# Read the scraped records back as UTF-8; the text contains non-ASCII
# characters such as "★" that must not depend on the platform default encoding.
with open('product_info.txt', 'r', encoding='utf-8') as file:
    file_text = file.read()

In [None]:
# Preview the beginning of the scraped text instead of dumping the whole file.
file_text[:500]

'Title: Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides\nPartSelect Number PS10065979\nManufacturer Part Number W10712395\nPart Detail/Instructions: Videos! Your Price $ 47.83 In Stock Add to cart Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides ★★★★★ ★★★★★ 597 Reviews PartSelect Number PS10065979 Manufacturer Part Number W10712395 This authentic Dishwasher Upper Rack Adjuster Kit with white wheels is a kit made up of primarily plastic and metal components, which will require a screwdriver to install. This kit comes with all the necessary parts for a dishwasher rack adjuster.  It fits onto the dishwasher rack and connects it to the track allowing it to slide in and out. If broken then the rack will no longer slide properly, and the part must be replaced. In many cases, our customers have reported the top rack of their dishwasher hanging down or sagging due to the adjuster kit, particularly if the wheels, have broken or failed. If you are h

# Creating a custom chunker

For this particular use case, long-term context isn't as important as short-term context, so I decided to chunk together the product details for each part number or manufacturer number.

In [None]:
def chunk_text(text, start_marker="Title:", end_marker="END OF LINE"):
    """Split ``text`` into records delimited by marker lines.

    A record opens on any line containing ``start_marker`` and stays open
    through the next line containing ``end_marker`` (that line is kept as part
    of the record). Lines seen while no record is open are ignored.

    Args:
        text: Full text to split; lines are separated by ``'\\n'``.
        start_marker: Substring that opens a new record.
        end_marker: Substring that closes the current record.

    Returns:
        A list of records, each the record's lines re-joined with ``'\\n'``.
    """
    records = []
    buffered = []
    recording = False

    for row in text.split('\n'):
        if start_marker in row:
            # A new record begins: emit whatever was buffered before it.
            if buffered:
                records.append('\n'.join(buffered))
                buffered = []
            recording = True

        if recording:
            buffered.append(row)

        if end_marker in row:
            # Stop collecting; the buffer is emitted at the next start marker
            # or at the end of the text.
            recording = False

    # Emit the final record, if any.
    if buffered:
        records.append('\n'.join(buffered))

    return records

In [None]:
# Split the scraped product text into one chunk per "Title:" ... "END OF LINE" record.
chunks = chunk_text(file_text)

In [None]:
# Preview the first few chunks instead of dumping the whole list.
chunks[:3]

['Title: Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides\nPartSelect Number PS10065979\nManufacturer Part Number W10712395\nPart Detail/Instructions: Videos! Your Price $ 47.83 In Stock Add to cart Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides ★★★★★ ★★★★★ 597 Reviews PartSelect Number PS10065979 Manufacturer Part Number W10712395 This authentic Dishwasher Upper Rack Adjuster Kit with white wheels is a kit made up of primarily plastic and metal components, which will require a screwdriver to install. This kit comes with all the necessary parts for a dishwasher rack adjuster.  It fits onto the dishwasher rack and connects it to the track allowing it to slide in and out. If broken then the rack will no longer slide properly, and the part must be replaced. In many cases, our customers have reported the top rack of their dishwasher hanging down or sagging due to the adjuster kit, particularly if the wheels, have broken or failed. If you are 

# Getting links for Popular models

We scrape the links for popular models.

In [None]:
model_links = []


def find_model_links(page_soup):
    """Collect the absolute URL of every ``/Models/...`` link on a parsed page.

    New URLs are appended to the module-level ``model_links`` list. A URL that
    is already in the list is skipped, so a model linked several times (or
    from both category pages) is only fetched once later on.

    Args:
        page_soup: Parsed page whose ``<a href=...>`` tags are scanned.
    """
    for link in page_soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('/Models/'):
            continue
        full_link = base_url + href
        if full_link not in model_links:
            model_links.append(full_link)


find_model_links(f_soup)
find_model_links(soup)

# Storing part and manufacturer numbers associated with each model

In [None]:
# Write one text record per model page: a "Title:" line, a description line,
# the PartSelect / manufacturer number lines found on the page, then the
# "END OF LINE" marker that chunk_text() splits on.
with open('part_details.txt', 'w') as file:
    for link in model_links:
        # Dedicated names keep the category page stored in the global `soup`
        # (which the find_model_links cell reads) from being overwritten.
        model_response = requests.get(link, timeout=30)
        if model_response.status_code != 200:
            continue

        model_soup = BeautifulSoup(model_response.text, 'html.parser')

        model_name_section = model_soup.find(id='PartsSectionTitle')
        if model_name_section:
            model_name = model_name_section.text.strip()
            file.write(f"Title: {model_name}\n")
            # The trailing newline keeps the first part-number line from being
            # glued onto the end of this sentence.
            file.write(f"These are the part numbers and manufacturer numbers compatible with Model:{model_name}\n")

        for span in model_soup.find_all('span', class_='bold'):
            if 'PartSelect #:' in span.text or 'Manufacturer #:' in span.text:
                # The label span's parent holds both the label and the number.
                div_text = span.parent.text.strip()
                file.write(f"{div_text}\n")

        file.write("END OF LINE\n\n")




In [None]:
# Load the per-model records written to part_details.txt.
with open('part_details.txt', 'r') as file:
        part_text = file.read()

# Split them with the same "Title:" / "END OF LINE" markers used for the product text.
model_chunks = chunk_text(part_text)

In [None]:
# Preview the first few model chunks instead of dumping the whole list.
model_chunks[:3]

In [None]:
# Product chunks first, then model chunks; a chunk's list position later becomes its vector id.
combined_chunks = chunks + model_chunks

In [None]:
# Preview the first few combined chunks instead of dumping the whole list.
combined_chunks[:3]

['Title: Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides\nPartSelect Number PS10065979\nManufacturer Part Number W10712395\nPart Detail/Instructions: Videos! Your Price $ 47.83 In Stock Add to cart Dishwasher Upper Rack Adjuster Kit - White Wheels, Left and Right Sides ★★★★★ ★★★★★ 597 Reviews PartSelect Number PS10065979 Manufacturer Part Number W10712395 This authentic Dishwasher Upper Rack Adjuster Kit with white wheels is a kit made up of primarily plastic and metal components, which will require a screwdriver to install. This kit comes with all the necessary parts for a dishwasher rack adjuster.  It fits onto the dishwasher rack and connects it to the track allowing it to slide in and out. If broken then the rack will no longer slide properly, and the part must be replaced. In many cases, our customers have reported the top rack of their dishwasher hanging down or sagging due to the adjuster kit, particularly if the wheels, have broken or failed. If you are 

In [None]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-3.2.2-py3-none-any.whl (215 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/215.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/215.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-3.2.2


# Creating the Embedding function and setting up Pinecone vector DB

In [None]:
from pinecone import Pinecone, ServerlessSpec

# API credentials are read from the environment; .get() yields None when a variable is unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_KEY')
# Embedding model name sent with every get_embedding() request.
OPENAI_EMBEDDING_MODEL = 'text-embedding-ada-002'
# Chat model name; not referenced in the visible part of this notebook.
CHATGPT_MODEL = 'gpt-4-1106-preview'

def get_embedding(chunk):
    """Return the OpenAI embedding for one piece of text.

    Args:
        chunk: Text to embed; sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` value of the first item in the response's ``data`` list.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so the
            real failure is reported instead of a ``KeyError`` on ``"data"``.
    """
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
      'content-type': 'application/json; charset=utf-8',
      'Authorization': f"Bearer {OPENAI_API_KEY}"
    }
    payload = {
      'model': OPENAI_EMBEDDING_MODEL,
      'input': chunk
    }
    # requests serializes the payload itself via `json=`, so this function does
    # not need the `json` module to be imported anywhere in the notebook.
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]

# Name of the Pinecone index that receives the chunk embeddings.
PINECONE_INDEX_NAME = 'index235'

# Pinecone client shared by the index-management and upload code.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Vector dimension passed to create_index(); presumably the output size of
# OPENAI_EMBEDDING_MODEL — verify if the model changes.
EMBEDDING_DIMENSION = 1536



# Function to embed each chunk of data and upsert it to the Pinecone vector DB

In [None]:
import json
import requests
from openai import OpenAI

def batch_upsert(index, vectors, batch_size=50):
    """Upsert ``vectors`` into ``index`` in consecutive slices.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        vectors: Sequence of items to upload; it is sliced, not modified.
        batch_size: Maximum number of items sent per ``upsert`` call.
    """
    offsets = range(0, len(vectors), batch_size)
    # Lazy generator: each slice is built right before it is uploaded.
    for piece in (vectors[start:start + batch_size] for start in offsets):
        index.upsert(vectors=piece)

def embed_chunks_and_upload_to_pinecone(chunks, index_name):
    """Rebuild a Pinecone index from scratch with one vector per chunk.

    An existing index with the same name is deleted first. A new cosine index
    of ``EMBEDDING_DIMENSION`` dimensions is then created, every chunk is
    embedded with ``get_embedding`` and the results are uploaded in batches.

    Args:
        chunks: Sequence of text chunks. Each chunk's position becomes its
            vector id, and its text is stored under the ``description``
            metadata key.
        index_name: Name of the Pinecone index to (re)create.
    """
    # list_indexes() yields index descriptions rather than plain names, so the
    # membership test has to go through .names(); testing the string against
    # the listing itself never matches and a re-run would fail on create_index.
    if index_name in pc.list_indexes().names():
        print("\nIndex already exists. Deleting index ...")
        pc.delete_index(name=index_name)

    print("\nCreating a new index: ", index_name)
    pc.create_index(name=index_name,
                    dimension=EMBEDDING_DIMENSION,
                    metric='cosine',
                    spec=ServerlessSpec(cloud="aws", region="us-west-2"))

    index = pc.Index(index_name)
    print("\nEmbedding chunks using OpenAI ...")
    # Each entry is an (id, values, metadata) tuple; metadata must be a dict.
    embeddings_with_ids = [
        (str(i), get_embedding(chunk), {"description": chunk})
        for i, chunk in enumerate(chunks)
    ]

    print("\nUploading chunks to Pinecone in batches ...")
    batch_upsert(index, embeddings_with_ids, batch_size=50)

    print(f"\nUploaded {len(chunks)} chunks to Pinecone index '{index_name}'.")

In [None]:
!pip install requests



# Calling the function to upsert all chunks to Pinecone

In [None]:
# Embeds every chunk via OpenAI and upserts the vectors into the Pinecone index (created inside the call).
embed_chunks_and_upload_to_pinecone(combined_chunks, PINECONE_INDEX_NAME)


Creating a new index:  index235

Embedding chunks using OpenAI ...

Uploading chunks to Pinecone in batches ...

Uploaded 906 chunks to Pinecone index 'index235'.


In [None]:
# Preview the first few model URLs instead of dumping the whole list.
model_links[:10]