  # Using GPT API to find the best product for a given query

We need to install the openai library in case it is not already available (e.g. Google Colab does not ship with openai by default).

In [None]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
import requests
from bs4 import BeautifulSoup
import openai

--------------------------------------------------------------------------------------------------------
<h1>Enter parameters here!</h1>

In [None]:
# API key the openai client sends with every request; replace the placeholder.
openai.api_key = 'Your API key here'
# Maximum number of search results to scrape and compare.
n_items=5
# Search phrase passed to the Amazon search and quoted in the GPT prompts.
search_query="portable radio"

-----------------------------------------------------------------------------------------------

In [None]:
def _find_text(soup, tag, attrs):
    """Return the stripped text of the first matching element, or None if absent."""
    node = soup.find(tag, attrs)
    return node.text.strip() if node is not None else None


def _extract_details(soup):
    """Return the product-details table as "label: value" lines ("" if missing)."""
    details_div = soup.find('div', {'id': 'productDetails_feature_div'})
    if details_div is None:
        return ""

    lines = []
    for row in details_div.find_all('tr'):
        label = row.find('th')
        value = row.find('td')
        # Skip rows without both cells instead of discarding every row
        # collected so far.
        if label is None or value is None:
            continue
        lines.append(f"{label.text.strip()}: {value.text.strip()}\n")
    return "".join(lines)


def _extract_about(soup):
    """Return the "About this item" bullets, one per line ("" if missing)."""
    about_div = soup.find('div', {'id': 'feature-bullets'})
    if about_div is None:
        return ""

    for ul in about_div.find_all('ul', class_='a-unordered-list'):
        # Only the list that follows the "About this item" heading is wanted.
        header = ul.find_previous('h1')
        if header is not None and header.get_text(strip=True) == 'About this item':
            return "".join(li.get_text(strip=True) + "\n" for li in ul.find_all('li'))
    return ""


def scrape_amazon_product_page(url, timeout=30):
    """Download an Amazon product page and extract its main attributes.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        A dict with the keys 'name', 'price', 'price_fraction', 'category',
        'link', 'delivery time', 'description', 'information', 'rating',
        'rating count' and 'about_text'. Single-element fields are None when
        the element is not on the page; 'information' and 'about_text' are
        empty strings in that case.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    rating = _find_text(soup, 'span', {'class': 'a-icon-alt'})
    rating_count = _find_text(soup, 'span', {'id': 'acrCustomerReviewText'})
    # A rating is only reported together with a review count; presumably the
    # generic 'a-icon-alt' selector can match unrelated icons otherwise.
    if not rating_count:
        rating = None

    return {
        'name': _find_text(soup, 'span', {'id': 'productTitle'}),
        'price': _find_text(soup, 'span', {'class': 'a-price-whole'}),
        'price_fraction': _find_text(soup, 'span', {'class': 'a-price-fraction'}),
        'category': _find_text(soup, 'a', {'class': 'a-link-normal a-color-tertiary'}),
        'link': url,
        'delivery time': _find_text(soup, 'div', {'id': 'ddmDeliveryMessage'}),
        'description': _find_text(soup, 'div', {'id': 'productDescription'}),
        'information': _extract_details(soup),
        'rating': rating,
        'rating count': rating_count,
        'about_text': _extract_about(soup),
    }

In [None]:
def search_amazon(product_name, timeout=30):
    """Search amazon.ca and collect the product links on the first result page.

    Args:
        product_name: Free-text search phrase. It is passed via ``params`` so
            that spaces and characters such as ``&`` are URL-encoded rather
            than corrupting the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping the part of each result link that precedes "/dp/"
        (the title slug) to "https://www.amazon.ca/dp/<product id>".

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'referer': 'https://google.com',
    }
    webpage = requests.get(
        "https://www.amazon.ca/s",
        params={'k': product_name, 'ref': 'nb_sb_noss_2'},
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(webpage.content, "html.parser")

    links_dict = {}
    for link in soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'}):
        href = link.get('href')
        # Anchors without an href or without a "/dp/" segment are not product links.
        if not href or "/dp/" not in href:
            continue
        title_slug, _, remainder = href.partition("/dp/")
        product_id = remainder.split("/")[0]
        links_dict[title_slug] = f"https://www.amazon.ca/dp/{product_id}"
    return links_dict

In [None]:
def ask_gpt(system, text, model, max_tokens):
    """Send one system + user exchange to the chat API and return the reply.

    Args:
        system: Content of the system message.
        text: Content of the user message.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    system_message = {"role": "system", "content": system}
    user_message = {"role": "user", "content": text}

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        max_tokens=max_tokens,
        temperature=0.1,
    )

    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()

In [None]:
#Got the summarizer idea from AutoGPT
#https://github.com/Significant-Gravitas/Auto-GPT

def summarize_text(text):
    """Summarize text with the gpt-3.5-turbo model, one chunk at a time.

    The text is split on newlines and packed into chunks of at most 8192
    characters; each chunk is summarized separately through ``ask_gpt`` and
    the partial summaries are joined with newlines.

    Args:
        text: The text to summarize. ``None``, empty or whitespace-only input
            yields an empty summary without calling the API.

    Returns:
        The newline-joined summaries of all chunks.
    """
    def split_text(text, max_length=8192):
        """Yield newline-joined groups of paragraphs no longer than max_length."""
        # Every paragraph is budgeted as len + 1 (for the joining newline), so
        # a single piece may hold at most max_length - 1 characters.
        piece_limit = max(max_length - 1, 1)
        current_chunk = []
        current_length = 0

        for paragraph in text.split("\n"):
            # Hard-split paragraphs that could never fit into one chunk; an
            # empty paragraph is kept as-is so blank lines survive.
            pieces = [
                paragraph[start:start + piece_limit]
                for start in range(0, len(paragraph), piece_limit)
            ] or [paragraph]

            for piece in pieces:
                cost = len(piece) + 1
                # Flush only when there is something to flush, so no empty
                # chunk is ever produced.
                if current_chunk and current_length + cost > max_length:
                    yield "\n".join(current_chunk)
                    current_chunk = []
                    current_length = 0
                current_chunk.append(piece)
                current_length += cost

        if current_chunk:
            yield "\n".join(current_chunk)

    if not text or not text.strip():
        return ""

    # Blank chunks carry nothing to summarize, so skip them.
    chunks = [chunk for chunk in split_text(text) if chunk.strip()]

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i + 1} / {len(chunks)}")
        summaries.append(ask_gpt("You are a summarizer that summarizes the given text. Do not include any other information in your response. Maintain the original style of the text in your response.",chunk,"gpt-3.5-turbo",300))

    return "\n".join(summaries)


In [None]:
#A class that shows HTML content in an IPython Notebook
class html_resolve:
    """Wrap an HTML string so that a notebook displays it as rendered HTML.

    When the markup contains a table, a small stylesheet that draws borders
    around the table and its cells is put in front of it.
    """

    def __init__(self, HTML_response):
        # Border stylesheet, spelled out line by line.
        table_css = (
            "<style>\n"
            "            table, th, td {\n"
            "              border: 1px solid black;\n"
            "            }\n"
            "          </style>"
        )
        has_table = '<table' in HTML_response
        self.text = table_css + HTML_response if has_table else HTML_response

    def _repr_html_(self):
        """Return the stored markup."""
        return self.text


In [None]:
# Run the Amazon search; the result maps each link's title slug to its product-page URL.
search_results = search_amazon(search_query)

In [None]:
# Scrape at most n_items of the search results.
# n_items itself is left untouched so the cell can be re-run safely; counting
# it down to zero made a second run (or n_items <= 0) scrape every result.
scrape_results = {}
n_items_initial = n_items
selected_results = list(search_results.items())[:max(n_items_initial, 0)]
for position, (product_title, product_url) in enumerate(selected_results, start=1):
    scrape_results[product_title] = scrape_amazon_product_page(product_url)
    print(f"{position}/{n_items_initial} Results for {product_title} saved.")

1/5 Results for /Panasonic-Portable-Radio-Silver-RF-2400 saved.
2/5 Results for /FosPower-Emergency-Portable-Household-Flashlight saved.
3/5 Results for /Sony-Portable-Radio-Audio-ICF506 saved.
4/5 Results for /Portable-Reception-Battery-Operated-Earphone saved.
5/5 Results for /Sony-ICF-P27-Portable-Radio-Speaker saved.


In [None]:
# Condense the long free-text fields of every scraped product into a short
# summary and keep only the fields needed for the comparison prompts.
required_info = {}
kept_fields = ('name', 'price', 'price_fraction', 'link', 'delivery time', 'rating', 'rating count')
# Report progress against the number of products actually scraped, which can
# be smaller than the number requested.
total_products = len(scrape_results)
for position, (product_title, product_info) in enumerate(scrape_results.items(), start=1):
    long_text = f"Product Description:{product_info['description']}\n Product Information: {product_info['information']}\n About Section: {product_info['about_text']}"
    summarized_info = {field: product_info[field] for field in kept_fields}
    summarized_info["information"] = summarize_text(long_text)
    required_info[product_title] = summarized_info
    print(f"{position}/{total_products} Results for {product_title} summarized.")


Summarizing chunk 1 / 1
1/5 Results for /Panasonic-Portable-Radio-Silver-RF-2400 summarized.
Summarizing chunk 1 / 1
2/5 Results for /FosPower-Emergency-Portable-Household-Flashlight summarized.
Summarizing chunk 1 / 1
3/5 Results for /Sony-Portable-Radio-Audio-ICF506 summarized.
Summarizing chunk 1 / 1
4/5 Results for /Portable-Reception-Battery-Operated-Earphone summarized.
Summarizing chunk 1 / 1
5/5 Results for /Sony-ICF-P27-Portable-Radio-Speaker summarized.


In [None]:
# Ask the model for an HTML comparison table of the summarized products.
reviewer_role = (
    "You are a product reviewer. You suggest products between a selection based on their properties "
    f"and the user's inital query, which is {search_query}. "
    "Summarize long names and avoid long descriptions. "
    "You only respond in a way that is parsable by HTML."
)
table_request = (
    "Create a comparison table for the given products, including columns for price, link, "
    "a column named 'Unique Properties' which explains what sets them apart from the others, "
    "and rating, ranked by preference. include only very short names. "
    "Only respond in HTML.\n\n\n"
) + str(required_info)
HTML_response = ask_gpt(reviewer_role, table_request, "gpt-3.5-turbo", 1200)

In [None]:
# Last expression of the cell: the wrapper exposes _repr_html_ so the notebook can render the reply as HTML.
html_resolve(HTML_response)

Name,Price,Link,Unique Properties,Rating
Panasonic Portable Radio,$44.99,Link,"AC/DC, large band indicator, fold-down handle",4.5
FosPower Emergency Radio,$39.90,Link,"Solar charging, hand crank, power bank, NOAA emergency broadcasts",4.4
Sony ICF-506 Portable Radio,$59.99,Link,"Analog tuning, carrying handle, earphone output",4.5
LEOTEC Portable AM FM Radio,$34.99,Link,"Retractable antenna, big speaker, clear dial, earphone jack",4.3
Sony ICF-P27 Portable Radio,$39.99,Link,"Hand strap, LED tuning indicator, headphone jack",4.3


In [None]:
# Ask the model for a short verdict per product and an overall recommendation.
reviewer_role = (
    "You are a product reviewer. You suggest products between a selection based on their properties "
    f"and the user's inital query, which is {search_query}. "
    "Summarize long names and avoid long descriptions. "
    "You only respond in a way that is parsable by HTML."
)
verdict_request = (
    "For each product, please provide a brief opinion (verdict) and rank them in order of preference. "
    "At the end, tell me which one is the best option and why. "
    "Only respond in HTML.\n\n\n"
) + str(required_info)
HTML_response = ask_gpt(reviewer_role, verdict_request, "gpt-3.5-turbo", 1200)

In [None]:
# Last expression of the cell: the wrapper exposes _repr_html_ so the notebook can render the reply as HTML.
html_resolve(HTML_response)