In [1]:
from bs4 import BeautifulSoup
import requests
import json
import queue
import time
import os
from requests.exceptions import ProxyError, HTTPError

In [2]:
walmart_url = "https://www.walmart.com"
OUTPUT_FILE = "product_info.jsonl"

In [3]:
HEADERS = {
    "Accept" : "*/*",
    "Accept-Encoding" : "gzip, deflate, br, zstd",
    "Accept-Language" : "en-US,en;q=0.9",
    "content-length" : "2388",
    "content-type" : "application/x-www-form-urlencoded",
    "DNT": "1",
    "origin" : "https://www.walmart.com",
    "priority" : "u=1, i",
    "Referer" : "https://www.walmart.com/",
    "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
    "sec-fetch-dest" : "empty",
    "sec-fetch-mode" : "cors",
    "sec-fetch-site" : "cross-site"
}


In [4]:
# Bright Data - Create a Proxy - Get the data from Overview
# Documentation - Data Center - Send your first request - Python Code

host = 'brd.superproxy.io'
port = 33335
# username = 'brd-customer-<customer_id>-zone-<zone_name>'
# password = '<zone_password>'

proxy_url = f'http://{username}:{password}@{host}:{port}'

proxies = {
    'http': proxy_url,
    'https': proxy_url
}


In [5]:
search_queries = ["computers", "laptops", "desktops", "monitors", "printers", "hard+drives", "usb", "cords", "cameras",
                  "mouse", "keyboard", "microphones", "speakers", "radio", "tablets", "android", "apple", "watch", "smart+watch",
                  "fridge", "airconditioning", "wifi", "router", "modem", "desk", "xbox", "playstation", "nintendo"]

In [6]:
product_queue = queue.Queue()
seen_urls = set()

In [7]:
def get_product_links(query, page_number):
  search_url = f"https://www.walmart.com/search?q={query}&page={page_number}"
  max_retries = 3
  backoff_factor = 3

  for attempt in range(max_retries):
    try:
      response = requests.get(search_url,headers = HEADERS,proxies = proxies)
      response.raise_for_status()

      soup = BeautifulSoup(response.text, 'html.parser')

      links = soup.find_all('a', href = True)

      product_links = []
      found = False

      for link in links:
        link_href = link['href']
        if "/ip/" in link_href:
          fount = True
          if "https" in link_href:
            full_url = link_href
          else:
            full_url = walmart_url + link_href

          if full_url not in seen_urls:
            product_links.append(full_url)

      if not found:
        print("\n \n \n SOUP WHEN NOT FOUND")

      return product_links

    except ProxyError as e:
      wait_time = backoff_factor ** attempt
      print(f"Proxy error: {e}. Retrying in {wait_time} seconds...")
      time.sleep(wait_time)

    except HTTPError as e:
      if e.response.status_code == 412:
        print(f"Precondition Failed (412): {e}. Skipping URL.")
        break
      wait_time = backoff_factor ** attempt
      print(f"HTTP error: {e}. Retrying in {wait_time} seconds...")
      time.sleep(wait_time)

    except Exception as e:
      print(f"Failed to get product links for query: {query} on page: {page_number}. Error: {e}")
      break

  print(f"Skipping query after {max_retries} retries: {query} on page: {page_number}")
  return []

def extract_product_info(product_url):
  print("Processing URL", product_url)
  max_retries = 5
  backoff_factor = 3

  for attempt in range(max_retries):
    try:
      response = requests.get(product_url,headers = HEADERS, proxies = proxies)
      response.raise_for_status()

      soup = BeautifulSoup(response.text, 'html.parser')

      script_tag = soup.find('script', id = "__NEXT_DATA__")

      if script_tag is None:
        return None

      # Converting to json
      data = json.loads(script_tag.string)

      # Getting the data
      initial_data = data['props']['pageProps']['initialData']['data']
      product_data = initial_data['product']
      reviews_data = initial_data.get('reviews', {})

      product_info = {
          'price': product_data['priceInfo']['currentPrice']['price'],
          'review_count': reviews_data.get('totalReviewCount', 0),
          'item_id' : product_data['usItemId'],
          'avg_ratings' : reviews_data.get('averageOverallRating', 0),
          'product_name' : product_data['name'],
          'brand' : product_data.get('brand',""),
          'availablity' : product_data['availabilityStatus'],
          'image_url' : product_data['imageInfo']['thumbnailUrl'],
          'short_description' : product_data.get('shortDescription',"")
      }

      return product_info

    except ProxyError as e:
      wait_time = backoff_factor ** attempt
      print(f"Proxy error: {e}. Retrying in {wait_time} seconds...")
      time.sleep(wait_time)

    except HTTPError as e:
      if e.response.status_code == 412:
        print(f"Precondition Failed (412): {e}. Skipping URL.")
        break
      wait_time = backoff_factor ** attempt
      print(f"HTTP error: {e}. Retrying in {wait_time} seconds...")
      time.sleep(wait_time)

    except Exception as e:
      print(f"Failed to process URL: {product_url}. Error: {e}")
      break

  print(f"Skipping URL after {max_retries} retries: {product_url}")
  return None


def main():
  with open(OUTPUT_FILE, 'w') as file:
    while search_queries:
      current_query = search_queries.pop(0)
      print("\n \n CURRENT QUERY", current_query,"\n\n")
      page_number = 1

    while True:
      product_links = get_product_links(current_query,page_number)
      if not product_links or page_number > 99:
        break

      for link in product_links:
        if link not in seen_urls:
          product_queue.put(link)
          seen_urls.add(link)

      while not product_queue.empty():
        product_url = product_queue.get()
        product_info = extract_product_info(product_url)
        if product_info:
          file.write(json.dumps(product_info) + '\n')

      page_number += 1
      print(page_number)


if __name__ == "__main__":
  main()


 
 CURRENT QUERY computers 



 
 CURRENT QUERY laptops 



 
 CURRENT QUERY desktops 



 
 CURRENT QUERY monitors 



 
 CURRENT QUERY printers 



 
 CURRENT QUERY hard+drives 



 
 CURRENT QUERY usb 



 
 CURRENT QUERY cords 



 
 CURRENT QUERY cameras 



 
 CURRENT QUERY mouse 



 
 CURRENT QUERY keyboard 



 
 CURRENT QUERY microphones 



 
 CURRENT QUERY speakers 



 
 CURRENT QUERY radio 



 
 CURRENT QUERY tablets 



 
 CURRENT QUERY android 



 
 CURRENT QUERY apple 



 
 CURRENT QUERY watch 



 
 CURRENT QUERY smart+watch 



 
 CURRENT QUERY fridge 



 
 CURRENT QUERY airconditioning 



 
 CURRENT QUERY wifi 



 
 CURRENT QUERY router 



 
 CURRENT QUERY modem 



 
 CURRENT QUERY desk 



 
 CURRENT QUERY xbox 



 
 CURRENT QUERY playstation 



 
 CURRENT QUERY nintendo 



 
 
 SOUP WHEN NOT FOUND
Processing URL https://www.walmart.com/ip/Nintendo-Entertainment-System-NES-Classic-Edition-Original-Comes-with-2-Controllers/6828461990?classType=REGULAR&at