In [1]:
import os
from urllib.parse import urlparse

domain = "https://www.grainger.com"
local_domain = urlparse(domain).netloc
# Create necessary directories if they don't exist
if not os.path.exists("text/"):
    os.mkdir("text/")
if not os.path.exists(f"text/{local_domain}/"):
    os.mkdir(f"text/{local_domain}/")
if not os.path.exists("processed"):
    os.mkdir("processed")

In [2]:
# TEST 
import re

# Pattern to match product skus/codes
regex_pattern = re.compile(r'[A-Z0-9]{5,7}')
# Test strings
test_strings = [
    "1DKW3_1.pdf",
    "3VE59C-Operating-Instructions-and-Parts-Manual.pdf",
    "_3M-Disposable-Respirator-Dual-4JF99?opr=PDPBRDSP&analytics=dsbrItems_5ZZZ6.txt"
]

# Extract product codes from test strings
for test in test_strings:
    matches = regex_pattern.findall(test)
    print(f"Matches in '{test}': {matches}")


Matches in '1DKW3_1.pdf': ['1DKW3']
Matches in '3VE59C-Operating-Instructions-and-Parts-Manual.pdf': ['3VE59C']
Matches in '_3M-Disposable-Respirator-Dual-4JF99?opr=PDPBRDSP&analytics=dsbrItems_5ZZZ6.txt': ['4JF99', 'PDPBRDS', '5ZZZ6']


In [13]:
# PULL product codes from web scraped data and save as json
import os
import re
import json

# Define the regex pattern for product codes
regex_pattern = re.compile(r'[A-Z0-9]{5,7}')

# Directory containing the files
directory = 'GraingerWebScrape/www.grainger.com'

# List to store found product codes
product_codes = []

# Function to extract product codes from text
def extract_product_codes(text):
    return regex_pattern.findall(text)

# Iterate through all files in the directory
for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)

        # Check for product codes in the file name
        codes_in_filename = extract_product_codes(file)
        product_codes.extend(codes_in_filename)

        # Check for product codes in the file content
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                codes_in_content = extract_product_codes(content)
                product_codes.extend(codes_in_content)
        except Exception as e:
            print(f"Could not read file {file_path}: {e}")

# Remove duplicates by converting the list to a set and back to a list
product_codes = list(set(product_codes))

# Save all product codes to a single JSON file
with open('all_product_codes.json', 'w') as f:
    json.dump(product_codes, f, indent=4)

print(f"Total product codes found: {len(product_codes)}")
print(f"Product codes saved in 'all_product_codes.json'")


Could not read file GraingerWebScrape/www.grainger.com/1DKW3_2.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/4NHG9_1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1DKW3_3.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1DKW3_1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/4NHG9_2.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/TraumaCube__XE7K_v1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1106-1600_PI_en_US1__JQ1I.pdf: 'utf-8' codec can't 

In [None]:
# FETCH DATA ON PRODUCT CODES FROM URL AND SAVE AS DATA FRAME
# UPDATE THE JSON FILE WITH VALID PRODUCT CODES
import os
import requests
import pandas as pd
import json

# Base URL and headers for the API
base_url = "https://mobile-rest-qa.nonprod.graingercloud.com/v1/product/detail"
headers = {
    "Content-Type": "application/json"
}

# Function to fetch and process data
def fetch_product_details(skus):
    params = {
        "partNumbers": skus,
        "extraInfo": "false"
    }
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        try:
            data = response.json()
            results = []
            for item in data:
                brand = item.get("brand", {}).get("name", "N/A")
                code = item.get("code", "N/A")
                name = item.get("name", "N/A")
                picture_url = item.get("pictureUrl600", "N/A")
                price = item.get("priceData", {}).get("formattedPrice", "N/A")
                description = item.get("productDetailsDescription", "N/A")

                results.append({
                    "Brand": brand,
                    "Code": code,
                    "Name": name,
                    "PictureUrl600": picture_url,
                    "Price": price,
                    "Description": description
                })

            return pd.DataFrame(results) if results else None
        except Exception as e:
            print(f"Error parsing response for {skus}: {e}")
            return None
    else:
        print(f"Failed to fetch details for {skus}: Status code {response.status_code}")
        return None

# Load the product codes from the JSON file
with open('all_product_codes.json', 'r') as f:
    product_codes = json.load(f)

print(f"Total product codes found: {len(product_codes)}")

# Product codes in chunks of 100
chunk_size = 1
chunks = [product_codes[i:i + chunk_size] for i in range(0, len(product_codes), chunk_size)]

# Iterate over each chunk for API requests
df = pd.DataFrame(columns=["Brand", "Code", "Name", "PictureUrl600", "Price", "Description"])
failed_chunks = []
for chunk in chunks:
    try:
        details = fetch_product_details(chunk)
        if details is not None:
            df = pd.concat([df, details], ignore_index=True)
        else:
            print(f"No details fetched for chunk: {chunk}. Removing from source.")
            failed_chunks.extend(chunk)
    except Exception as e:
        print(f"Failed to fetch details for chunk: {chunk}, Error: {e}")
        failed_chunks.extend(chunk)

# # Remove failed product codes from the source list
# product_codes = [code for code in product_codes if code not in failed_chunks]

# # Save the updated product codes to the JSON file
# with open('all_product_codes.json', 'w') as f:
#     json.dump(product_codes, f, indent=4)

# # Remove rows where all columns are NaN
# df = df.dropna(how='all')

# Ensure all column names are strings
df.columns = df.columns.astype(str)

# Save to Parquet
os.makedirs('processed', exist_ok=True)
df.to_parquet('processed/grainger_products.parquet', index=False)
print("Product details have been saved to 'processed/grainger_products.parquet'")
print("\nHead of DataFrame:")
print(df.head(), "\n")
print("Tail of DataFrame:")
print(df.tail(), "\n")
print("Size of DataFrame:", df.size, "\n")
print("Values in DataFrame:")
print(df.values)


Total product codes found: 3237
Failed to fetch details for ['1A912']: Status code 404
No details fetched for chunk: ['1A912']. Removing from source.
Failed to fetch details for ['39R838']: Status code 404
No details fetched for chunk: ['39R838']. Removing from source.
Failed to fetch details for ['4KN42']: Status code 404
No details fetched for chunk: ['4KN42']. Removing from source.
Failed to fetch details for ['447Y46']: Status code 404
No details fetched for chunk: ['447Y46']. Removing from source.
Failed to fetch details for ['404K04']: Status code 404
No details fetched for chunk: ['404K04']. Removing from source.
Failed to fetch details for ['60NT66']: Status code 404
No details fetched for chunk: ['60NT66']. Removing from source.
Failed to fetch details for ['12Y499']: Status code 404
No details fetched for chunk: ['12Y499']. Removing from source.
Failed to fetch details for ['493R87']: Status code 404
No details fetched for chunk: ['493R87']. Removing from source.
Failed to fe