In [10]:
import os
from urllib.parse import urlparse

domain = "https://www.grainger.com"
local_domain = urlparse(domain).netloc
# Create necessary directories if they don't exist
if not os.path.exists("text/"):
    os.mkdir("text/")
if not os.path.exists(f"text/{local_domain}/"):
    os.mkdir(f"text/{local_domain}/")
if not os.path.exists("processed"):
    os.mkdir("processed")

In [11]:
import requests
import json


# List of part numbers to query
part_number = "1VCE8"  # Add more part numbers as needed

# Base URL for the endpoint
base_url = "https://mobile-rest-qa.nonprod.graingercloud.com/v1/product/detail"

# File to store the results
file_path = f"text/{local_domain}/{part_number}.txt"

# Headers for the request
headers = {
    "Content-Type": "application/json"
}

# Function to fetch and process data
def fetch_product_details(part_number):
    params = {
        "partNumbers": part_number,
        "extraInfo": "false"
    }
    
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()[0]  # Assuming the response contains a list with one item

        brand = data.get("brand", {}).get("name", "N/A")
        code = data.get("code", "N/A")
        name = data.get("name", "N/A")
        picture_url = data.get("pictureUrl600", "N/A")
        price = data.get("priceData", {}).get("formattedPrice", "N/A")
        description = data.get("productDetailsDescription", "N/A")

        return {
            "Brand": brand,
            "Code": code,
            "Name": name,
            "PictureUrl600": picture_url,
            "Price": price,
            "Description": description
        }
    else:
        return None

# Fetch details for each part number and write to file
with open(file_path, "w") as file:
    details = fetch_product_details(part_number)
    if details:           
        # file.write(json.dumps(details) + "\n")
        print(details)
    else:
        file.write(f"Failed to fetch details for part number: {part_number}\n")

print("Product details have been written to", file_path)


{'Brand': 'DAYTON', 'Code': '1VCE8', 'Name': 'DAYTON Standard-Duty Industrial Fan: 24 in Blade Dia, 2 Speeds, 3,850/6,200 cfm, 115 V AC', 'PictureUrl600': 'https://static.grainger.com/rp/s/is/image/Grainger/1VCF3_AS02?$lgmain$', 'Price': '$474.08', 'Description': '<p>Standard-duty industrial fan heads provide cooling in heavy manufacturing areas and other dusty or dirty environments. These fan blade, motor, and guard assemblies can be paired with a new or existing bracket or base.</p>'}
Product details have been written to text/www.grainger.com/1VCE8.txt


In [12]:
import re

# Pattern to match product skus/codes
regex_pattern = re.compile(r'[A-Z0-9]{5,7}')
# Test strings
test_strings = [
    "1DKW3_1.pdf",
    "3VE59C-Operating-Instructions-and-Parts-Manual.pdf",
    "_3M-Disposable-Respirator-Dual-4JF99?opr=PDPBRDSP&analytics=dsbrItems_5ZZZ6.txt"
]

# Extract product codes from test strings
for test in test_strings:
    matches = regex_pattern.findall(test)
    print(f"Matches in '{test}': {matches}")


Matches in '1DKW3_1.pdf': ['1DKW3']
Matches in '3VE59C-Operating-Instructions-and-Parts-Manual.pdf': ['3VE59C']
Matches in '_3M-Disposable-Respirator-Dual-4JF99?opr=PDPBRDSP&analytics=dsbrItems_5ZZZ6.txt': ['4JF99', 'PDPBRDS', '5ZZZ6']


In [13]:
import os
import re
import json

# Define the regex pattern for product codes
regex_pattern = re.compile(r'[A-Z0-9]{5,7}')

# Directory containing the files
directory = 'GraingerWebScrape/www.grainger.com'

# List to store found product codes
product_codes = []

# Function to extract product codes from text
def extract_product_codes(text):
    return regex_pattern.findall(text)

# Iterate through all files in the directory
for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)

        # Check for product codes in the file name
        codes_in_filename = extract_product_codes(file)
        product_codes.extend(codes_in_filename)

        # Check for product codes in the file content
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                codes_in_content = extract_product_codes(content)
                product_codes.extend(codes_in_content)
        except Exception as e:
            print(f"Could not read file {file_path}: {e}")

# Remove duplicates by converting the list to a set and back to a list
product_codes = list(set(product_codes))

# Save all product codes to a single JSON file
with open('all_product_codes.json', 'w') as f:
    json.dump(product_codes, f, indent=4)

print(f"Total product codes found: {len(product_codes)}")
print(f"Product codes saved in 'all_product_codes.json'")


Could not read file GraingerWebScrape/www.grainger.com/1DKW3_2.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/4NHG9_1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1DKW3_3.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1DKW3_1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/4NHG9_2.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/TraumaCube__XE7K_v1.pdf: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Could not read file GraingerWebScrape/www.grainger.com/1106-1600_PI_en_US1__JQ1I.pdf: 'utf-8' codec can't 

In [14]:
import requests
import pandas as pd
import os

# Base URL and headers for the API
base_url = "https://mobile-rest-qa.nonprod.graingercloud.com/v1/product/detail"
headers = {
    "Content-Type": "application/json"
}

# Function to fetch and process data
def fetch_product_details(skus):
    params = {
        "partNumbers": skus,
        "extraInfo": "false"
    }
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        data = response.json()[0]  # Assuming the response contains a list with one item
        brand = data.get("brand", {}).get("name", "N/A")
        code = data.get("code", "N/A")
        name = data.get("name", "N/A")
        picture_url = data.get("pictureUrl600", "N/A")
        price = data.get("priceData", {}).get("formattedPrice", "N/A")
        description = data.get("productDetailsDescription", "N/A")

        return {
            "Brand": brand,
            "Code": code,
            "Name": name,
            "PictureUrl600": picture_url,
            "Price": price,
            "Description": description
        }
    else:
        return None

# 
# # DataFrame to store product details
# df = pd.DataFrame(columns=["Brand", "Code", "Name", "PictureUrl600", "Price", "Description"])

# Fetch details for each part number and append to DataFrame
# for part_number in part_numbers:
#     details = fetch_product_details(part_number)
#     if details:
#         df = pd.concat([df, pd.DataFrame([details])], ignore_index=True)
#     else:
#         print(f"Failed to fetch details for part number: {part_number}")

# Save DataFrame to a Parquet file
# df.to_parquet('processed/grainger_products.parquet', index=False)
# 
# 
# print("Product details have been saved to 'processed/grainger_products.parquet'")
# 
# print(df.head())
# print(df.tail())
# print(df.size)
# print(df.values)


In [20]:
import requests
import json

# Load the product codes from the JSON file
with open('all_product_codes.json', 'r') as f:
    product_codes = ["1VCE8", "2KNK4", "20HC15"]
       #json.load(f)

# Product codes in chunks of 100
chunk_size = 100
chunks = [product_codes[i:i + chunk_size] for i in range(0, len(product_codes), chunk_size)]

# Iterate over each chunk for API requests
df = pd.DataFrame(columns=["Brand", "Code", "Name", "PictureUrl600", "Price", "Description"])
for chunk in chunks:
    try:
        details = fetch_product_details(chunk)
        df = pd.concat([df, pd.DataFrame([details])], ignore_index=True)
    except Exception as e:
        print(f"Failed to fetch details for chunk: {chunk}")

print("All product details have been processed.")
# Remove rows where all columns are NaN
df = df.dropna(how='all')
# Ensure all column names are strings
df.columns = df.columns.astype(str)
# Save to Parquet
df.to_parquet('processed/grainger_products.parquet', index=False)
print("Product details have been saved to 'processed/grainger_products.parquet'")
print("\nHead of DataFrame:")
print(df.head(), "\n")
print("Tail of DataFrame:")
print(df.tail(), "\n")
print("Size of DataFrame:", df.size, "\n")
print("Values in DataFrame:")
print(df.values)



All product details have been processed.
Product details have been saved to 'processed/grainger_products.parquet'

Head of DataFrame:
  Brand    Code                        Name  \
0   CRC  20HC15  Window and Display Cleaner   

                                       PictureUrl600   Price Description  
0  https://static.grainger.com/rp/s/is/image/Grai...  $10.20        None   

Tail of DataFrame:
  Brand    Code                        Name  \
0   CRC  20HC15  Window and Display Cleaner   

                                       PictureUrl600   Price Description  
0  https://static.grainger.com/rp/s/is/image/Grai...  $10.20        None   

Size of DataFrame: 6 

Values in DataFrame:
[['CRC' '20HC15' 'Window and Display Cleaner'
  'https://static.grainger.com/rp/s/is/image/Grainger/20HC15_AS01?$lgmain$'
  '$10.20' None]]


In [None]:
# import requests
# import pandas as pd
# import json
# 
# # Base URL and headers for the API
# base_url = "https://mobile-rest-qa.nonprod.graingercloud.com/v1/product/detail"
# headers = {
#     "Content-Type": "application/json"
# }
# 
# # Function to fetch and process data
# def fetch_product_details(skus):
#     params = {
#         "partNumbers": skus,
#         "extraInfo": "false"
#     }
#     response = requests.get(base_url, headers=headers, params=params)
#     if response.status_code == 200:
#         try:
#             data = response.json()
#             results = []
#             for item in data:
#                 brand = item.get("brand", {}).get("name", "N/A")
#                 code = item.get("code", "N/A")
#                 name = item.get("name", "N/A")
#                 picture_url = item.get("pictureUrl600", "N/A")
#                 price = item.get("priceData", {}).get("formattedPrice", "N/A")
#                 description = item.get("productDetailsDescription", "N/A")
#                 
#                 results.append({
#                     "Brand": brand,
#                     "Code": code,
#                     "Name": name,
#                     "PictureUrl600": picture_url,
#                     "Price": price,
#                     "Description": description
#                 })
#                 
#             return pd.DataFrame(results) if results else None
#         except Exception as e:
#             print(f"Error parsing response for {skus}: {e}")
#             return None
#     else:
#         print(f"Failed to fetch details for {skus}: Status code {response.status_code}")
#         return None
# 
# # Load the product codes from the JSON file
# with open('all_product_codes.json', 'r') as f:
#     product_codes = json.load(f)
# 
# # For testing, use a smaller subset of product codes
# # product_codes = ["1VCE8", "2KNK4", "20HC15"]
# 
# # Product codes in chunks of 100
# chunk_size = 1
# chunks = [product_codes[i:i + chunk_size] for i in range(0, len(product_codes), chunk_size)]
# 
# # Iterate over each chunk for API requests
# df = pd.DataFrame(columns=["Brand", "Code", "Name", "PictureUrl600", "Price", "Description"])
# for chunk in chunks:
#     try:
#         details = fetch_product_details(chunk)
#         if details is not None:
#             df = pd.concat([df, details], ignore_index=True)
#         else:
#             print(f"No details fetched for chunk: {chunk}")
#     except Exception as e:
#         print(f"Failed to fetch details for chunk: {chunk}, Error: {e}")
# 
# print("All product details have been processed.")
# 
# # Remove rows where all columns are NaN
# df = df.dropna(how='all')
# 
# # Ensure all column names are strings
# df.columns = df.columns.astype(str)
# 
# # Save to Parquet
# df.to_parquet('processed/grainger_products.parquet', index=False)
# print("Product details have been saved to 'processed/grainger_products.parquet'")
# print("\nHead of DataFrame:")
# print(df.head(), "\n")
# print("Tail of DataFrame:")
# print(df.tail(), "\n")
# print("Size of DataFrame:", df.size, "\n")
# print("Values in DataFrame:")
# print(df.values)


Failed to fetch details for ['182UFW9']: Status code 400
No details fetched for chunk: ['182UFW9']
Failed to fetch details for ['04103']: Status code 400
No details fetched for chunk: ['04103']
Failed to fetch details for ['97429T7']: Status code 400
No details fetched for chunk: ['97429T7']
Failed to fetch details for ['ZEREXH']: Status code 400
No details fetched for chunk: ['ZEREXH']
Failed to fetch details for ['ICHELIN']: Status code 400
No details fetched for chunk: ['ICHELIN']
Failed to fetch details for ['6391H']: Status code 400
No details fetched for chunk: ['6391H']
Failed to fetch details for ['00818HC']: Status code 400
No details fetched for chunk: ['00818HC']
Failed to fetch details for ['40R19']: Status code 400
No details fetched for chunk: ['40R19']
Failed to fetch details for ['ALTERNA']: Status code 400
No details fetched for chunk: ['ALTERNA']
Failed to fetch details for ['FOOTWEA']: Status code 400
No details fetched for chunk: ['FOOTWEA']
Failed to fetch details 