# Extracting all the Product Links

In [None]:
import requests
import xml.etree.ElementTree as ET
import csv
from urllib.parse import urlsplit, urlunsplit

# URL of the sitemap
url = "https://www.4sgm.com/siteMapProduct_0.xml"
# Alternative: the Spanish-language variant of the same sitemap
# url = "https://www.4sgm.com/siteMapProduct_0.xml?language=es"

# Fetch the XML. An explicit timeout keeps the cell from hanging forever if
# the server stops responding (requests waits indefinitely by default).
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the XML
root = ET.fromstring(response.content)

# Extract the text of every <loc> tag. Empty elements (whose .text is None)
# cannot yield a valid link, so they are skipped; surrounding whitespace that
# pretty-printed sitemaps may contain is trimmed.
namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
raw_links = [
    loc.text.strip()
    for loc in root.findall(".//ns:loc", namespace)
    if loc.text and loc.text.strip()
]

# Drop the query string (like ?language=es) and any fragment from each link,
# keeping only scheme, host and path.
clean_links = [
    urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))
    for parts in map(urlsplit, raw_links)
]

# Save to CSV: a single "Link" column, one URL per row
output_file = "sitemap_links_clean.csv"
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Link"])  # header
    writer.writerows([link] for link in clean_links)

print(f"✅ Extracted {len(clean_links)} cleaned links and saved to {output_file}")


✅ Extracted 44356 cleaned links and saved to sitemap_links_clean.csv


# Extracting UPC, Price, and Case Info

In [None]:
import pandas as pd

# Read the cleaned link list that is to be divided up
df = pd.read_csv("sitemap_links_clean.csv")

# How many files the list is split into
num_parts = 20

# Chunk size, rounded up so that num_parts chunks cover every row
total_rows = len(df)
rows_per_part = -(-total_rows // num_parts)

# Write each consecutive slice of rows to its own numbered CSV
for part_no in range(1, num_parts + 1):
    start = (part_no - 1) * rows_per_part
    end = min(start + rows_per_part, total_rows)
    df_part = df.iloc[start:end]
    df_part.to_csv(f"part_{part_no}.csv", index=False)
    print(f"Saved part_{part_no}.csv with {len(df_part)} rows")


In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def extract_product_data(url, timeout=30):
    """Download a product page and scrape its UPC, case-pack and price fields.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        A dict with the keys "url", "UPC", "Casepack Dimension", "Price" and
        "Casepack". Any field that is not found on the page is left as None.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    data = {"url": url, "UPC": None, "Casepack Dimension": None, "Price": None, "Casepack": None}

    # Extract UPC + Casepack Dimension: each spec row is looked up as a
    # "clearfix" div holding a "spec_title" label and a "spec_info" value.
    for block in soup.find_all("div", class_="clearfix"):
        title = block.find("div", class_="spec_title")
        info = block.find("div", class_="spec_info")
        if not title or not info:
            # Not a title/value spec row; "clearfix" is matched on other divs too.
            continue
        title_text = title.get_text(strip=True)
        info_text = info.get_text(strip=True)
        if "UPC Number" in title_text:
            data["UPC"] = info_text
        elif "Casepack Dimension" in title_text:
            data["Casepack Dimension"] = info_text

    # Extract Price & Casepack from the first "price_wrapper" div, if present
    price_wrapper = soup.find("div", class_="price_wrapper")
    if price_wrapper:
        price_span = price_wrapper.find("span", class_="price")
        casepack_span = price_wrapper.find("span", class_="casepack")
        if price_span:
            data["Price"] = price_span.get_text(strip=True)
        if casepack_span:
            data["Casepack"] = casepack_span.get_text(strip=True)

    return data


def main(input_file="part_1.csv", output_file="output.csv"):
    """Scrape every product link listed in a CSV and save the results.

    Args:
        input_file: CSV to read; it must contain a column named "Link".
            Defaults to "part_1.csv".
        output_file: CSV the scraped rows are written to. Defaults to
            "output.csv".

    Each link is passed to extract_product_data. A link that fails for any
    reason is reported and recorded with empty fields, so one bad page does
    not abort the whole run and the output keeps one row per input link.
    """
    df = pd.read_csv(input_file)
    results = []
    total = len(df)

    for idx, url in enumerate(df["Link"], start=1):
        try:
            data = extract_product_data(url)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so any
            # failure is logged and replaced by a placeholder row.
            print(f"Extracting {idx}/{total}, UPC: ❌ (Failed: {e})")
            data = {"url": url, "UPC": None, "Casepack Dimension": None, "Price": None, "Casepack": None}
        else:
            print(f"Extracting {idx}/{total}, UPC: {data['UPC']}")
        results.append(data)

    # Save to CSV
    pd.DataFrame(results).to_csv(output_file, index=False)
    print(f"\n📂 Saved results to {output_file}")


# Run the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Extracting 1/2218, UPC: None
Extracting 2/2218, UPC: None
Extracting 3/2218, UPC: None
Extracting 4/2218, UPC: None
Extracting 5/2218, UPC: None
Extracting 6/2218, UPC: None
Extracting 7/2218, UPC: None
Extracting 8/2218, UPC: None
Extracting 9/2218, UPC: None
Extracting 10/2218, UPC: None
Extracting 11/2218, UPC: 758266300098
Extracting 12/2218, UPC: None
Extracting 13/2218, UPC: None
Extracting 14/2218, UPC: None
Extracting 15/2218, UPC: None
Extracting 16/2218, UPC: None
Extracting 17/2218, UPC: None
Extracting 18/2218, UPC: None
Extracting 19/2218, UPC: None
Extracting 20/2218, UPC: None
Extracting 21/2218, UPC: None
Extracting 22/2218, UPC: None
Extracting 23/2218, UPC: None
Extracting 24/2218, UPC: None
Extracting 25/2218, UPC: None
Extracting 26/2218, UPC: None
Extracting 27/2218, UPC: None
Extracting 28/2218, UPC: None
Extracting 29/2218, UPC: None
Extracting 30/2218, UPC: None
Extracting 31/2218, UPC: None
Extracting 32/2218, UPC: None
Extracting 33/2218, UPC: None
Extracting 