In [74]:
import csv
import json
import pandas as pd # type: ignore
import re
import random
import requests # type: ignore
import urllib.request
from xml.etree import ElementTree as ET
from urllib.parse import urlparse
from urllib.parse import urlencode
import concurrent.futures
import os
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [75]:
def extract_numbers(class_name):
    """Return every run of consecutive digits found in ``class_name``.

    Args:
        class_name: The string to scan (e.g. a CSS class name).

    Returns:
        A list of digit strings in left-to-right order; empty when the
        input contains no digits.
    """
    return [digits.group(0) for digits in re.finditer(r'\d+', class_name)]

def extract_key(url):
    """Return the first path segment of ``url``.

    Leading and trailing slashes of the path are stripped before the
    segment is taken, so a URL without a path yields an empty string.

    Args:
        url: The URL string to inspect.

    Returns:
        The text of the first path component.
    """
    trimmed_path = urlparse(url).path.strip("/")
    # Everything before the first remaining "/" is the leading segment.
    first_segment, _, _ = trimmed_path.partition("/")
    return first_segment

def _retrying_session(total_retries=3):
    """Build a ``requests.Session`` that retries GET/HEAD/OPTIONS requests.

    Retries are triggered for the HTTP statuses 429, 500, 502, 503 and 504.
    The caller is responsible for closing the returned session.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=total_retries,  # Number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP statuses to retry on
        allowed_methods=["HEAD", "GET", "OPTIONS"],  # Methods to retry
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session


# Function to extract category information from a webpage
def category_info(link, category_id_list, timeout=10):
    """Extract breadcrumb category details from the page at ``link``.

    Every CSS class starting with ``category`` found inside
    ``div.breadcrumbs`` is treated as one category. Its id is the first
    number in the class name (``None`` when the class has no digits).
    Categories whose id is already in ``category_id_list`` are skipped;
    new ids are appended to that list.

    Args:
        link: URL of the page to fetch.
        category_id_list: Shared list of ids already seen. It is mutated
            in place. NOTE: the membership check and the append are not
            atomic, so concurrent callers sharing one list can
            occasionally process the same id twice.
        timeout: Timeout in seconds passed to the HTTP request.

    Returns:
        A dict mapping category name to
        ``{'category_link': ..., 'category_id': ...}``, or ``None`` when
        fetching or parsing the page raised an exception (the error is
        printed rather than propagated).
    """
    try:
        # Close the session (and its connection pool) as soon as the body
        # has been read instead of leaking one pool per call.
        with _retrying_session() as session:
            source = session.get(link, timeout=timeout)
            source.raise_for_status()
            content = source.content

        # Parse the HTML content
        soup = BeautifulSoup(content, 'html.parser')

        # Dictionary to store category details
        category_dict = {}

        # Find the breadcrumbs div
        breadcrumbs = soup.find('div', class_='breadcrumbs')

        # Collect class names starting with 'category' inside the breadcrumbs
        class_names = [
            cls
            for tag in soup.select('div.breadcrumbs [class]')
            for cls in tag['class']
            if cls.startswith('category')
        ]

        for category in class_names:
            # Extract the category ID using the helper function
            numbers = extract_numbers(category)
            category_id = int(numbers[0]) if numbers else None
            if category_id in category_id_list:
                continue
            category_id_list.append(category_id)

            # Find the corresponding list item with the category class
            category_label = None
            if breadcrumbs is not None:
                category_label = breadcrumbs.find('li', class_=f"item {category}")
            anchor = category_label.find('a') if category_label is not None else None
            # .get() avoids a KeyError on anchors without an href, which
            # would otherwise abort the whole page via the except below.
            href = anchor.get("href") if anchor is not None else None

            if href:
                category_link = href
                category_name = anchor.text.strip()
            else:
                # Fallback to the main link if no category link found
                category_link = link
                category_name = extract_key(link)

            # Add to dictionary if not already present
            if category_name not in category_dict:
                category_dict[category_name] = {
                    'category_link': category_link,
                    'category_id': category_id,
                }

        return category_dict

    except Exception as e:
        # Deliberate best-effort: report the failing page and let the
        # caller skip it instead of stopping the whole crawl.
        print(f"Error processing {link}: {e}")
        return None

In [76]:
# Collect the category page URLs listed in the sitemap file(s).
xml_files =['https://www.glamira.com/media/sitemap/glus/category_provider.xml']
links = []
category_dict = {}
category_id_list = []

for xml_file in xml_files:
    # Without a timeout the request can block forever; raise_for_status()
    # stops an HTTP error page from being handed to the XML parser.
    response = requests.get(xml_file, timeout=10)
    response.raise_for_status()
    xml_content = response.content
    root = ET.fromstring(xml_content)

    # Keep every element whose text is an https URL, except "carat" pages.
    for element in root.iter():
        if element.text is not None and element.text.startswith("https:") and "carat" not in element.text:
            links.append(element.text)
            print(element.text)



https://www.glamira.com/promise-rings/
https://www.glamira.com/diamond-rings/
https://www.glamira.com/cufflinks/
https://www.glamira.com/kids-bracelet/
https://www.glamira.com/kids-necklace/
https://www.glamira.com/kids-rings/
https://www.glamira.com/kids-earrings/
https://www.glamira.com/men/
https://www.glamira.com/women/
https://www.glamira.com/kids/
https://www.glamira.com/engagement-and-wedding-rings/
https://www.glamira.com/diamond-necklaces/
https://www.glamira.com/diamond-earrings/
https://www.glamira.com/engagement-rings/
https://www.glamira.com/diamond-men-ring/
https://www.glamira.com/initial-name-jewelry/
https://www.glamira.com/bracelets/
https://www.glamira.com/bridal-set/
https://www.glamira.com/wedding-set/
https://www.glamira.com/brooch/
https://www.glamira.com/colliers/
https://www.glamira.com/wedding-rings/
https://www.glamira.com/women-ring/
https://www.glamira.com/men-ring/
https://www.glamira.com/plain-design-rings/
https://www.glamira.com/plain-design-necklaces/


In [77]:
# Scrape every category page concurrently and merge the per-page results.
# category_info skips any id already present in category_id_list, so the
# list is reset here: otherwise re-running this cell on its own would find
# every id "already seen" and produce an empty final_category_dict.
category_id_list = []
final_category_dict = {}
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(category_info, link, category_id_list) for link in links]

    # Merge results as pages finish; category_info returns None for pages
    # that failed, and those are skipped.
    for future in as_completed(futures):
        result = future.result()
        if result is not None:
            final_category_dict.update(result)

In [78]:
# Report how many distinct category names were collected
print(len(final_category_dict))

211


In [79]:
# Persist the collected categories as pretty-printed UTF-8 JSON
output_file = 'category_details.json'
with open(output_file, mode='w', encoding='utf-8') as f:
    # Serialise first, then write the whole document in one call.
    f.write(json.dumps(final_category_dict, ensure_ascii=False, indent=4))

print(f"Category details saved to {output_file}")

Category details saved to category_details.json
