In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

In [2]:
urls = {
    f'page{page}': f'https://www.amazon.com/s?k=diamond+rings&page={page}&ref=sr_pg_{page}'
    for page in range(1, 8)
}

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1"
}

In [4]:
session = requests.Session()
session.headers.update(headers)

In [5]:
response = session.get(urls['page1'])
print(response.status_code)

200


In [6]:
# Pattern matching lists
design_style = ['toy', 'aluminum balloon', 'cleaner', 'nose ring', 'nose stud', 'silicone rubber', 'silicone rings', 'cake topper', 'tool', 'little girl rhinestone', 'jewelry box', 'pearl napkin',
                'decoration', 'glitter', 'engraving', 'party confetti', 'sterling silver necklace', 'adjustable', 'art keychain', 'ring holders', 'silicone', 'tattoos', 'pendant necklace',
                '4-prong', 'solitaire', 'anniversary', 'eternity band', 'engagement', 'wedding band', 'eternity', 'promise', 'bridal set ',
                'drop', 'flower', 'bezel', 'heart', 'knot', 'twisted stacking', 'infinite', 'natural', 'round & marquise cut',
                'initial', 'love', 'criss cross', 'silver cluster', 'gold band', 'hip hop', 'accent cluster', 'accent',
                'birthstone', 'ring set', 'sterling silver', 'band ring', 'solid gold', 'swarovski']

categories = ['ring']
metal_colors = ['10k white gold plated', '10k yellow gold plated', '10k rose gold plated', '10k gold plated',
                '14k white gold plated', '14k yellow gold plated', '14k rose gold plated', '14k gold plated',
                '18k white gold plated', '18k yellow gold plated', '18k rose gold plated', '18k gold plated',
                'white gold plated', 'yellow gold plated', 'rose gold plated', 'gold plated', 'platinum plated', 'gold over sterling silver',
                'white gold', 'yellow gold', 'rose gold', 'gold', 'platinum', 'silver', 'rhodium', 'stainless steel']
metal_types = ['10k', '14k', '18k', 'pt', '925', 'stainless steel', 'rhodium', 'platinum']
stone_types = ['ruby', 'moissanite', 'lab grown diamond', 'lab created diamond', 'simulated diamond', 'cubic zirconia', 'zirconia', 'zircon', 'simulated black diamond',
               'black diamond', 'diamond', 'cz', 
               'birthstone sapphire', 'crystal', 'black agate', 'white opal', 'amethyst', 'jeulia', 'silver wide band']
shapes = ['round', 'princess', 'princess-cut', 'emerald', 'oval', 'cushion', 'pear', 'marquise', 'heart', 'radiant']

In [7]:
# Extraction function
def extract_info(title):
    title_lower = title.lower()
        
    design = next((d for d in design_style if d in title_lower), 'N/A')
    category = next((c for c in categories if c in title_lower), 'N/A')
    color = next((co for co in metal_colors if co in title_lower), 'N/A')
    metal = next((m for m in metal_types if m in title_lower), 'N/A')
    stone = next((g for g in stone_types if g in title_lower), 'N/A')
    shape = next((s for s in shapes if s in title_lower), 'N/A')

    # Brand detection
    brand = title.strip().split()[0] if title.strip() else 'N/A'

    # Check if the first word is numeric or a code (e.g., 14k, 1.5ct, 1/2ct, etc.)
    brand = title.strip().split()[0] if title.strip() else 'N/A'
    if re.match(r'^(\d+|\d*\.\d+|\d+/\d+)(ct|ctw|carat|k|kt|mm|pcs|cttwdiamond)?$', brand.lower()):
        brand = 'N/A'
        

     # Carat pattern: range, single value, fractions with units
    carat_matches = re.findall(
        r'(\d+(?:\.\d+)?(?:/\d+(?:\.\d+)?)?(?:\s*[-–~to]+\s*\d+(?:\.\d+)?(?:/\d+(?:\.\d+)?)?)?\s*(?:ctw|ct|carat|cttw|ctw))',
        title_lower
    )

    carat = ", ".join(dict.fromkeys([m.strip() for m in carat_matches])) if carat_matches else "N/A"

    return brand, design, category, color, metal, stone, shape, carat


    
    # Carat range detection
    #carat_range = re.search(r'(\d+(?:\.\d+)?|\d+/\d+)\s*(ctw|ct|carat)?[\s\-–to]+(\d+(?:\.\d+)?|\d+/\d+)\s*(ctw|ct|carat)?', title_lower)
    #if carat_range:
        #range_str = f"{carat_range.group(1)} to {carat_range.group(3)}"
        #return brand, design, category, color, metal, stone, shape, range_str

    # Carat individual values
    #matches = re.findall(r'(?:(\d+)\s*x\s*)?(\d+(?:\.\d+)?|\d+/\d+)\s*(ctw|ct|carat)', title_lower)
    #carat_values = []
    #for qty, val, _ in matches:
        #carat_values.append(f"{qty} x {val}" if qty else val)

    #carat = ", ".join(carat_values) if carat_values else 'N/A'
    #return brand, design, category, color, metal, stone, shape, carat


In [8]:
# List to store product data
products = []

In [9]:
# Scrape each page
for page, url in urls.items():
    try:
        res = session.get(url)
        print(f"Scraping {page} - Status: {res.status_code}")
        soup = BeautifulSoup(res.content, "html.parser")

        for item in soup.select(".s-main-slot .s-result-item"):
            asin = item.get("data-asin", "N/A")
            title_tag = item.select_one("h2 span")
            price_tag = item.select_one(".a-price .a-offscreen")
            star_rating_tag = item.select_one(".a-icon-alt")
            global_rating_tag = item.select_one(".a-size-base.s-underline-text")
            image_elem = item.select_one("img.s-image")

            title = title_tag.get_text(strip=True) if title_tag else "N/A"
            price = price_tag.get_text(strip=True).replace('$', '') if price_tag else "N/A"
            star_rating = star_rating_tag.get_text(strip=True).split(' out')[0] if star_rating_tag else "N/A"
            global_rating = global_rating_tag.get_text(strip=True) if global_rating_tag else "N/A"
            image_url = image_elem['src'] if image_elem and 'src' in image_elem.attrs else "N/A"
            product_url = f"https://www.amazon.com/dp/{asin}" if asin != "N/A" else "N/A"

            brand, design, category, color, metal, stone, shape, carat = extract_info(title)

            products.append({
                "ASIN": asin,
                "Title": title,
                "Brand Name": brand,
                "Design Style": design,
                "Category": category,
                "Metal Color": color,
                "Metal Type": metal,
                "Stone Type": stone,
                "Stone Shape": shape,
                "Carat": carat,
                "Price": price,
                "Star Rating": star_rating,
                "Global Rating": global_rating,
                "Image URL": image_url,
                "Product URL": product_url
            })


        time.sleep(random.uniform(1, 2))

    except Exception as e:
        print(f"Error scraping {page}: {e}")
        continue

Scraping page1 - Status: 200
Scraping page2 - Status: 200
Scraping page3 - Status: 200
Scraping page4 - Status: 200
Scraping page5 - Status: 200
Scraping page6 - Status: 200
Scraping page7 - Status: 200


In [10]:
# Optional: Convert to DataFrame
df = pd.DataFrame(products)
print(df.head())

         ASIN                                              Title Brand Name  \
0                                                            N/A        N/A   
1  B09L4KQ2BZ  FRIENDLY DIAMONDS Lab Grown Diamond Ring For W...   FRIENDLY   
2  B0DQ2GPS8M  Le Vian 3/4 or 1 1/2 Carat Chocolate Diamond H...         Le   
3  B0DWHNT11Q  IGI Certified 2 1/2 Carat Emerald Cut Lab Grow...        IGI   
4  B0DK495YMS  IGI Certified 3 1/4 Carat Oval Lab Grown Diamo...        IGI   

  Design Style Category Metal Color Metal Type         Stone Type Stone Shape  \
0          N/A      N/A         N/A        N/A                N/A         N/A   
1    solitaire     ring  white gold        14k  lab grown diamond         N/A   
2  anniversary     ring  white gold        14k            diamond       heart   
3  anniversary     ring  white gold        14k  lab grown diamond     emerald   
4  anniversary     ring  white gold        14k  lab grown diamond        oval   

                Carat     Price Star R

In [11]:
# 1. Drop rows without a Title
df.columns = df.columns.str.strip()  # Clean column names
df_cleaned = df.dropna(subset=["Title"]).copy()

# 2. Fill missing values in feature columns with 'Unknown'
feature_cols = ["Design Style", "Category", "Metal Color", "Metal Type", "Stone Type", "Stone Shape"]
df_cleaned[feature_cols] = df_cleaned[feature_cols].fillna("Unknown")

# Combine Metal Type and Metal Color into a new column called 'Metal Info'
df_cleaned["Metal Info"] = df_cleaned["Metal Type"] + " / " + df_cleaned["Metal Color"]

# 3. Clean and convert 'Price' to float
df_cleaned["Price"] = df_cleaned["Price"].replace(r"[^\d.]", "", regex=True)
df_cleaned["Price"] = pd.to_numeric(df_cleaned["Price"], errors="coerce")
usd_to_hkd_rate = 7.8
df_cleaned["Price_HKD"] = (df_cleaned["Price"] * usd_to_hkd_rate).round(2)

# 4. Extract numeric 'Star Rating'
df_cleaned["Star Rating"] = df_cleaned["Star Rating"].str.extract(r"(\d+(\.\d+)?)")[0]
df_cleaned["Star Rating"] = pd.to_numeric(df_cleaned["Star Rating"], errors="coerce")

# 5. Clean and convert 'Global Rating'
df_cleaned["Global Rating"] = df_cleaned["Global Rating"].str.replace(",", "")
df_cleaned["Global Rating"] = pd.to_numeric(df_cleaned["Global Rating"], errors="coerce")

#6. Clean and convert 'carat'
df_cleaned["Carat"] = df_cleaned["Carat"].apply(lambda x: f'="{x}"' if isinstance(x, str) and '/' in x else x)

In [12]:
# Save cleaned dataset
df_cleaned.to_csv("amazon_ring_new.csv", index=False)