In [1]:
import json
import re

import requests
from bs4 import BeautifulSoup
from numpy import nan

In [2]:
def process_price(price: str) -> float:
    """Convert a scraped price label into a number.

    Tabs, newlines, spaces and euro signs are trimmed from both ends, every
    comma is removed, and what is left is handed to ``float``.

    Args:
        price: Raw price text, e.g. ``'\\n2,200.00 €'``.

    Returns:
        The price as a float.

    Raises:
        ValueError: If the cleaned text is not a valid float literal.
    """
    return float(price.strip('\t\n €').replace(',', ''))

In [26]:
def process_float(input_number: str) -> float:
    """Parse a scraped measurement such as ``'3,9 gr'`` into a float.

    Surrounding whitespace and the unit letters ``g``, ``r``, ``c``, ``m`` are
    stripped and commas are read as decimal points. When the cleaned text is
    not a single number (for example a range like ``'40 - 45 cm'``), every
    number in it is extracted and the numerically largest one is returned.

    Args:
        input_number: Raw text of the measurement cell.

    Returns:
        The parsed value, or ``nan`` when the text contains no digits.
    """
    cleaned = input_number.strip('\n\t grcm').replace(',', '.')
    try:
        return float(cleaned)
    except ValueError:
        # The fractional part is optional so that single-digit values
        # (e.g. '5 - 7') are matched as well.
        candidates = re.findall(r'\d+(?:\.\d+)?', cleaned)
        if not candidates:
            return nan
        # Compare as numbers: max() on the raw strings is lexicographic and
        # would rank '9.5' above '10.2'.
        return max(float(token) for token in candidates)

In [11]:
def _raw_text(soup, css_class: str, in_cell: bool = True):
    """Return the text of the first element with ``css_class``, or ``None``.

    With ``in_cell`` true the text is read from that element's ``.td`` child
    lookup instead of the element itself.
    """
    try:
        node = soup.find(class_=css_class)
        if in_cell:
            node = node.td
        return node.text
    except AttributeError:
        # find() and .td give None when nothing matches.
        return None


def _text_field(soup, css_class: str):
    """Return the trimmed ``<td>`` text for ``css_class``, or ``nan`` if absent."""
    raw = _raw_text(soup, css_class)
    return nan if raw is None else raw.strip('\n\t ')


def _number_field(soup, css_class: str, parse, in_cell: bool = True) -> float:
    """Parse the text for ``css_class`` with ``parse``; ``nan`` if absent or unparsable."""
    raw = _raw_text(soup, css_class, in_cell)
    if raw is None:
        return nan
    try:
        return parse(raw)
    except ValueError:
        # A malformed value on one page must not abort a whole crawl.
        return nan


def scrape_page(url: str, timeout: float = 30.0) -> dict:
    """Download one product page and extract its attributes.

    Each attribute is looked up by CSS class. Attributes that are missing from
    the page, or whose numeric text cannot be parsed, are reported as ``nan``.
    The HTTP status code is not checked; whatever body is returned is parsed.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict with the keys ``name``, ``price``, ``sex``, ``jewel_type``,
        ``brand``, ``material``, ``color``, ``jewel_weight``, ``rocks``,
        ``rock_details``, ``dimensions``, ``details``, ``chain_carat``,
        ``chain_length`` and ``diameter``.

    Raises:
        requests.exceptions.RequestException: Network failures and timeouts
            from ``requests.get`` are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The product name is taken verbatim from the heading element (no <td>,
    # no trimming).
    name = _raw_text(soup, 'product-name-h1', in_cell=False)

    # The brand is looked up under 'manufacturer_jewel' first and under 'tr3'
    # as a fallback.
    brand = _raw_text(soup, 'manufacturer_jewel')
    if brand is None:
        brand = _raw_text(soup, 'tr3')

    return {
        'name': nan if name is None else name,
        'price': _number_field(soup, 'price', process_price, in_cell=False),
        'sex': _text_field(soup, 'sex'),
        'jewel_type': _text_field(soup, 'jewel_type'),
        'brand': nan if brand is None else brand.strip('\n\t '),
        'material': _text_field(soup, 'material'),
        'color': _text_field(soup, 'color'),
        'jewel_weight': _number_field(soup, 'jewel_weight', process_float),
        'rocks': _text_field(soup, 'jewel_rocks2'),
        'rock_details': _text_field(soup, 'stone_details'),
        'dimensions': _text_field(soup, 'dimensions'),
        'details': _text_field(soup, 'details'),
        # float() itself ignores surrounding whitespace.
        'chain_carat': _number_field(soup, 'chain_carat', float),
        'chain_length': _number_field(soup, 'cm', process_float),
        'diameter': _number_field(soup, 'diameter', process_float),
    }

In [5]:
# Smoke test: scrape a single product page. As the last expression of the
# cell, the returned dict is shown as the cell output.
test_url = 'https://www.haritidis.gr/en/ring-misovero-diamond-daxtulidi-diamantia-kv40895'
scrape_page(test_url)

{'name': 'Δαχτυλίδι μισόβερο',
 'price': 2200.0,
 'sex': 'Women',
 'jewel_type': 'Ring',
 'brand': 'Haritidis',
 'material': '18 carat gold',
 'color': 'White',
 'jewel_weight': 3.9,
 'rocks': 'Diamond',
 'rock_details': 'Διαμάντια 0.80ct',
 'dimensions': nan,
 'details': nan,
 'chain_carat': nan,
 'chain_length': nan,
 'diameter': nan}

In [6]:
# Load the product URLs to scrape, one per line (trailing newlines are kept).
# The context manager closes the file handle as soon as the lines are read.
with open('links.txt', 'r') as links_file:
    links = links_file.readlines()

In [31]:
# Scrape every product page listed in `links`, keyed by its line index.
all_entries = {}

# Iterate over what was actually read instead of a hard-coded count: a shorter
# file no longer raises IndexError and a longer one is no longer truncated.
for i, raw_link in enumerate(links):
    link = raw_link.strip()
    if not link:
        # Blank line: there is no URL to request.
        continue
    all_entries[i] = scrape_page(link)

In [32]:
# Persist the scraped records. The integer keys of `all_entries` are written as
# JSON strings, and nan values are emitted as the non-standard token NaN
# (json.dump's default allow_nan=True), which Python's json.load reads back.
with open('raw_data.json', 'w') as fp:
    json.dump(all_entries, fp)