In [6]:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import threading

# Thread-safe sets to track progress
in_progress = set()
completed = set()
lock = threading.Lock()

def scrape_page(page_num):
    # Add to in_progress when starting
    with lock:
        in_progress.add(page_num)
    
    try:
        time.sleep(random.uniform(0.05, 0.1))
        url = "https://www.boliga.dk/salg/resultater?searchTab=1&page=%d&sort=date-d" % page_num
        d = pd.read_html(url)[0]
        print("Page %d done" % page_num)
        
        # Move from in_progress to completed
        with lock:
            in_progress.remove(page_num)
            completed.add(page_num)
        
        return d
    except Exception as e:
        print("Page %d failed: %s" % (page_num, str(e)))
        raise e  

start_time = time.time()
dl = []
pages_completed = 0

try:
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks
        futures = [executor.submit(scrape_page, i) for i in range(28850, 35700)]
        
        # Collect results as they complete
        for future in as_completed(futures):
            result = future.result()  # This will raise exception if page failed
            dl.append(result)
            pages_completed += 1
            
            # Print progress every 100 pages
            if pages_completed % 100 == 0:
                elapsed_time = time.time() - start_time
                pages_per_min = (pages_completed / elapsed_time) * 60
                print(f"Progress: {pages_completed} completed, {len(in_progress)} in progress, {pages_per_min:.1f} pages/min")

except Exception as e:
    print(f"Scraping stopped due to error: {e}")

# Concatenate whatever we got before the error
if dl:
    df = pd.concat(dl, ignore_index=True)
    print(f"Successfully scraped {len(dl)} pages before stopping")
else:
    print("No successful pages scraped")

Page 28857 done
Page 28854 done
Page 28858 done
Page 28855 done
Page 28856 done
Page 28853 done
Page 28852 done
Page 28859 done
Page 28850 done
Page 28851 done
Page 28862 done
Page 28861 done
Page 28864 done
Page 28866 done
Page 28860 done
Page 28863 done
Page 28865 done
Page 28867 done
Page 28868 done
Page 28869 done
Page 28870 done
Page 28871 done
Page 28873 done
Page 28874 done
Page 28877 done
Page 28878 done
Page 28872 done
Page 28876 done
Page 28879 done
Page 28875 done
Page 28880 done
Page 28881 done
Page 28882 done
Page 28884 done
Page 28883 done
Page 28885 done
Page 28888 done
Page 28887 done
Page 28886 done
Page 28889 done
Page 28890 done
Page 28891 done
Page 28892 done
Page 28893 done
Page 28894 done
Page 28895 done
Page 28897 done
Page 28899 done
Page 28896 done
Page 28898 done
Page 28900 done
Page 28902 done
Page 28901 done
Page 28904 done
Page 28903 done
Page 28905 done
Page 28907 done
Page 28906 done
Page 28908 done
Page 28909 done
Page 28910 done
Page 28911 done
Page 289

In [7]:
#print(futures[0].result())
print(in_progress)

{34952, 34954, 35344, 33682, 35346, 33635, 33652}


In [8]:

df = pd.concat(dl, ignore_index=True)

In [None]:
df.head(5)

In [None]:
df.shape

In [9]:
df.to_csv(r'file4.csv', index=False)