In [119]:
import pandas as pd
import ray
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
from tqdm import tqdm

In [111]:
PATH_URLS = "datahuck_sample.csv"

urls_df = pd.read_csv(PATH_URLS, index_col="seed_url")

In [131]:
class Crawler:
    def __init__(self, output_path="output.csv"):
        self.output_path = output_path
        self.errors_path = f"errors_{output_path}"

        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.3'}

        self.num_crawled = 0 # crawled is defined as succesfully extracted relevant information
        self.num_errors = 0
    
    def __str__(self):
        return f'Crawler has so far processed [{self.num_crawled}] urls and failed on [{self.num_errors}] urls'
    
    def process_url(self, url):
        # TODO check Here
        url = url.replace("https", "http")
        try:
            response = requests.get(url, headers=self.headers)

            response.raise_for_status()  # Check if the request was successful

            soup = BeautifulSoup(response.content, 'html.parser')
            # Find all <a> tags with an href attribute
            links = soup.find_all('a', href=True)
            
            # Extract the href attribute from each <a> tag and store it in a list
            hrefs = [link['href'] for link in links]
            hrefs = [urljoin(url, link['href']) for link in links]
            hrefs = [link.replace(urlparse(link).scheme + ':', 'http:') for link in hrefs]

            # To see all internal links
            # print([l for l in hrefs if url in l])

            hrefs = [urlparse(link).path.strip("/") for link in hrefs if url in link]
            hrefs = list(set([l for l in hrefs if len(l) > 0]))

            self.num_crawled += 1

            return {"url": url, "results": hrefs, "error?": False}

        except requests.RequestException as e:
            self.num_errors += 1
            return {"url": url, "results": e, "error?": True}

    def batch_process(self, urls, batch_size=10, alias="ALL"):
        """
        Process a urls in batches

        urls: pd.DataFrame of urls
        batch_size: size of batch
        alias: When parallel processing give each instance a unique identitifier so intermediate results don't clash
        """
        num_batches = len(df) // batch_size + (len(df) % batch_size > 0)
        processed_data = []
        print(f'Processing {alias} with batch size: {batch_size} and number of batches: {num_batches}')

        for i in tqdm(range(num_batches)):
            batch = urls.iloc[i * batch_size:(i + 1) * batch_size]
            for seed_url in batch.index:
                processed_data.append(self.process_url(seed_url))
                
            # Save intermediate results
            intermediate_df = pd.DataFrame(processed_data)
            intermediate_df.to_csv(f'{alias}_intermediate_results.csv', index=False)

        # Save final results
        final_df = pd.DataFrame(processed_data)
        final_df.to_csv(f'{alias}_{self.output_path}', index=False)
        print(self)
        return final_df

In [132]:
c = Crawler()
c.batch_process(urls_df.head(100))

Processing ALL with batch size: 10 and number of batches: 100


100%|██████████| 100/100 [04:16<00:00,  2.57s/it]

Crawler has so far processed [95] urls and failed on [5] urls





Unnamed: 0,url,results,error?
0,http://nataliehuggins.com,"[media, yoga, yoga-classes-workshops, store, a...",False
1,http://fauxynaturalhaircare.com,"[collections/all, products/shampoo, products/d...",False
2,http://winkeyless.kr,"[product/bold-case-mini, forums/topic/possible...",False
3,http://anna-goodman.com,"[collections/chakra-set, cdn/shop/products/ANN...",False
4,http://darrenbooth.com,"[info, work, newsletter, anobjectofbeauty, let...",False
...,...,...,...
95,http://www.chimytina.com,"[videos, donate, contact, about, press, cart]",False
96,http://moisturelove.com,"[pages/reviews, products/stimulating-growth-oi...",False
97,http://www.rauchstone.com,[],False
98,http://melhodge.com,"[home/project-six-pnpma, home/project-one-pxrz...",False


In [136]:
import concurrent.futures

segments = p1, p2  = urls_df.iloc[0:100], urls_df.iloc[100: 200]

# Create instances of Crawler for each DataFrame
crawler_instances = [Crawler() for _ in segments]

# Function to wrap batch_process method
def process_with_crawler(crawler, df, alias):
    return crawler.batch_process(df, alias=alias)

with concurrent.futures.ThreadPoolExecutor(max_workers=len(segments)) as executor:
    futures = [executor.submit(process_with_crawler, crawler, df, f"DF{i+1}") for i, (crawler, df) in enumerate(zip(crawler_instances, segments))]
    
    # Wait for all futures to complete (optional)
    for future in concurrent.futures.as_completed(futures):
        pass  # We are ignoring the results

Processing DF1 with batch size: 10 and number of batches: 100
Processing DF2 with batch size: 10 and number of batches: 100


  4%|▍         | 4/100 [02:05<49:59, 31.24s/it]

In [40]:

# Initialize Ray
ray.init()

# Define a function to extract links from a single URL
@ray.remote
def extract_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a', href=True)]
        return links
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []

# Process URLs in batches
batch_size = 10
all_results = []

for i in range(0, len(df), batch_size):
    batch_urls = df.index[i:i + batch_size]
    results = ray.get([extract_links.remote(url) for url in batch_urls])
    all_results.extend(results)

# Combine results and create a new DataFrame
df['links'] = all_results

# Print the DataFrame with extracted links
print(df)

# Shut down Ray
ray.shutdown()


2024-07-25 15:30:45,046	INFO worker.py:1788 -- Started a local Ray instance.


[36m(extract_links pid=25065)[0m Request failed for http://tedxrexburg.com: 406 Client Error: Not Acceptable for url: http://tedxrexburg.com/
[36m(extract_links pid=25070)[0m Request failed for http://goslg.com: 406 Client Error: Not Acceptable for url: http://goslg.com/


[33m(raylet)[0m [2024-07-25 15:30:54,994 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18624638976; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25074)[0m Request failed for http://poopieandpooper.com: 404 Client Error: Not Found for url: http://poopieandpooper.com/
[36m(extract_links pid=25067)[0m Request failed for http://wiserpiercing.com: 404 Client Error: Not Found for url: http://wiserpiercing.com/


[33m(raylet)[0m [2024-07-25 15:31:05,093 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18623680512; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://thehotcommodityboutique.com: 403 Client Error: Forbidden for url: http://thehotcommodityboutique.com/


[33m(raylet)[0m [2024-07-25 15:31:15,187 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627596288; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:31:25,283 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627579904; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:31:35,383 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627305472; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:31:45,475 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 1862

[36m(extract_links pid=25066)[0m Request failed for http://www.johnsotomusic.com: Exceeded 30 redirects.
[36m(extract_links pid=25066)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25074)[0m Request failed for http://www.dreamingofachance.com: 406 Client Error: Not Acceptable for url: http://www.dreamingofachance.com/


[33m(raylet)[0m [2024-07-25 15:32:05,572 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18633330688; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:32:15,667 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18632851456; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://www.rauchstone.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
[36m(extract_links pid=25067)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/


[33m(raylet)[0m [2024-07-25 15:32:25,758 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18632552448; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25066)[0m Request failed for http://golfsteelcity.com: 403 Client Error: Forbidden for url: http://golfsteelcity.com/


[33m(raylet)[0m [2024-07-25 15:32:35,759 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18629910528; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25065)[0m Request failed for http://softdeluxeusa.com: 404 Client Error: Not Found for url: http://softdeluxeusa.com/
[36m(extract_links pid=25066)[0m Request failed for http://linkageresearch.com: 406 Client Error: Not Acceptable for url: http://linkageresearch.com/
[36m(extract_links pid=25065)[0m Request failed for http://theroyalpets.co: 402 Client Error: Payment Required for url: https://theroyalpets.co/


[33m(raylet)[0m [2024-07-25 15:32:45,857 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18640367616; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25068)[0m Request failed for http://redmerledesigns.com: 404 Client Error: Not Found for url: http://redmerledesigns.com/
[36m(extract_links pid=25072)[0m Request failed for http://beilersdoughnuts.com: 406 Client Error: Not Acceptable for url: http://beilersdoughnuts.com/


[33m(raylet)[0m [2024-07-25 15:32:55,945 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639990784; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:33:06,043 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639650816; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25066)[0m Request failed for http://lockamekey.com: 402 Client Error: Payment Required for url: https://lockamekey.com/
[36m(extract_links pid=25070)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25072)[0m Request failed for http://ruggednature.co.uk: 404 Client Error: Not Found for url: http://ruggednature.co.uk/
[36m(extract_links pid=25066)[0m Request failed for http://www.blingbandsport.com: 403 Client Error: Forbidden for url: http://www.blingbandsport.com/
[36m(extract_links pid=25071)[0m Request failed for http://www.nicolemera.com: 403 Client Error: Forbidden for url: http://nicolemera.com/


[33m(raylet)[0m [2024-07-25 15:33:16,047 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639609856; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25069)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25073)[0m Request failed for http://blackownednj.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
[36m(extract_links pid=25071)[0m Request failed for http://national-bolt.com: 403 Client Error: Forbidden for url: https://www.national-bolt.com/


[33m(raylet)[0m [2024-07-25 15:33:26,054 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639491072; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:33:36,059 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639220736; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25074)[0m Request failed for http://www.ig-ns.org: 403 Client Error: Forbidden for url: http://ig-ns.org/


[33m(raylet)[0m [2024-07-25 15:33:46,160 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639196160; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://customturfoutlet.com: 403 Client Error: Forbidden for url: http://customturfoutlet.com/


[33m(raylet)[0m [2024-07-25 15:33:56,260 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18638458880; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25068)[0m Request failed for http://thequailbarn.com: 406 Client Error: Not Acceptable for url: http://thequailbarn.com/
[36m(extract_links pid=25071)[0m Request failed for http://howmanysyrians.com: 500 Server Error: Internal Server Error for url: http://howmanysyrians.com/
[36m(extract_links pid=25074)[0m Request failed for http://winsomedesign.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
[36m(extract_links pid=25069)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/


[33m(raylet)[0m [2024-07-25 15:34:06,358 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18638909440; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:34:16,456 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18638897152; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25068)[0m Request failed for http://islandfishermanmagazine.com: 403 Client Error: Forbidden for url: http://islandfishermanmagazine.com/


[33m(raylet)[0m [2024-07-25 15:34:26,554 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18638868480; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:34:36,554 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18638548992; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://cultivateflx.com: 406 Client Error: Not Acceptable for url: http://cultivateflx.com/
[36m(extract_links pid=25066)[0m Request failed for http://magiccarwashinc.com: 406 Client Error: Not Acceptable for url: http://magiccarwashinc.com/
[36m(extract_links pid=25065)[0m Request failed for http://www.cobbtuning.com: 403 Client Error: Forbidden for url: http://www.cobbtuning.com/
[36m(extract_links pid=25065)[0m Request failed for http://ekosaja.woodway-shop.com: 403 Client Error: Forbidden for url: https://ekosaja.woodway-shop.com/


[33m(raylet)[0m [2024-07-25 15:34:46,556 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18637500416; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25073)[0m Request failed for http://seeinggodshand.co: 402 Client Error: Payment Required for url: https://seeinggodshand.co/
[36m(extract_links pid=25071)[0m Request failed for http://saxklyfe.com: 402 Client Error: Payment Required for url: https://saxklyfe.com/


[33m(raylet)[0m [2024-07-25 15:34:56,555 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636894208; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:35:06,561 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636648448; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:35:16,659 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636603392; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:35:26,754 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 1863

[36m(extract_links pid=25067)[0m Request failed for http://shop.dutchbros.com: 403 Client Error: Forbidden for url: https://shop.dutchbros.com/
[36m(extract_links pid=25072)[0m Request failed for http://bkreativegc.myshopify.com: 402 Client Error: Payment Required for url: https://bkreativegc.myshopify.com/
[36m(extract_links pid=25073)[0m Request failed for http://kippot4less.com: 403 Client Error: Forbidden for url: http://kippot4less.com/
[36m(extract_links pid=25069)[0m Request failed for http://www.plusgami.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
[36m(extract_links pid=25073)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/


[33m(raylet)[0m [2024-07-25 15:35:36,852 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636263424; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:35:46,951 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18640556032; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25069)[0m Request failed for http://www.stsidedetailing.com: 404 Client Error: Not Found for url: http://www.stsidedetailing.com/


[33m(raylet)[0m [2024-07-25 15:35:56,953 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18639364096; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25074)[0m Request failed for http://retone.nl: HTTPConnectionPool(host='retone.nl', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10689dc90>: Failed to resolve 'retone.nl' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25070)[0m Request failed for http://humangarage.net: 403 Client Error: Forbidden for url: https://humangarage.net/


[33m(raylet)[0m [2024-07-25 15:36:07,050 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636541952; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:36:17,145 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636541952; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25066)[0m Request failed for http://wacf.org: 403 Client Error: Forbidden for url: https://www.wacf.org/


[33m(raylet)[0m [2024-07-25 15:36:27,244 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636509184; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:36:37,340 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18635075584; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:36:47,433 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18635030528; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:36:57,525 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 1863

[36m(extract_links pid=25072)[0m Request failed for http://nauticawebshop.com: HTTPSConnectionPool(host='nauticawebshop.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x115f61a90>, 'Connection to nauticawebshop.com timed out. (connect timeout=None)'))


[33m(raylet)[0m [2024-07-25 15:37:58,097 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18636185600; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:38:08,193 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18633945088; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://magikalelite.com: 404 Client Error: Not Found for url: http://magikalelite.com/
[36m(extract_links pid=25073)[0m Request failed for http://flowerofcarmelrosaries.com: 406 Client Error: Not Acceptable for url: http://flowerofcarmelrosaries.com/
[36m(extract_links pid=25066)[0m Request failed for http://superiordaily.com: 404 Client Error: Not Found for url: https://superiordaily.com/
[36m(extract_links pid=25070)[0m Request failed for http://sheladesigns.com: 406 Client Error: Not Acceptable for url: http://sheladesigns.com/


[33m(raylet)[0m [2024-07-25 15:38:18,290 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18633834496; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://infowarsteam.com: 403 Client Error: Forbidden for url: http://infowarsteam.com/
[36m(extract_links pid=25074)[0m Request failed for http://adasportsandrackets.com: 403 Client Error: Forbidden for url: https://adasportsandrackets.com/
[36m(extract_links pid=25066)[0m Request failed for http://dmarkperformance.com: 406 Client Error: Not Acceptable for url: https://dmarkperformance.com/


[33m(raylet)[0m [2024-07-25 15:38:28,388 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18633768960; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://adam-ny.com: 406 Client Error: Not Acceptable for url: http://adam-ny.com/
[36m(extract_links pid=25071)[0m Request failed for http://www.natural-wonder-pets.com: 403 Client Error: Forbidden for url: http://www.natural-wonder-pets.com/
[36m(extract_links pid=25072)[0m Request failed for http://justicebrewscoffee.com: HTTPConnectionPool(host='justicebrewscoffee.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x110f62e90>: Failed to resolve 'justicebrewscoffee.com' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25069)[0m Request failed for http://spicnspancarpet.com: 406 Client Error: Not Acceptable for url: http://spicnspancarpet.com/


[33m(raylet)[0m [2024-07-25 15:38:38,484 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18633555968; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25070)[0m Request failed for http://www.roomstogo.com: 403 Client Error: Forbidden for url: https://www.roomstogo.com/


[33m(raylet)[0m [2024-07-25 15:38:48,581 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18637701120; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:38:58,674 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18635366400; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:39:08,773 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18634956800; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25074)[0m Request failed for http://www.julx.co.uk: HTTPConnectionPool(host='www.julx.co.uk', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x107db8250>: Failed to resolve 'www.julx.co.uk' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25074)[0m Request failed for http://journeyprintz.com: HTTPConnectionPool(host='journeyprintz.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x107e05010>: Failed to resolve 'journeyprintz.com' ([Errno 8] nodename nor servname provided, or not known)"))


[33m(raylet)[0m [2024-07-25 15:39:18,865 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18634911744; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25070)[0m Request failed for http://www.katm.in: 406 Client Error: Not Acceptable for url: http://www.katm.in/
[36m(extract_links pid=25068)[0m Request failed for http://covecutlery.com: HTTPSConnectionPool(host='covecutlery.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'covecutlery.com'. (_ssl.c:1006)")))
[36m(extract_links pid=25067)[0m Request failed for http://narahita.com: 444 Client Error:  for url: http://narahita.com/


[33m(raylet)[0m [2024-07-25 15:39:28,959 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18634780672; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25073)[0m Request failed for http://estartshop.com: 404 Client Error: Not Found for url: http://estartshop.com/
[36m(extract_links pid=25066)[0m Request failed for http://wheelworx.net: 403 Client Error: Forbidden for url: https://wheelworx.net/
[36m(extract_links pid=25074)[0m Request failed for http://gcountyapparel.com: 406 Client Error: Not Acceptable for url: http://gcountyapparel.com/


[33m(raylet)[0m [2024-07-25 15:39:39,056 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18634244096; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25074)[0m Request failed for http://mycils.fr: 404 Client Error: Not Found for url: http://mycils.fr/


[33m(raylet)[0m [2024-07-25 15:39:49,147 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18634158080; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25069)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25066)[0m Request failed for http://labohemenouv.com: 402 Client Error: Payment Required for url: https://labohemenouv.com/


[33m(raylet)[0m [2024-07-25 15:39:59,244 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18631860224; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25065)[0m Request failed for http://myoceanstyle.com: 402 Client Error: Payment Required for url: https://myoceanstyle.com/


[33m(raylet)[0m [2024-07-25 15:40:09,342 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18630344704; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25073)[0m Request failed for http://duyanebeauty.com: 406 Client Error: Not Acceptable for url: http://duyanebeauty.com/


[33m(raylet)[0m [2024-07-25 15:40:19,342 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18630340608; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25068)[0m Request failed for http://jeepbeef.com: 403 Client Error: Forbidden for url: https://www.jeepbeef.com/
[36m(extract_links pid=25072)[0m Request failed for http://thecraftybutlers.com: 406 Client Error: Not Acceptable for url: http://thecraftybutlers.com/
[36m(extract_links pid=25069)[0m Request failed for http://www.poliso3d.com: 406 Client Error: Not Acceptable for url: http://www.poliso3d.com/
[36m(extract_links pid=25073)[0m Request failed for http://buyerarea.co.uk: HTTPSConnectionPool(host='www.buyerarea.co.uk', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))


[33m(raylet)[0m [2024-07-25 15:40:29,437 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18630176768; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:40:39,437 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18629140480; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25074)[0m Request failed for http://mineralmafia.com: 404 Client Error: Not Found for url: http://mineralmafia.com/
[36m(extract_links pid=25071)[0m Request failed for http://pineandsapling.com: 403 Client Error: Forbidden for url: http://pineandsapling.com/
[36m(extract_links pid=25073)[0m Request failed for http://www.dailysmartdeals.com: HTTPSConnectionPool(host='www.dailysmartdeals.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))


[33m(raylet)[0m [2024-07-25 15:40:49,534 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18629128192; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25066)[0m Request failed for http://bifrostfarms.com: 403 Client Error: Forbidden for url: https://bifrostfarms.com/
[36m(extract_links pid=25074)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/
[36m(extract_links pid=25071)[0m Request failed for http://thechuffedstore.com: 403 Client Error: Forbidden for url: http://thechuffedstore.com/
[36m(extract_links pid=25070)[0m Request failed for http://bombaycrystal.com: HTTPSConnectionPool(host='bombaycrystal.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1006)')))


[33m(raylet)[0m [2024-07-25 15:40:59,627 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18628227072; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25070)[0m Request failed for http://madsprt.com: 404 Client Error: Not Found for url: https://madsprt.com/
[36m(extract_links pid=25067)[0m Request failed for http://westernstatestool.com: 403 Client Error: Forbidden for url: http://westernstatestool.com/
[36m(extract_links pid=25066)[0m Request failed for http://apollotea.com: 406 Client Error: Not Acceptable for url: http://apollotea.com/
[36m(extract_links pid=25070)[0m Request failed for http://www.crashsports.org: 406 Client Error: Not Acceptable for url: http://www.crashsports.org/


[33m(raylet)[0m [2024-07-25 15:41:09,725 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627129344; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://dinosandtiaras.com: HTTPConnectionPool(host='dinosandtiaras.com', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x1160ad9d0>: Failed to resolve 'dinosandtiaras.com' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25071)[0m Request failed for http://www.thanksgiving.org: 403 Client Error: Forbidden for url: http://thanksgiving.org/
[36m(extract_links pid=25065)[0m Request failed for http://chamberscosmetics.com: 404 Client Error: Not Found for url: http://chamberscosmetics.com/


[33m(raylet)[0m [2024-07-25 15:41:19,821 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627055616; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://www.dollargeneral.com: 403 Client Error: Forbidden for url: http://www.dollargeneral.com/


[33m(raylet)[0m [2024-07-25 15:41:29,911 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18626916352; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25070)[0m Request failed for http://hmhasports.com: 406 Client Error: Not Acceptable for url: http://hmhasports.com/
[36m(extract_links pid=25067)[0m Request failed for http://www.jcpenney.com: 403 Client Error: Forbidden for url: http://www.jcpenney.com/
[36m(extract_links pid=25074)[0m Request failed for http://yd.org: 403 Client Error: Forbidden for url: http://yd.org/


[33m(raylet)[0m [2024-07-25 15:41:40,012 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625486848; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25073)[0m Request failed for http://www.garicleadership.com: 406 Client Error: Not Acceptable for url: http://www.garicleadership.com/


[33m(raylet)[0m [2024-07-25 15:41:50,107 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625417216; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25067)[0m Request failed for http://kitchenforeveryoneyork.org: 403 Client Error: Forbidden for url: https://kitchenforeveryoneyork.org/


[33m(raylet)[0m [2024-07-25 15:42:00,196 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625421312; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://lovebirth.co: 404 Client Error: Not Found for url: https://lovebirth.co/
[36m(extract_links pid=25068)[0m Request failed for http://diamonds-beautyline.com: 510 Server Error: Not Extended for url: https://diamonds-beautyline.com/
[36m(extract_links pid=25072)[0m Request failed for http://emirates-house.com: 406 Client Error: Not Acceptable for url: http://emirates-house.com/


[33m(raylet)[0m [2024-07-25 15:42:10,287 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18619793408; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25065)[0m Request failed for http://thehotcommodityboutique.com: 403 Client Error: Forbidden for url: http://thehotcommodityboutique.com/
[36m(extract_links pid=25071)[0m Request failed for http://trimfootco.com: 403 Client Error: Forbidden for url: https://trimfootco.com/


[33m(raylet)[0m [2024-07-25 15:42:20,384 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18620178432; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://shopevolveera.com: 402 Client Error: Payment Required for url: https://shopevolveera.com/
[36m(extract_links pid=25069)[0m Request failed for http://solekickz2.com: 403 Client Error: Forbidden for url: http://solekickz2.com/
[36m(extract_links pid=25073)[0m Request failed for http://eliavintage.com: 404 Client Error: Not Found for url: https://www.eliavintage.com/


[33m(raylet)[0m [2024-07-25 15:42:30,482 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18618068992; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25065)[0m Request failed for http://focusman.co.uk: 402 Client Error: Payment Required for url: https://focusman.co.uk/


[33m(raylet)[0m [2024-07-25 15:42:40,482 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18615648256; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25068)[0m Request failed for http://digitalgaget.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


[33m(raylet)[0m [2024-07-25 15:42:50,577 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18615390208; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25069)[0m Request failed for http://www.gunguardusa.com: HTTPSConnectionPool(host='www.gunguardusa.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1006)')))
[36m(extract_links pid=25070)[0m Request failed for http://shop.by.bioeticgoods.com: 402 Client Error: Payment Required for url: https://shop.by.bioeticgoods.com/


[33m(raylet)[0m [2024-07-25 15:43:00,670 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625236992; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25065)[0m Request failed for http://ionacraftshop.com: 406 Client Error: Not Acceptable for url: http://ionacraftshop.com/


[33m(raylet)[0m [2024-07-25 15:43:10,764 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18628616192; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://www.jonathanweisstours.com: 406 Client Error: Not Acceptable for url: http://www.jonathanweisstours.com/
[36m(extract_links pid=25072)[0m Request failed for http://youblob.com: HTTPSConnectionPool(host='youblob.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'youblob.com'. (_ssl.c:1006)")))


[33m(raylet)[0m [2024-07-25 15:43:20,853 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627948544; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25071)[0m Request failed for http://westerlyandco.nz: 402 Client Error: Payment Required for url: https://westerlyandco.nz/


[33m(raylet)[0m [2024-07-25 15:43:30,943 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18628890624; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25066)[0m Request failed for http://www.orientalperfumery.be: HTTPConnectionPool(host='www.orientalperfumery.be', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x113b963d0>: Failed to resolve 'www.orientalperfumery.be' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25067)[0m Request failed for http://www.maainteriors.com.au: HTTPConnectionPool(host='www.maainteriors.com.au', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x1058c2190>: Failed to resolve 'www.maainteriors.com.au' ([Errno 8] nodename nor servname provided, or not known)"))
[36m(extract_links pid=25070)[0m Request failed for http://whistlingduckfarm.com: 403 Client Error: Forbidden for url: http://whistlingduckfarm.com/
[36m(extract_links pid=25074)[0m Request failed for http://courtneyharper.com: 404 Client Erro

[33m(raylet)[0m [2024-07-25 15:43:41,030 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18628239360; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:43:51,125 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627973120; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:44:01,223 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18629971968; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:44:11,316 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 1862

[36m(extract_links pid=25070)[0m Request failed for http://paprkut.com: 402 Client Error: Payment Required for url: https://paprkut.com/
[36m(extract_links pid=25066)[0m Request failed for http://hffi.org: 403 Client Error: Forbidden for url: http://hffi.org/


[33m(raylet)[0m [2024-07-25 15:44:31,499 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18627629056; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25072)[0m Request failed for http://forwardthinkingag.com: 404 Client Error: Not Found for url: https://www.forwardthinkingag.com/
[36m(extract_links pid=25071)[0m Request failed for http://lovelikewhiskey.com: 404 Client Error: Not Found for url: http://lovelikewhiskey.com/
[36m(extract_links pid=25065)[0m Request failed for http://nightingale.com: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


[33m(raylet)[0m [2024-07-25 15:44:41,596 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625908736; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:44:51,834 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625630208; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:45:01,923 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18628308992; capacity: 494384795648. Object creation will fail if spilling is required.
[33m(raylet)[0m [2024-07-25 15:45:12,015 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 1862

[36m(extract_links pid=25073)[0m Request failed for http://bestbuytrading.in: HTTPConnectionPool(host='bestbuytrading.in', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x107fe3b50>: Failed to establish a new connection: [Errno 64] Host is down'))


[33m(raylet)[0m [2024-07-25 15:45:32,201 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625855488; capacity: 494384795648. Object creation will fail if spilling is required.


[36m(extract_links pid=25067)[0m Request failed for http://www.dawnbrolin.com: 403 Client Error: Forbidden for url: http://www.dawnbrolin.com/
[36m(extract_links pid=25069)[0m Request failed for http://www.kool4u.com.au: HTTPConnectionPool(host='www.kool4u.com.au', port=80): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x110b3a310>: Failed to resolve 'www.kool4u.com.au' ([Errno 8] nodename nor servname provided, or not known)"))


[33m(raylet)[0m [2024-07-25 15:45:42,293 E 25061 28995314] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2024-07-25_15-30-43_457913_24735 is over 95% full, available space: 18625880064; capacity: 494384795648. Object creation will fail if spilling is required.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['links'] = all_results


                                                                             links
seed_url                                                                          
http://nataliehuggins.com        [#page, /, /about-1, /epk, /lessons, /yoga, /e...
http://fauxynaturalhaircare.com  [#MainContent, /cart, /collections/all, /, /, ...
http://winkeyless.kr             [#site-navigation, #content, https://winkeyles...
http://anna-goodman.com          [#MainContent, /, /collections, /collections/c...
http://darrenbooth.com           [/cart, #page, /, /work, /illustration, /lette...
...                                                                            ...
http://invernoscent.com          [#site-main, /, #, #, #, /account/login, /cart...
http://siobeauty.com             [#main-content, /collections/smoothing-silicon...
http://simonclaridge.com         [/, /cart, /search, /, /new, /shop, /about, /s...
http://carpediemmarkers.com      [/, /register?returnUrl=%2F, /login?returnUrl=...
http

In [44]:
def extract_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a', href=True)]
        return links
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return []
# 402 error
# https://paprkut.com/

# 403 error 
# http://www.dawnbrolin.com

# 404 error
# http://lovelikewhiskey.com

# 406 error
# http://ionacraftshop.com

# 500 error (unsolvable)
# http://howmanysyrians.com

# 444 error
# http://narahita.com

# connection aborted
# http://www.plusgami.com

# Max retries (unsolvable)
# http://lovelikewhiskey.com

