In [1]:
# %%
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# %%
# Function to get all links from a webpage
def get_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        print(f"Error retrieving links from {url}: {e}")
        return []

In [3]:
# %%
# Function to check if a page contains washer or dryer keywords
def contains_inventory_keywords(page_content):
    inventory_keywords = ['washer', 'dryer']
    for keyword in inventory_keywords:
        if keyword.lower() in page_content.lower():
            return True
    return False

In [4]:
# %%
# Function to crawl the website and find washer and dryer inventory links
def find_washer_dryer_inventory(base_url):
    # Get all links from the base URL
    base_links = get_links(base_url)

    # Filter links that might lead to washer and dryer inventory
    inventory_links = [link for link in base_links if contains_inventory_keywords(link)]

    # Display the filtered inventory links
    print("Potential Washer and Dryer Inventory Links:")
    for link in inventory_links:
        print(urljoin(base_url, link))
    
    return inventory_links

In [5]:
# %%
# Function to check if a page contains washer or dryer keywords and a price element
def contains_inventory_and_price(link):
    inventory_keywords = ['washer', 'dryer']
    price_keywords = ['$', 'price']

    # Perform request and parse
    try:
        response = requests.get(link)
        page_content_raw = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        print(f"Error retrieving page content from {link}: {e}")
        return False
    
    page_content_lower = str(page_content_raw).lower()

    for keyword in inventory_keywords:
        if keyword.lower() in page_content_lower:
            # Check for the presence of a price-related element
            for price_keyword in price_keywords:
                if price_keyword.lower() in page_content_lower:
                    return True
    return False

In [6]:
# %%
# Function to crawl the website and find washer and dryer inventory links with prices
def find_washer_dryer_inventory_with_prices(lead_links):
    # Fetch page content and filter links that might lead to washer and dryer inventory with prices
    inventory_links = [link for link in lead_links if contains_inventory_and_price(link)]

    # Display the filtered inventory links
    print("Potential Washer and Dryer Inventory Links with Prices:")
    for link in inventory_links:
        print(link)

    return inventory_links

In [7]:
# %%
# Function to crawl the website and find washer and dryer inventory links with prices
def find_washer_dryer_inventory_with_prices(lead_links):
    # Fetch page content and filter links that might lead to washer and dryer inventory with prices
    inventory_links = [link for link in lead_links if contains_inventory_and_price(link)]

    # Display the filtered inventory links
    print("Potential Washer and Dryer Inventory Links with Prices:")
    for link in inventory_links:
        print(link)

    return inventory_links

In [8]:
# %%
# Function to crawl the website and find washer and dryer inventory links with prices
def find_washer_dryer_inventory_with_prices(lead_links):
    # Fetch page content and filter links that might lead to washer and dryer inventory with prices
    inventory_links = [link for link in lead_links if contains_inventory_and_price(link)]

    # Display the filtered inventory links
    print("Potential Washer and Dryer Inventory Links with Prices:")
    for link in inventory_links:
        print(link)

    return inventory_links


In [9]:
# %%
# Function to save DataFrame to CSV with timestamp in the filename
def save_to_csv(dataframe):
    # Get local system time for naming the file
    current_time = time.localtime()
    
    # Format the file name with year, month, day, hour, and minute
    file_name = f"crawl-{current_time.tm_year}-{current_time.tm_mon}-{current_time.tm_mday}_{current_time.tm_hour}-{current_time.tm_min}.csv"
    
    # Save DataFrame to CSV
    dataframe.to_csv(file_name, index=False)
    print(f"Data saved to {file_name}")

In [13]:
# Create a new DataFrame for crawl results
columns = ['name', 'place_id', 'website', 'inventory_link', 'monitor']
crawl_results_df = pd.DataFrame(columns=columns)

# Iterate over each row in the source DataFrame
for index, row in df.iterrows():
    website_url = row['website']
    
    # Find potential washer and dryer inventory links
    lead_links = find_washer_dryer_inventory(website_url)
    
    # Find potential washer and dryer inventory links with prices
    inventory_links_with_prices = find_washer_dryer_inventory_with_prices(lead_links)
    
    # Create a DataFrame with the current crawl results
    current_results_df = pd.DataFrame({
        'name': [row['name']] * len(inventory_links_with_prices),
        'place_id': [row['place_id']] * len(inventory_links_with_prices),
        'website': [website_url] * len(inventory_links_with_prices),
        'inventory_link': inventory_links_with_prices,
        'monitor': [False] * len(inventory_links_with_prices),
    }, columns=columns)

    # Concatenate the current results with the overall crawl_results_df
    crawl_results_df = pd.concat([crawl_results_df, current_results_df], ignore_index=True)


Potential Washer and Dryer Inventory Links:
https://www.conns.com/appliances/washers-and-dryers
https://www.conns.com/appliances/washers-and-dryers
https://www.conns.com/appliances/washers-and-dryers/top-load-washers/
https://www.conns.com/appliances/washers-and-dryers/front-load-washer
https://www.conns.com/appliances/washers-and-dryers/gas-dryers
https://www.conns.com/appliances/washers-and-dryers/electric-dryers
https://www.conns.com/appliances/washers-and-dryers/combo
https://www.conns.com/appliances/dishwashers
https://www.conns.com/appliances/dishwashers
https://www.conns.com/appliances/dishwashers/top-control
https://www.conns.com/appliances/dishwashers/front-control
https://www.conns.com/appliances/parts-and-accessories/washers-dryers
https://www.conns.com/deals/clearance/clearance-appliances/washers/
https://www.conns.com/clearance/appliances/dryers
https://www.conns.com/clearance/appliances/dishwashers
https://www.conns.com/buying-guide/washers-and-dryers
https://www.conns.co

In [14]:

# %%
# Save crawl results DataFrame to CSV
save_to_csv(crawl_results_df)

Data saved to crawl-2024-1-28_15-20.csv
