In [9]:
from random import randint
from time import sleep
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [10]:
def get_address(soup):
    """Return the stripped text of the listing page's address heading.

    Looks up the first ``<h1>`` matching the class string below and returns
    its text with surrounding whitespace removed.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The heading text, or ``""`` when the lookup raises ``AttributeError``
        (for example because ``find`` returned ``None``).
    """
    # NOTE(review): hashed class names look build-generated and may change
    # whenever the site is redeployed - confirm before relying on them.
    heading_class = 'Text__StyledText-rui__sc-19ei9fn-0 dEYYQ TypeBody__StyledBody-rui__sc-163o7f1-0 gVxVge'
    try:
        return soup.find("h1", attrs={"class": heading_class}).text.strip()
    except AttributeError:
        return ""

def get_price(soup):
    """Return the stripped text of the listing page's price element.

    Looks up the first ``<div>`` matching the class string below and returns
    its text with surrounding whitespace removed. The text is returned as
    found on the page; no numeric conversion happens here.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The price text, or ``""`` when the lookup raises ``AttributeError``
        (for example because ``find`` returned ``None``).
    """
    price_class = 'Price__Component-rui__x3geed-0 gipzbd'
    try:
        return soup.find("div", attrs={"class": price_class}).text.strip()
    except AttributeError:
        return ""

def get_bedroom(soup):
    """Return the stripped text of the bedroom entry's first ``<span>``.

    Finds the ``<li>`` matching the bedroom class string, then the first
    ``<span>`` inside it, and returns that span's text without surrounding
    whitespace.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The span text, or ``""`` when either lookup step raises
        ``AttributeError`` (for example because a ``find`` returned ``None``).
    """
    item_selector = {"class": "PropertyBedMetastyles__StyledPropertyBedMeta-rui__a4nnof-0 EMhcO"}
    try:
        list_item = soup.find("li", attrs=item_selector)
        value_span = list_item.find("span")
        return value_span.text.strip()
    except AttributeError:
        return ""

def get_bathroom(soup):
    """Return the stripped text of the bathroom entry's first ``<span>``.

    Finds the ``<li>`` matching the bathroom class string, then the first
    ``<span>`` inside it, and returns that span's text without surrounding
    whitespace.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The span text, or ``""`` when either lookup step raises
        ``AttributeError`` (for example because a ``find`` returned ``None``).
    """
    item_selector = {"class": "PropertyBathMetastyles__StyledPropertyBathMeta-rui__sc-67m6bo-0 kPVhSw"}
    try:
        list_item = soup.find("li", attrs=item_selector)
        value_span = list_item.find("span")
        return value_span.text.strip()
    except AttributeError:
        return ""

def get_land_size_acre_lot(soup):
    """Return the stripped text of the lot-size entry's first ``<span>``.

    Finds the ``<li>`` matching the lot-size class string, then the first
    ``<span>`` inside it, and returns that span's text without surrounding
    whitespace. The text is returned verbatim, unit wording included.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The span text, or ``""`` when either lookup step raises
        ``AttributeError`` (for example because a ``find`` returned ``None``).
    """
    item_selector = {"class": "PropertyLotSizeMetastyles__StyledPropertyLotSizeMeta-rui__sc-1cz4zco-0 iGmjzN"}
    try:
        list_item = soup.find("li", attrs=item_selector)
        value_span = list_item.find("span")
        return value_span.text.strip()
    except AttributeError:
        return ""

def get_flat_size_sqft(soup):
    """Return the stripped text of the square-footage entry's first ``<span>``.

    Finds the ``<li>`` matching the sqft class string, then the first
    ``<span>`` inside it, and returns that span's text without surrounding
    whitespace. No numeric conversion happens here.

    Args:
        soup: Parsed detail page; anything exposing a BeautifulSoup-style
            ``find(name, attrs=...)`` works.

    Returns:
        The span text, or ``""`` when either lookup step raises
        ``AttributeError`` (for example because a ``find`` returned ``None``).
    """
    item_selector = {"class": "PropertySqftMetastyles__StyledPropertySqftMeta-rui__sc-1gdau7i-0 eucnnL"}
    try:
        list_item = soup.find("li", attrs=item_selector)
        value_span = list_item.find("span")
        return value_span.text.strip()
    except AttributeError:
        return ""

In [22]:
def absolute_listing_urls(hrefs, site_root="https://www.realtor.com/"):
    """Resolve raw ``href`` values into unique absolute URLs.

    Args:
        hrefs: Iterable of ``href`` attribute values; entries may be ``None``
            or empty when an anchor has no usable ``href``.
        site_root: Base URL that relative hrefs are resolved against.

    Returns:
        List of absolute URLs in first-seen order, without duplicates.
    """
    resolved = []
    seen = set()
    for href in hrefs:
        if not href:
            # An anchor without an href would break string concatenation.
            continue
        # urljoin handles both "/path" and "path" without doubling the slash.
        url = urljoin(site_root, href)
        if url in seen:
            continue
        seen.add(url)
        resolved.append(url)
    return resolved


def fetch_soup(url, headers, timeout=30):
    """Download ``url`` and parse it with ``html.parser``.

    Args:
        url: Absolute URL to request.
        headers: Request headers passed to ``requests.get``.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        A ``BeautifulSoup`` document, or ``None`` when the request fails or
        the server answers with an HTTP error status. Failures are printed
        rather than raised so one bad page does not abort the whole run.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None
    return BeautifulSoup(response.content, "html.parser")


def clean_listing_frame(records, numeric_fields=('price', 'bedroom', 'bathroom', 'flat_size_sqft')):
    """Build the listings DataFrame and convert the numeric columns to float.

    Args:
        records: Mapping of column name to a list of scraped strings; all
            lists must have the same length and include an ``address`` column.
        numeric_fields: Columns to convert to ``float``.

    Returns:
        A new DataFrame in which empty strings are NaN, rows without an
        address are dropped, the index is reset, and each numeric column is
        float. Text that is still non-numeric after stripping ``$``, commas,
        whitespace and a literal ``sqft`` becomes NaN instead of raising.
    """
    frame = pd.DataFrame.from_dict(records)
    frame = frame.replace('', np.nan)
    frame = frame.dropna(subset=['address']).reset_index(drop=True)

    columns = list(numeric_fields)
    # Raw string, and "sqft" matched as a word: a character class such as
    # [sqft] would strip any single s/q/f/t character instead.
    stripped = frame[columns].replace(r'[$,\s]|sqft', '', regex=True)
    frame[columns] = stripped.apply(pd.to_numeric, errors='coerce').astype(float)
    return frame


if __name__ == '__main__':
    HEADERS = ({'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'})
    BASE_URL = "https://www.realtor.com/realestateandhomes-search/Alaska/pg-{}"
    PAGE_START = 1
    PAGE_LIMIT = 2  # set the number of pages to scrape here
    REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned
    d = {"address":[], "price":[], "bedroom":[], "bathroom":[],"flat_size_sqft":[], "land_size_acre_lot":[]}

    for i in range(PAGE_START, PAGE_LIMIT+1):
        URL = BASE_URL.format(i)
        soup = fetch_soup(URL, HEADERS, REQUEST_TIMEOUT)
        if soup is None:
            # Results page could not be fetched; move on to the next one.
            continue
        links = soup.find_all("a", attrs={'rel':'noopener'})
        links_list = absolute_listing_urls(link.get('href') for link in links)
        for link in links_list:
            new_soup = fetch_soup(link, HEADERS, REQUEST_TIMEOUT)
            if new_soup is not None:
                # Append to every column together so the lists stay aligned.
                d['address'].append(get_address(new_soup))
                d['price'].append(get_price(new_soup))
                d['bedroom'].append(get_bedroom(new_soup))
                d['bathroom'].append(get_bathroom(new_soup))
                d['flat_size_sqft'].append(get_flat_size_sqft(new_soup))
                d['land_size_acre_lot'].append(get_land_size_acre_lot(new_soup))

            # sleep for a random interval to avoid being detected as a bot
            sleep(randint(1,3))

    # convert string values to float for numeric fields
    numeric_fields = ['price', 'bedroom', 'bathroom','flat_size_sqft']
    AlaskaHouse_df = clean_listing_frame(d, numeric_fields)


In [23]:
# Bare expression: the notebook renders the listings DataFrame as this cell's output.
AlaskaHouse_df

Unnamed: 0,address,price,bedroom,bathroom,flat_size_sqft,land_size_acre_lot
0,"L35 Wooden Wheel Cv, Port Protection, AK 99950",585500.0,6.0,4.0,2000.0,0.64acre lot
1,"8992 N Douglas Hwy, Juneau, AK 99801",575000.0,4.0,2.0,2204.0,0.76acre lot
2,"731 Constitution Dr, Fairbanks, AK 99709",90000.0,5.0,2.0,2026.0,2.54acre lot
3,"63261 Hyacinth Loop, Remote, AK 99611",125000.0,,,,54.52acre lot
4,"3708 Amalga St, Juneau, AK 99801",110000.0,2.0,2.0,980.0,
5,"Uyak Bay, Larsen Bay, AK 99615",250000.0,2.0,1.0,1900.0,13acre lot
6,"415 Rawn Way, Juneau, AK 99801",165000.0,4.0,2.0,1423.0,"2,446sqft lot"
7,"2323 Campbell Pl, Anchorage, AK 99507",250000.0,3.0,3.0,1733.0,"7,841sqft lot"
8,"14912 Petwood Cir, Talkeetna, AK 99676",149000.0,1.0,1.0,768.0,1.61acre lot
9,"280 Parks Hwy, Clear, AK 99760",90000.0,2.0,1.5,1050.0,5acre lot


In [None]:
# Write the listings table to disk as CSV: column names go on the first row
# and the DataFrame index is left out of the file.
AlaskaHouse_df.to_csv(
    "Alaska_house_data.csv",
    index=False,
    header=True,
)

# Confirmation message so the reader knows the export step finished.
print(
    "Scraping completed successfully and data has been saved to the file 'Alaska_house_data.csv'!"
)