In [2]:
import requests
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from loguru import logger
import pandas as pd
from selenium import webdriver

In [3]:
# Search-result listing on suumo.jp that the notebook scrapes.  The query
# string carries the search filters; it is split over several literals purely
# for readability -- the pieces concatenate to a single URL.
URL = (
    "https://suumo.jp/jj/chintai/ichiran/FR301FC001/"
    "?ar=030&bs=040&pc=30&smk=&po1=25&po2=99"
    "&shkr1=03&shkr2=03&shkr3=03&shkr4=03"
    "&rn=0220&ek=022027580&ek=022018410&ra=013"
    "&cb=9.5&ct=13.5&md=02&md=03&md=04"
    "&ts=1&et=10&mb=30&mt=9999999&cn=9999999"
    "&tc=0400301&fw2="
)

In [4]:
class SuumoParser():
    """Stateless helpers that extract fields from a parsed SUUMO room page.

    Each public method takes ``soup`` -- a parsed page exposing a
    BeautifulSoup-style ``find_all(name, class_=...)`` -- and returns a
    plain ``dict``.  An ``IndexError`` is raised when an expected table,
    note row or span is absent from the page.
    """

    @staticmethod
    def _to_yen(text, label, unit, yen_per_unit):
        """Convert a price string such as ``'敷金:\\xa08.5万円'`` to integer yen.

        Args:
            text: Raw text of the price element.
            label: Leading caption to remove (``''`` when there is none).
            unit: Trailing unit to remove (``'円'`` or ``'万円'``).
            yen_per_unit: Multiplier that turns the number into yen.

        Returns:
            ``0`` when the amount is shown as ``'-'``, otherwise the amount
            in yen as an ``int``.

        Raises:
            ValueError: If what remains is not a number.
        """
        amount = text.replace(label, '').replace(unit, '')
        # Tolerate padding and thousands separators around the number.
        amount = amount.replace(',', '').strip()
        if amount == '-':
            return 0
        # round(), not int(): float("8.12") * 10000 is 81199.99999999999 and
        # truncation would silently lose one yen.
        return round(float(amount) * yen_per_unit)

    @staticmethod
    def _station_names(raw):
        """Join the part after the first '/' of every non-blank line with ','."""
        names = []
        for line in raw.strip().split("\n"):
            line = line.strip()
            if not line:
                # Blank separator lines carry no station information.
                continue
            parts = line.split("/")
            # A line without '/' is kept whole instead of raising IndexError.
            names.append(parts[1].strip() if len(parts) > 1 else line)
        return ",".join(names)

    @staticmethod
    def data_table_parser(soup):
        """Read the first ``data_table table_gaiyou`` table into a dict.

        Headers (``th``) and cells (``td``) are paired positionally; the
        header text is the key and the stripped cell text is the value.
        """
        data_table = soup.find_all(
            "table", class_="data_table table_gaiyou")[0]
        ths = data_table.find_all("th")
        tds = data_table.find_all("td")
        return {th.text: td.text.strip() for th, td in zip(ths, tds)}

    @staticmethod
    def main_table_parser(soup):
        """Read the first ``property_view_table`` table into a dict.

        Post-processing:
            * ``駅徒歩`` becomes a comma-separated string holding, for each
              line, the text after the first ``/``.
            * ``専有面積`` is converted to ``float`` after removing ``m2``.

        Raises:
            KeyError: If ``駅徒歩`` or ``専有面積`` is not among the headers.
            ValueError: If the area is not numeric.
        """
        main_table = soup.find_all("table", class_="property_view_table")[0]
        ths = main_table.find_all("th", class_="property_view_table-title")
        tds = main_table.find_all("td", class_="property_view_table-body")
        data = {th.text: td.text.strip() for th, td in zip(ths, tds)}

        # Post-processing
        data["駅徒歩"] = SuumoParser._station_names(data["駅徒歩"])
        data["専有面積"] = float(data["専有面積"].replace("m2", ""))
        return data

    @staticmethod
    def cost_html_parser(soup):
        """Extract rent, common fee, deposit and key money as integer yen.

        The first ``property_view_note-list`` div supplies the rent and the
        common fee (spans 0 and 1); the second supplies the deposit and the
        key money.  Amounts written as ``'-'`` become ``0``.

        Returns:
            dict with keys ``rental_fee``, ``common_fee``, ``deposit`` and
            ``key_money``.
        """
        notes = soup.find_all("div", class_="property_view_note-list")
        first_row = notes[0].find_all("span")
        second_row = notes[1].find_all("span")

        to_yen = SuumoParser._to_yen
        return {
            "rental_fee": to_yen(first_row[0].text, '', '万円', 10000),
            "common_fee": to_yen(
                first_row[1].text, '管理費・共益費:\xa0', '円', 1),
            "deposit": to_yen(second_row[0].text, '敷金:\xa0', '万円', 10000),
            "key_money": to_yen(
                second_row[1].text, '礼金:\xa0', '万円', 10000),
        }

In [5]:
# Walk the paginated search results and collect the href of every room link.
MAX_PAGES = 9  # highest result page that will be requested
LISTING_REQUEST_TIMEOUT = 10  # seconds; without it a stalled socket hangs the cell

hrefs = []
for page in range(1, MAX_PAGES + 1):
    # Page 1 is the bare search URL; later pages append an explicit &page=N.
    paging_url = URL if page == 1 else f"{URL}&page={page}"

    logger.info(f"Getting page {page}")
    response = requests.get(paging_url, timeout=LISTING_REQUEST_TIMEOUT)
    # An HTTP error page has no room links and would otherwise be mistaken
    # for the end of the results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    room_links = soup.find_all(
        "a", class_="js-cassette_link_href cassetteitem_other-linktext")
    if not room_links:
        logger.info(f"There is no more room in page {page}: stop")
        break
    hrefs.extend(room_link.attrs["href"] for room_link in room_links)
else:
    # The loop ended without hitting an empty page: results may be truncated.
    logger.warning(
        f"Stopped after {MAX_PAGES} pages; later pages were not fetched")

logger.info(f"Found total {len(hrefs)} hrefs")

2024-01-28 05:47:14.715 | INFO     | __main__:<module>:8 - Getting page 1
2024-01-28 05:47:15.806 | INFO     | __main__:<module>:8 - Getting page 2
2024-01-28 05:47:16.652 | INFO     | __main__:<module>:8 - Getting page 3
2024-01-28 05:47:17.430 | INFO     | __main__:<module>:18 - There is no more room in page 3: stop
2024-01-28 05:47:17.432 | INFO     | __main__:<module>:21 - Found total 66 hrefs


In [6]:
# Fetch every room page and flatten its parsed fields into one record each.
parsed_uri = urlparse(URL)
# The collected hrefs are joined onto the scheme + host of the search URL.
base_url = f"{parsed_uri.scheme}://{parsed_uri.netloc}"
ROOM_REQUEST_TIMEOUT = 10  # seconds; without it a stalled socket hangs the cell

data = []
for href in hrefs:
    room_url = base_url + href
    logger.info(f"Scrape {room_url}")
    try:
        response = requests.get(room_url, timeout=ROOM_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page must not discard the rooms already scraped.
        logger.warning(f"Request failed for {room_url}: {exc}")
        continue
    soup = BeautifulSoup(response.text, "html.parser")

    # Pages without the title header are skipped.
    title_tag = soup.find_all("h1", class_="section_h1-header-title")
    if not title_tag:
        logger.warning(f"No title found, skipping {room_url}")
        continue
    title = title_tag[0].text

    try:
        cost_data = SuumoParser.cost_html_parser(soup)
        main_table_data = SuumoParser.main_table_parser(soup)
        data_table_data = SuumoParser.data_table_parser(soup)
    except (IndexError, KeyError, ValueError) as exc:
        # The parsers raise these when a page lacks an expected element or
        # holds a non-numeric amount; skip that listing and keep going.
        logger.warning(f"Could not parse {room_url}: {exc!r}")
        continue

    # The feature list is optional; fall back to an empty list.
    features_div = soup.find_all("div", class_="bgc-wht ol-g")
    if features_div:
        features = sorted(features_div[0].text.strip().split("、"))
    else:
        features = []

    data.append({
        "title": title,
        "url": room_url,
        **cost_data,
        **main_table_data,
        **data_table_data,
        "features": features,
    })

2024-01-28 05:47:17.451 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000045882323/?bc=100357943343
2024-01-28 05:47:18.129 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000086472028/?bc=100362633909
2024-01-28 05:47:18.784 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000087503103/?bc=100360178698
2024-01-28 05:47:19.502 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000087761026/?bc=100350122973
2024-01-28 05:47:20.182 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000087761025/?bc=100337493945
2024-01-28 05:47:20.771 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000086548005/?bc=100359980717
2024-01-28 05:47:21.337 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000086548006/?bc=100353412566
2024-01-28 05:47:21.943 | INFO     | __main__:<module>:6 - Scrape https://suumo.jp/chintai/jnc_000085689211/?bc=100355281771


In [7]:
# Build the comparison table: derive fee columns, order the columns, filter,
# de-duplicate and sort by price per square metre.
MAX_TOTAL_FEE = 135000  # yen; rows above this total are discarded

df = pd.DataFrame(data)
df["total_fee"] = df["common_fee"] + df["rental_fee"]
df["fee_per_m2"] = df["total_fee"] / df["専有面積"]

# Columns of interest come first, the rest follow in their original order.
interest = ["title", "url", "total_fee", "fee_per_m2",
            "専有面積", "築年数", "所在地", "構造", "rental_fee", "deposit", "key_money"]
columns = df.columns
# A list comprehension instead of a set difference: set iteration order over
# strings changes between kernel restarts, which reshuffled the output
# columns from run to run.
other_col = [col for col in columns if col not in interest]
new_columns = interest + other_col
df = df[new_columns]

# Drop rows whose total fee exceeds the threshold.
df = df[df["total_fee"] <= MAX_TOTAL_FEE]

# Rows that agree on all of these columns are treated as duplicates; only the
# first one is kept.
columns_to_check_duplicates = [
    "total_fee", "fee_per_m2", "専有面積", "築年数", "rental_fee", "deposit", "key_money"]
df = df.drop_duplicates(subset=columns_to_check_duplicates, keep='first')

# Lowest fee per square metre first; reassigned rather than sorted in place.
df = df.sort_values(by=["fee_per_m2"], ascending=True)

In [8]:
# Write the table to CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf="suumo_watcher.csv", index=False)

In [9]:
# Write the same table to an Excel workbook, leaving out the DataFrame index.
df.to_excel(excel_writer="suumo_watcher.xlsx", index=False)