In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np

In [4]:
# 基本URLと最大ページ数の設定
base_url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&pc=30&smk=&po1=25&po2=99&shkr1=03&shkr2=03&shkr3=03&shkr4=03&sc=13103&sc=13113&sc=13109&sc=13111&ta=13&cb=0.0&ct=9999999&et=9999999&mb=0&mt=9999999&cn=30&fw2="
max_page = 5  # 最大ページ数

all_data = []

for page in range(1, max_page + 1):
    url = base_url.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    items = soup.findAll("div", {"class": "cassetteitem"})


    print("page", page, "items", len(items))

    for item in items:
        base_data = {}
        base_data["名称"]     = item.find("div", {"class": "cassetteitem_content-title"}).get_text(strip=True) if item.find("div", {"class": "cassetteitem_content-title"}) else None
        base_data["カテゴリ"] = item.find("div", {"class": "cassetteitem_content-label"}).span.get_text(strip=True) if item.find("div", {"class": "cassetteitem_content-label"}) else None
        base_data["アドレス"] = item.find("li", {"class": "cassetteitem_detail-col1"}).get_text(strip=True) if item.find("li", {"class": "cassetteitem_detail-col1"}) else None
        
        # 駅のアクセス情報をまとめて取得
        base_data["アクセス"] = ", ".join([station.get_text(strip=True) for station in item.findAll("div", {"class": "cassetteitem_detail-text"})])

        construction_info = item.find("li", {"class": "cassetteitem_detail-col3"}).find_all("div") if item.find("li", {"class": "cassetteitem_detail-col3"}) else None
        base_data["築年数"] = construction_info[0].get_text(strip=True) if construction_info and len(construction_info) > 0 else None
        base_data["構造"] = construction_info[1].get_text(strip=True) if construction_info and len(construction_info) > 1 else None

        tbodys = item.find("table", {"class": "cassetteitem_other"}).findAll("tbody")

        for tbody in tbodys:
            data = base_data.copy()
            # 階数情報の正確な取得
            floor_info = tbody.find_all("td")[2].get_text(strip=True) if len(tbody.find_all("td")) > 2 else None
            data["階数"]   = floor_info
            data["家賃"]   = tbody.select_one(".cassetteitem_price--rent").get_text(strip=True) if tbody.select_one(".cassetteitem_price--rent") else None
            data["管理費"] = tbody.select_one(".cassetteitem_price--administration").get_text(strip=True) if tbody.select_one(".cassetteitem_price--administration") else None
            data["敷金"]   = tbody.select_one(".cassetteitem_price--deposit").get_text(strip=True) if tbody.select_one(".cassetteitem_price--deposit") else None
            data["礼金"]   = tbody.select_one(".cassetteitem_price--gratuity").get_text(strip=True) if tbody.select_one(".cassetteitem_price--gratuity") else None
            data["間取り"] = tbody.select_one(".cassetteitem_madori").get_text(strip=True) if tbody.select_one(".cassetteitem_madori") else None
            data["面積"]   = tbody.select_one(".cassetteitem_menseki").get_text(strip=True) if tbody.select_one(".cassetteitem_menseki") else None

            # 物件画像・間取り画像・詳細URLの取得を最後に行う
            property_image_element = item.find(class_="cassetteitem_object-item")
            data["物件画像URL"] = property_image_element.img["rel"] if property_image_element and property_image_element.img else None

            floor_plan_image_element = item.find(class_="casssetteitem_other-thumbnail")
            data["間取画像URL"] = floor_plan_image_element.img["rel"] if floor_plan_image_element and floor_plan_image_element.img else None

            property_link_element = item.select_one("a[href*='/chintai/jnc_']")
            data["物件詳細URL"] = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&pc=30&smk=&po1=25&po2=99&shkr1=03&shkr2=03&shkr3=03&shkr4=03&sc=13103&sc=13113&sc=13109&sc=13111&ta=13&cb=0.0&ct=9999999&et=9999999&mb=0&mt=9999999&cn=30&fw2=" +property_link_element['href'] if property_link_element else None ## 不動産サイトから詳細URLリンクを読み解き作成

            all_data.append(data)    

page 1 items 30
page 2 items 30
page 3 items 30
page 4 items 30
page 5 items 30


In [5]:
df = pd.DataFrame(all_data)
df = df.drop_duplicates() # 重複データの削除
df.head(2)

Unnamed: 0,名称,カテゴリ,アドレス,アクセス,築年数,構造,階数,家賃,管理費,敷金,礼金,間取り,面積,物件画像URL,間取画像URL,物件詳細URL
0,パークタワーグランスカイ,賃貸マンション,東京都品川区東五反田２,"ＪＲ山手線/五反田駅 歩6分, ＪＲ山手線/大崎駅 歩6分, 東急池上線/大崎広小路駅 歩10分",築14年,地下2地上44階建,7階,23.4万円,-,46.8万円,23.4万円,1LDK,44.39m2,https://img01.suumo.com/front/gazo/fr/bukken/5...,https://img01.suumo.com/front/gazo/fr/bukken/5...,https://suumo.jp/jj/chintai/ichiran/FR301FC001...
1,パークタワーグランスカイ,賃貸マンション,東京都品川区東五反田２,"ＪＲ山手線/五反田駅 歩6分, ＪＲ山手線/大崎駅 歩6分, 東急池上線/大崎広小路駅 歩10分",築14年,地下2地上44階建,21階,23.5万円,10000円,23.5万円,35.25万円,1LDK,41.36m2,https://img01.suumo.com/front/gazo/fr/bukken/5...,https://img01.suumo.com/front/gazo/fr/bukken/5...,https://suumo.jp/jj/chintai/ichiran/FR301FC001...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218 entries, 0 to 226
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   名称       218 non-null    object
 1   カテゴリ     218 non-null    object
 2   アドレス     218 non-null    object
 3   アクセス     218 non-null    object
 4   築年数      218 non-null    object
 5   構造       218 non-null    object
 6   階数       218 non-null    object
 7   家賃       218 non-null    object
 8   管理費      218 non-null    object
 9   敷金       218 non-null    object
 10  礼金       218 non-null    object
 11  間取り      218 non-null    object
 12  面積       218 non-null    object
 13  物件画像URL  218 non-null    object
 14  間取画像URL  218 non-null    object
 15  物件詳細URL  218 non-null    object
dtypes: object(16)
memory usage: 29.0+ KB
