In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# URL template for the SUUMO listing pages (path: chintai/tokyo/sc_shinjuku).
# The {} placeholder is filled with the page number.
url = "https://suumo.jp/chintai/tokyo/sc_shinjuku/?page={}"

# URL of the first results page.
target_url = url.format(1)

print(target_url)

# Fetch the first page. requests has no default timeout, so set one explicitly,
# and fail loudly on an HTTP error instead of parsing an error page.
r = requests.get(target_url, timeout=30)
r.raise_for_status()

# Parse the HTML. The parser is named explicitly so the result does not depend
# on which optional parsers happen to be installed (and to avoid
# GuessedAtParserWarning).
soup = BeautifulSoup(r.text, 'html.parser')

# One 'cassetteitem' <div> per building listed on the page.
contents = soup.find_all('div', class_='cassetteitem')

https://suumo.jp/chintai/tokyo/sc_shinjuku/?page=1


In [3]:
def _clean(text):
    """Return text with newlines, carriage returns, tabs and full-width spaces removed."""
    for ch in ('\n', '\r', '\t', '\u3000'):
        text = text.replace(ch, '')
    return text


# The last pagination link's text is used as the total page count.
# When the pagination list is absent (select_one returns None), fall back to
# a single page instead of crashing on None.text.
last_page_link = soup.select_one('ol.pagination-parts li:last-child a')
page = last_page_link.text if last_page_link is not None else '1'
page_max = int(page)

# One dict per room row; converted to a DataFrame in a later cell.
d_list = []

# range() excludes its stop value, so +1 is needed to include the last page.
for page_number in range(1, page_max + 1):
    page_url = url.format(page_number)
    page_response = requests.get(page_url, timeout=30)
    page_response.raise_for_status()

    # Parse THIS page's HTML; the building list must be re-read for every page,
    # otherwise the first page would be scraped page_max times.
    page_soup = BeautifulSoup(page_response.text, 'html.parser')
    contents = page_soup.find_all('div', class_='cassetteitem')

    for content in contents:
        detail = content.find('div', class_='cassetteitem-detail')
        table = content.find('table', class_='cassetteitem_other')

        # Building-level fields, shared by every room row below.
        title = _clean(detail.find('div', class_='cassetteitem_content-title').text)
        address = _clean(detail.find('li', class_='cassetteitem_detail-col1').text)
        access = _clean(detail.find('li', class_='cassetteitem_detail-col2').text)
        # No trailing comma here: a trailing comma would turn the value into a
        # 1-tuple and break the later .str.extract on the 'age' column.
        age = _clean(detail.find('li', class_='cassetteitem_detail-col3').text)

        tr_tags = table.find_all('tr', class_='js-cassette_link')

        for tr_tag in tr_tags:
            # Cells 2..5 of each row: floor, rent block, initial-cost block, layout block.
            floor, price, first_fee, capacity = tr_tag.find_all('td')[2:6]
            # Each block is expected to hold exactly two <li> items.
            fee, management_fee = price.find_all('li')
            deposit, gratuity = first_fee.find_all('li')
            madori, menseki = capacity.find_all('li')

            d = {
                'title': title,
                'address': address,
                'access': access,
                'age': age,
                'floor': _clean(floor.text),
                'fee': fee.text,
                'management_fee': management_fee.text,
                'deposit': deposit.text,
                'gratuity': gratuity.text,
                'madori': madori.text,
                'menseki': menseki.text
            }

            d_list.append(d)

In [4]:
# Build the table from the scraped records (one row per room).
df = pd.DataFrame(d_list)

# Remove duplicate rows. Assignment instead of inplace=True keeps the cell
# re-runnable and avoids mutating a frame in place.
df = df.drop_duplicates(subset=['title', 'address', 'age', 'floor', 'menseki'])

# --- Data cleansing ---
# Nearest station: the name between '/' and '駅' in the access text.
df['station'] = df['access'].str.extract(
    r'/([\u4E00-\u9FFF]+[\u4E00-\u9FFFぁ-んァ-ン]*)駅', expand=False
)
# Building age: keep only the digits of '築N年' (rows that do not match become NaN).
df['age'] = df['age'].str.extract(r'築(\d+)年', expand=False)
# Management fee: '-' means none; strip the '円' unit and convert to int.
df['management_fee'] = df['management_fee'].replace('-', '0').str.replace('円', '').astype(int)
# Floor area: strip the 'm2' unit and convert to float.
df['menseki'] = df['menseki'].astype(str).str.replace('m2', '').astype(float)

# Rent, deposit and key money are given in units of 万円 (10,000 yen);
# convert them to integer yen. Round before casting: a float product can land
# just below the intended whole number, and astype(int) truncates, which would
# drop 1 yen.
for column in ('fee', 'deposit', 'gratuity'):
    amount_man = df[column].astype(str).replace('-', '0').str.replace('万円', '').astype(float)
    df[column] = (amount_man * 10000).round().astype(int)

In [26]:
#Google spreadsheet APIを操作するライブラリをインポート
import gspread
from gspread_dataframe import set_with_dataframe

In [31]:
# Authentication, following https://docs.gspread.org/en/v5.12.0/oauth2.html
from google.oauth2.service_account import Credentials

# API scopes requested for the service account.
scopes = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive']

# Load the service-account key file from the working directory.
credentials = Credentials.from_service_account_file('secretkey.json', scopes=scopes)

# Authorized gspread client used by the following cells.
gc = gspread.authorize(credentials)

In [32]:
# Spreadsheet key and worksheet name to operate on.
SP_SHEET_KEY, SP_SHEET = '1XTzkjgKNMJAVdrqMHikvjPNGNcvhQwBxPvVi7P6IC70', 'houselist'

# Open the spreadsheet by key, then pick the worksheet by name.
sh = gc.open_by_key(SP_SHEET_KEY)
worksheet = sh.worksheet(SP_SHEET)

In [35]:
# Write df into the selected worksheet, starting at row 1, column 1.
set_with_dataframe(worksheet=worksheet, dataframe=df, row=1, col=1)