In [32]:
import json

from domus_analytica.config import DomusSettings
from domus_analytica.geopoint import GeoPoint

# Build the project settings object from the local env file `.env.local`;
# a later cell reads `mongo_uri` and `mongo_db_name` from it.
config = DomusSettings(_env_file=".env.local")

In [33]:
import pymongo

# Connect with the configured URI, then select the configured database.
mongo_client = pymongo.MongoClient(config.mongo_uri)
domus_db = mongo_client.get_database(config.mongo_db_name)

# Handles for the two SUUMO collections in that database.
suumo_search, suumo_details = (
    domus_db.get_collection(collection_name)
    for collection_name in ("suumo_search", "suumo_details")
)

In [38]:
from typing import Optional
import re
import pandas as pd
import datetime

# One flat dict per listing is collected here and turned into a DataFrame later.
table_data = []

# Only documents whose `search_time` equals this exact timestamp are processed.
snapshot_time = datetime.datetime(2024, 4, 1, 14, 46, 31, 449000)

for doc in suumo_details.find({"search_time": snapshot_time}):
    id_url = doc["search_details"]["url"]
    result_doc = dict(id=id_url)
    # Detail label -> detail text; when a label repeats, the last entry wins.
    content_details = dict(
        (entry["type"], entry["content"]) for entry in doc["content_details"]
    )


    def get_first(regexp: str) -> Optional[str]:
        """Return the content of the first detail whose type label matches `regexp`.

        The pattern is applied with `re.match`, i.e. anchored at the start of
        the label. Details are scanned in their stored order from the current
        `doc`; `None` is returned when no label matches.
        """
        matching_contents = (
            entry["content"]
            for entry in doc["content_details"]
            if re.match(regexp, entry["type"]) is not None
        )
        return next(matching_contents, None)


    result_doc["name"] = content_details["物件名"]
    # Only the first line of the address detail is kept.
    result_doc["address"] = content_details["住所"].split("\n")[0]

    if "gps" in doc:
        lat, lon = doc["gps"]["latitude"], doc["gps"]["longitude"]
        result_doc["lat"] = lat
        result_doc["lon"] = lon
    if "価格" in content_details:
        # The price column is expressed in 万円 (units of 10,000 yen).
        # Amounts of 100 million yen and above are written with 億
        # (e.g. "1億2800万円" or "1億円"); reading only the number in front of
        # "万円" would silently drop the 億 part, so both parts are parsed.
        price_text = content_details["価格"]
        price_match = next(
            (
                m
                for m in re.finditer(
                    r"(?:([0-9]+)億)?(?:([+-]?(?:[0-9]*[.])?[0-9]+)万)?円",
                    price_text,
                )
                # Both groups are optional, so a bare "円" also matches; skip those.
                if m.group(1) or m.group(2)
            ),
            None,
        )
        if price_match is None:
            # Fail with context, like the area parsing does, instead of a bare IndexError.
            raise ValueError("Can't get price from {}".format(price_text))
        oku, man = price_match.groups(default="0")
        # 1億 = 10,000万, so convert the 億 part into 万円 before adding.
        result_doc["price"] = float(oku) * 10000 + float(man)

    # Shared pattern for both area details: a decimal number followed by "m2" or "㎡".
    area_pattern = "([+-]?([0-9]*[.])?[0-9]+)(m2|㎡)"

    # Exclusive area: the first number found; a present but unparsable value is an error.
    if "専有面積" not in content_details:
        print(f"専有面積 can not be found in content_details of {id_url}")
    else:
        exclusive_matches = re.findall(area_pattern, content_details["専有面積"])
        if not exclusive_matches:
            raise ValueError("Can't get area from {}".format(content_details["専有面積"]))
        result_doc["exclusive_area"] = float(exclusive_matches[0][0])

    # Other area: every number found in the detail is added up (0 when none match).
    if "その他面積" not in content_details:
        print(f"その他面積 can not be found in content_details of {id_url}")
    else:
        other_matches = re.findall(area_pattern, content_details["その他面積"])
        result_doc["common_area"] = sum(float(found[0]) for found in other_matches)

    # Completion date: taken from the first detail whose label mentions
    # 完成時期 or 築年月, and normalised to the first day of that month.
    completion_date = get_first(".*?(完成時期|築年月).*?")
    if completion_date:
        try:
            cd = re.findall(r"(\d{4})年(\d+)月", completion_date)[0]
            result_doc["completion_date"] = f"{int(cd[0])}-{int(cd[1]):02d}-01"
        except Exception:
            print(f"Can't get time from {completion_date} in {content_details}")
            # Bare raise keeps the original traceback intact.
            raise

    # Layout: main part such as "3LDK", plus the number of "+S" storage rooms.
    layout = content_details.get("間取り")
    if layout:
        layout_match = re.search(r"\d(?:L|D|K)+", layout)
        if layout_match is None:
            # Name the offending text instead of failing with a bare IndexError.
            raise ValueError(f"Can't get layout from {layout}")
        result_doc["layout_main"] = layout_match.group(0)
        storage_room = re.findall(r"\+(\d{0,1})S", layout)
        if storage_room:
            # "+S" without a digit counts as one storage room.
            result_doc["layout_storage_room"] = int(storage_room[0] or 1)
        else:
            result_doc["layout_storage_room"] = 0

    direction = content_details.get("向き")
    if direction:
        result_doc["direction"] = direction

    # re.search finds the keyword anywhere in the title; re.match would only
    # test the very start of the title and miss it everywhere else.
    result_doc["pet"] = re.search("ペット", doc["search_details"]["title"]) is not None

    # Floor of the unit: exact label first, otherwise the first label starting with 所在階.
    the_floor = content_details.get("所在階", get_first("所在階"))
    if the_floor:
        floor_match = re.search(r"(\d+)階", the_floor)
        if floor_match is None:
            raise ValueError(f"Can't get floor from {the_floor}")
        result_doc["floor"] = int(floor_match.group(1))

    # Total floors of the building, from the first detail whose label contains 階建.
    total_floors = get_first(".*?階建.*?")
    if total_floors:
        try:
            result_doc["total_floors"] = int(re.findall(r"(\d+)階建", total_floors)[0])
        except Exception:
            print(f"Can't parse {total_floors}")
            raise


    # Number of results stored for each nearby-place category present on the document.
    for t in ["restaurant", "supermarket", "convenience_store", "drugstore", "park", "cafe", "bus_station",
              "primary_school"]:
        if f"nearby_{t}" in doc:
            result_doc[f"{t}_count"] = len(doc[f"nearby_{t}"]["results"])

    if "gps" in doc:
        this_location = GeoPoint.parse_obj(doc["gps"])
        # Nearest station: the first stored result is used.
        # NOTE(review): this presumes the results are ordered by distance - confirm
        # against the code that populated `nearby_train_station`.
        if "nearby_train_station" in doc:
            station_results = doc["nearby_train_station"]["results"]
            # The result list may be empty (no station found); leave the column
            # unset for this listing instead of crashing with an IndexError.
            if station_results:
                station_location = station_results[0]["geometry"]["location"]
                result_doc["distance_to_nearest_station"] = (
                        GeoPoint(latitude=station_location["lat"], longitude=station_location["lng"])
                        - this_location
                )
        # Distances to fixed reference points. Subtracting two GeoPoints yields the
        # distance value stored here (unit defined by GeoPoint - TODO confirm).
        for p, n in [
            (
                    GeoPoint(latitude=33.59118086094799, longitude=130.398581611983),
                    "tenjin",
            ),
            (
                    GeoPoint(latitude=33.5873955705478, longitude=130.41968891935684),
                    "hakata"
            ),
            (
                    GeoPoint(latitude=33.59030230439562, longitude=130.37888950301377),
                    "ohori_park"
            ),
        ]:
            result_doc[f"distance_to_{n}"] = this_location - p

    def get_monthly_fee(key, details=None):
        """Sum every "...円／月" amount found in one detail text, in yen.

        Amounts may be written with a 万 (10,000) part, e.g. "1万2000円／月" or
        "1万円／月", and may contain thousands separators; matching only the
        digits directly before "円／月" would drop the 万 part.

        Args:
            key: Detail label to read, e.g. "管理費".
            details: Mapping of label -> text. Defaults to the current
                listing's `content_details`.

        Returns:
            The summed monthly amount as a float, or 0 when the label is
            missing or its text holds no "円／月" amount.
        """
        if details is None:
            details = content_details
        fee_text = details.get(key)
        if not fee_text:
            return 0
        amounts = []
        for man, yen in re.findall(r"(?:(\d[\d,]*)万)?(\d[\d,]*)?円／月", fee_text):
            # Both groups are optional, so a bare "円／月" matches too; ignore it.
            if not (man or yen):
                continue
            amounts.append(
                float(man.replace(",", "") or 0) * 10000
                + float(yen.replace(",", "") or 0)
            )
        return sum(amounts)


    # Result column -> detail label it is parsed from. The dict order is also the
    # order in which the columns are added to the record.
    fee_columns = {
        "monthly_fee_manage": "管理費",
        "monthly_fee_repair": "修繕積立金",
        "monthly_fee_repair_fund": "修繕積立基金",
        "monthly_fee_others": "諸費用",
    }
    for column, label in fee_columns.items():
        result_doc[column] = get_monthly_fee(label)
    # Total of the four fee columns above.
    result_doc["monthly_fee_total"] = sum(result_doc[column] for column in fee_columns)

    # The record for this listing is complete.
    table_data.append(result_doc)

# One row per collected listing dict; the dict keys become the columns.
df = pd.DataFrame(table_data)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,id,name,address,lat,lon,price,exclusive_area,common_area,completion_date,layout_main,...,primary_school_count,distance_to_nearest_station,distance_to_tenjin,distance_to_hakata,distance_to_ohori_park,monthly_fee_manage,monthly_fee_repair,monthly_fee_repair_fund,monthly_fee_others,monthly_fee_total
0,/ms/chuko/fukuoka/sc_fukuokashihigashi/nc_7437...,モントーレブルー・ラ・メール Fukuoka,福岡県福岡市東区西戸崎２-1-30,33.6461037815746,130.3599501447,2080.0,57.25,11.04,2007-01-01,2LDK,...,0,0.502634,7.085989,8.566679,6.455506,7800.0,3400.0,0,800.0,12000.0
1,/ms/chuko/fukuoka/sc_fukuokashihigashi/nc_7460...,ファーネスト箱崎公園,福岡県福岡市東区松田２,33.6203572036973,130.443550474538,2080.0,69.85,38.10,2008-06-01,3LDK,...,1,0.804436,5.285324,4.284875,6.865806,7600.0,9100.0,0,0.0,16700.0
2,/ms/chuko/fukuoka/sc_fukuokashihigashi/nc_7250...,Asobi+　JGMヴェルデ香椎南弐番館,福岡県福岡市東区八田１,33.63406257298,130.456704058744,2490.0,93.33,24.55,2001-06-01,4LDK,...,4,0.875963,7.199107,6.226393,8.705188,7500.0,4900.0,0,0.0,12400.0
3,/ms/chuko/fukuoka/sc_fukuokashiminami/nc_73552...,エイルヴィラクレアテュール井尻南II,福岡県福岡市南区曰佐２,33.540890670713,130.438664435095,2580.0,75.42,20.88,1999-05-01,3LDK,...,5,1.319945,6.720793,5.468257,7.810530,7430.0,8300.0,0,0.0,15730.0
4,/ms/chuko/fukuoka/sc_fukuokashihakata/nc_74582...,アンピール吉塚,福岡県福岡市博多区吉塚６,33.6075918806721,130.429482478425,2780.0,65.10,14.00,2000-11-01,3LDK,...,7,0.548731,3.398220,2.424849,5.070826,5200.0,6840.0,0,0.0,12040.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,/ms/chuko/fukuoka/sc_fukuokashisawara/nc_73999...,グランドメゾン百道浜,福岡県福岡市早良区百道浜３,33.5915565318577,130.352199721123,7800.0,94.02,18.10,2008-01-01,2LDK,...,4,2.661638,4.301424,6.275862,2.479023,3100.0,6500.0,0,200.0,9800.0
647,/ms/chuko/fukuoka/sc_fukuokashichuo/nc_73742555/,ネクサス薬院,福岡県福岡市中央区薬院１,33.5820367654343,130.397959959864,7980.0,78.92,19.75,2015-06-01,3LDK,...,8,0.387372,1.019605,2.101639,1.993657,6800.0,300.0,0,1100.0,8200.0
648,/ms/chuko/fukuoka/sc_fukuokashisawara/nc_74606...,フリーディア西新プレミアム,福岡県福岡市早良区西新６,33.5857458518373,130.354769050504,7980.0,81.77,26.46,2022-05-01,3LDK,...,6,2.763019,4.107896,6.023474,2.293688,2100.0,7600.0,0,250.0,9950.0
649,/ms/chuko/fukuoka/sc_fukuokashiminami/nc_74158...,レークヒルズ野多目,福岡県福岡市南区野多目３,33.542951,130.420432,2580.0,89.57,19.12,2002-03-01,3LDK,...,6,1.894546,5.739001,4.948263,6.529692,6400.0,3100.0,0,0.0,9500.0


In [39]:
# Persist the per-listing records as JSON; the path is relative to the
# notebook's working directory.
with open("data/table_data.json", mode="w") as output_file:
    json.dump(table_data, output_file)