In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import geopandas as gpd
import warnings
from sklearn.neighbors import BallTree
import lightgbm as lgb
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# Display all columns
pd.set_option('display.max_columns', None)
input_path = '../input/'
gdf_land = gpd.read_file(os.path.join(input_path, 'L02-25.geojson'))
df_land = gdf_land.copy()
df_land['price'] = df_land['L02_008']
df = pd.read_csv(os.path.join(input_path, 'train.csv'))
test = pd.read_csv(os.path.join(input_path, 'test.csv'))
gdf_land.info()

In [18]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree

def preprocess_data(input_df, df_land, is_train=True):
    """
    データフレームを受け取り、特徴量エンジニアリングを行って返す関数
    
    Args:
        input_df (pd.DataFrame): 処理したいデータ (train or test)
        df_land (pd.DataFrame): 国土数値情報の地価データ (lat, lon, price列必須)
        is_train (bool): 学習用データかどうか (Trueなら目的変数を作る)
    
    Returns:
        df (pd.DataFrame): 処理後のデータ
        features (list): 学習に使う特徴量のリスト
        target (pd.Series or None): 学習用の場合、変換済みの目的変数 (対数単価)
    """
    # 元のデータを壊さないようにコピー
    df = input_df.copy()
    if is_train:
        # 例えば「200㎡以上」は外れ値として捨てる
        # (分布を見て閾値は調整してください。一旦200にしておきます)
        df = df[df['unit_area'] < 2000]
    # -----------------------------------------------------
    # 1. 築年数の計算 (yyyymm -> 月数)
    # -----------------------------------------------------
    # 文字列にして日付型に変換
    df['temp_time'] = pd.to_datetime(df['year_built'].astype(str), format='%Y%m', errors='coerce')
    
    # 基準日（2025年12月など）からの月数を計算
    base_date = pd.to_datetime('2025-12-01')
    # 日数 ÷ 30.44 で「月数」にする (float)
    df['building_month'] = (base_date - df['temp_time']).dt.days / 30.44
    df['building_month'] = df['building_month'].astype(float) # 念のため型変換

    # -----------------------------------------------------
    # 2. 国土数値情報（地価）の結合 (BallTree)
    # -----------------------------------------------------
    # 地価データの準備 (NaN削除 & ラジアン変換)
    df_land['lat'] = df_land.geometry.y
    df_land['lon'] = df_land.geometry.x
    land_clean = df_land[['lat', 'lon', 'price']].dropna()
    land_rad = np.deg2rad(land_clean[['lat', 'lon']])
    input_rad = np.deg2rad(df[['lat', 'lon']])
    
    # BallTree構築 (metric='haversine' で地球の丸みを考慮)
    # ※毎回作ると少し重いので、本来は関数の外で作って渡すのがベストですが、今回はここでやります
    tree = BallTree(land_rad, metric='haversine')
    
    # 最寄り検索 (k=1)
    dists, indices = tree.query(input_rad, k=1)
    
    # 結合
    df['land_price'] = land_clean['price'].values[indices.flatten()]
    df['dist_to_land_price'] = dists.flatten() * 6371 * 1000 # メートル変換
    
    # 地価の対数変換 (特徴量として使いやすくする)
    df['log_land_price'] = np.log1p(df['land_price'])

    # -----------------------------------------------------
    # 3. カテゴリ変数の処理
    # -----------------------------------------------------
    cat_cols = ['addr1_2', 'layout', 'direction', 'structure'] # 必要に応じて追加
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).fillna('unknown').astype('category')
    
    # -----------------------------------------------------
    # 4. 特徴量リストの定義
    # -----------------------------------------------------
    # 数値特徴量 + カテゴリ特徴量
    num_features = ['unit_area', 'log_land_price', 'dist_to_land_price', 'building_month']
    features = num_features + [c for c in cat_cols if c in df.columns]

    # -----------------------------------------------------
    # 5. 目的変数の作成 (Trainのみ)
    # -----------------------------------------------------
    target = None
    if is_train:
        # 単価を計算して対数変換 (MAPE対策)
        # room_price_per_area がなければ money_room / unit_area で作る
        if 'room_price_per_area' not in df.columns:
             # money_roomがある前提
             current_price = df['money_room']
        else:
             current_price = df['room_price_per_area'] * df['unit_area'] # 一旦総額に戻すなど調整必要かも

        # 今回のコンペに合わせて調整：
        # 「単価」を予測ターゲットにする場合
        unit_price = df['money_room'] / df['unit_area']
        target = np.log1p(unit_price)

    return df, features, target

In [16]:
#一気に前処理
df_train_processed, feature_cols, y_train = preprocess_data(df, df_land, is_train=True)
df_train_processed.head()

Unnamed: 0,target_ym,money_room,building_id,building_status,building_create_date,building_modify_date,building_type,building_name,building_name_ruby,homes_building_name,homes_building_name_ruby,unit_count,full_address,lon,lat,building_structure,total_floor_area,building_area,floor_count,basement_floor_count,year_built,building_land_area,land_area_all,unit_area_min,unit_area_max,building_land_chimoku,land_youto,land_toshi,land_chisei,land_area_kind,land_setback_flg,land_setback,land_kenpei,land_youseki,land_road_cond,land_seigen,building_area_kind,management_form,management_association_flg,reform_exterior,reform_exterior_other,reform_exterior_date,reform_common_area,reform_common_area_date,building_tag_id,unit_id,unit_name,name_ruby,room_floor,balcony_area,dwelling_unit_window_angle,room_count,unit_area,floor_plan_code,reform_date,reform_place,reform_place_other,reform_wet_area,reform_wet_area_other,reform_wet_area_date,reform_interior,reform_interior_other,reform_interior_date,reform_etc,renovation_date,renovation_etc,unit_tag_id,bukken_id,snapshot_create_date,new_date,snapshot_modify_date,timelimit_date,bukken_type,flg_investment,empty_number,empty_contents,post1,post2,addr1_1,addr1_2,addr2_name,addr3_name,nl,el,rosen_name1,eki_name1,bus_stop1,bus_time1,walk_distance1,rosen_name2,eki_name2,bus_stop2,bus_time2,walk_distance2,traffic_other,traffic_car,snapshot_land_area,snapshot_land_shidou,land_shidou_a,land_shidou_b,land_mochibun_a,land_mochibun_b,house_area,flg_new,house_kanrinin,room_kaisuu,snapshot_window_angle,madori_number_all,madori_kind_all,money_kyoueki,money_kyoueki_tax,money_rimawari_now,money_shuuzen,money_shuuzenkikin,money_sonota_str1,money_sonota1,money_sonota_str2,money_sonota2,money_sonota_str3,money_sonota3,parking_money,parking_money_tax,parking_kubun,parking_distance,parking_number,parking_memo,genkyo_code,usable_status,usable_date,school_ele_name,school_ele_distance,school_ele_code,school_jun_name,school_jun_distance,school_jun_code,convenience_distance,super_distance,hospital_distance,park_distance,drugstore_distance,bank_distance,shopping_street_distance,est_other_name,est_other_distance,statuses,parking_keiyaku,money_hoshou_company,free_rent_duration,free_rent_gen_timing,temp_time,building_month,land_price,dist_to_land_price,log_land_price
0,201901,13980000,206271,1,2014-06-27 21:09:41,2019-01-24 00:10:08,4,,,桑名市東正和台7丁目10-11,,,三重県桑名市東正和台7丁目10-11,136.637467,35.047688,1.0,106.82,,2.0,,199204.0,188.490005,188.490005,,,1.0,1.0,1.0,,1.0,,,50.0,80.0,,,1.0,,,,外壁塗装,201807.0,,,210202/210301/210101,262186,,,,,,4.0,106.82,450.0,,,,1/2/3/4,,201807.0,,畳表替え,201807.0,,,,310501/220701/220601/230401,360840,2018-10-30 00:00:00,2018-10-30 00:00:00,2019-01-22 13:20:11,2019-02-05 00:00:00,1202,,1.0,,511.0,932.0,24,205,東正和台7丁目,,126159934.0,491905398.0,三岐鉄道北勢線,在良,,,1840.0,,,,,,コミュニティバス「東正和台」停より約 500m,,188.49,,,,,,106.82,0.0,,,,4,50,,3.0,,,,,,,,,,0.0,,1.0,,2.0,,2.0,1,,桑部小学校,2000.0,,正和中学校,2000.0,,,,,,,,,,,210101/220701/220601/230401/310501/210301/210202,,,,,NaT,,1,2085.85452,0.693147
1,201901,24480000,83315,1,2014-06-27 21:09:43,2019-04-03 00:10:08,4,,,桑名市松ノ木5丁目,,1.0,三重県桑名市松ノ木5丁目14-8,136.639936,35.074625,10.0,,,2.0,,198108.0,290.519989,,,,,1.0,1.0,1.0,2.0,1.0,1.2,50.0,80.0,1.0,,,,,2.0,外壁塗装,201706.0,,,330501/210301/210101/210201,35726,,,,,5.0,4.0,134.039993,450.0,,,,1/2/3/4,キッチン・ユニットバス・洗面化粧台コーティング/トイレ新品,201706.0,2/4,,201706.0,,2017-06-01,外装外壁塗装 内装システムキッチン新品/お風呂新品/洗面台新品/トイレ新品/フローリング上張...,260201/260301/220801/230601/260101/230101/2203...,267022,2018-10-30 00:00:00,2018-10-30 00:00:00,2019-01-30 15:15:49,2019-02-13 00:00:00,1202,,,,511.0,902.0,24,205,松ノ木5丁目,,126256831.0,491914000.0,三岐鉄道北勢線,星川,,,1920.0,,,,,,コミュニティバス「法務局前」停より約400m,,290.52,,,,,,134.04,0.0,,,,4,50,,3.0,,,,,,,,,,0.0,,1.0,,3.0,,2.0,1,,大山田南小学校,350.0,,陵成中学校,1100.0,,,,,,,,,,,210101/220701/220601/220801/230601/250201/2103...,,,,,NaT,,1,1576.877029,0.693147
2,201901,24480000,140201,1,2014-06-27 21:09:43,2020-06-06 00:10:11,4,,,桑名市松ノ木七丁目,,1.0,三重県桑名市松ノ木7丁目16-10,136.644708,35.072248,1.0,,,2.0,,199506.0,235.649994,,,,,1.0,1.0,1.0,2.0,1.0,1.2,50.0,80.0,2.0,,,,,2.0,外壁塗装,201804.0,,,210201/330501/334101/210101/210301/340301,116820,,,,,5.0,4.0,114.589996,450.0,,,,1/2/3/4,キッチン・ユニットバス・洗面化粧台コーティング/トイレ新品,201804.0,2/4,,201804.0,,2018-04-01,外装外壁塗装 内装システムキッチン新品/お風呂新品/洗面台新品/トイレ新品/フローリング上張...,260201/260301/220801/230601/260101/230101/2203...,194544,2018-10-30 00:00:00,2018-10-30 00:00:00,2019-01-30 15:15:49,2019-02-13 00:00:00,1202,,1.0,,511.0,902.0,24,205,松ノ木7丁目,,126248373.0,491931000.0,三岐鉄道北勢線,蓮花寺,,,2000.0,,,,,,コミュニティバス「松ノ木南」停より約350m,,235.65,,,,,,114.59,0.0,,,,4,50,,3.0,,,,,,,,,,0.0,,1.0,,2.0,,2.0,1,,大山田南小学校,850.0,,陵成中学校,1200.0,,,,,,,,,,,210101/220701/220601/230401/220801/310501/2306...,,,,,NaT,,1,1866.282851,0.693147
3,201901,16300000,216551,1,2014-06-27 19:24:11,2019-04-26 00:10:08,4,,,中古戸建 知多市八幡字荒井,,,愛知県知多市八幡荒井105-1,136.875602,35.003174,1.0,106.809998,,2.0,,200203.0,169.729996,,,,1.0,12.0,1.0,,1.0,1.0,,60.0,200.0,,準防火地域、文化財保護法,,,,,,,,,210101/210401/210301,281648,,,,,5.0,2.0,106.809998,250.0,,,,,,,,,,,,,220201/253501/253401/290401/290101/290901/2304...,345772,2018-12-01 00:00:00,2018-12-01 00:00:00,2019-01-30 21:54:10,2019-02-13 00:00:00,1202,0.0,0.0,,478.0,1.0,23,224,八幡字荒井,,125999722.0,492762730.0,名鉄常滑線,寺本,,,480.0,名鉄常滑線,尾張横須賀,,,1600.0,,,169.73,,,,,,106.81,0.0,,,,3,50,0.0,2.0,,0.0,,,,,,,,0.0,,1.0,0.0,3.0,,1.0,2,,八幡小学校,1400.0,,八幡中学校,1400.0,,,,,,,,,コンビニ,300.0,210101/290901/230401/210301/210401,,,,,NaT,,1,1180.869442,0.693147
4,201901,18800000,134968,1,2014-06-27 19:24:12,2025-09-15 02:49:23,1,ロイヤル知多寺本,,ロイヤル知多寺本,,32.0,愛知県知多市八幡荒古後12-2,136.875754,35.002403,4.0,76.739998,,6.0,,200703.0,,,,,,3.0,1.0,,,1.0,,60.0,200.0,,準防火地域,1.0,3.0,,,,,,,210101/310101/320101/321001/321101/320901/2102...,85898,102.0,,1.0,0.0,,3.0,76.739998,350.0,,,,,,,,,,,,,230501/220401/250101/220701/340101/220601/2909...,172718,2018-12-22 00:00:00,2018-12-22 00:00:00,2019-01-30 21:54:11,2019-02-13 00:00:00,1302,0.0,0.0,102.0,478.0,1.0,23,224,八幡字荒古後,,125996801.0,492763533.0,名鉄常滑線,寺本,,,320.0,名鉄常滑線,朝倉,,,1600.0,,,,,,,,,76.74,0.0,3.0,1.0,,3,50,8440.0,2.0,,3670.0,,,,,,,,3000.0,,1.0,0.0,,,2.0,2,,八幡小学校,1080.0,,八幡中学校,1410.0,,,,,,1060.0,,,駅,300.0,220801/220401/250101/220701/321101/220601/2304...,,,,,NaT,,1,1217.276397,0.693147


          lon        lat   price
0  141.307294  43.063568  259000
1  141.349437  43.036387  235000
2  141.348637  43.029990  204000
3  141.315828  43.035672   68500
4  141.310798  43.058387  383000
Constructing BallTree...
Querying nearest neighbors...


Done
         lat         lon  land_price  dist_to_land_price
0  35.047688  136.637467       40800        2.085855e+06
1  35.074625  136.639936       53200        1.576877e+06
2  35.072248  136.644708       53200        1.866283e+06
3  35.003174  136.875602      140000        1.180869e+06
4  35.002403  136.875754      140000        1.217276e+06


In [43]:
df[['log_land_price', 'log_room_price_per_area']].corr()

Unnamed: 0,log_land_price,log_room_price_per_area
log_land_price,1.0,0.766658
log_room_price_per_area,0.766658,1.0


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

# --- 1. データ準備（さっきと同じ） ---
# 特徴量はそのままでOK
features = ['unit_area', 'log_land_price', 'dist_to_land_price', 'building_month']

# ★必勝ポイント：ターゲットを「対数（Log）」にする！
# これでAIは「金額」ではなく「比率」を学習し始めます
y = np.log1p(df['room_price_per_area']) 

# --- 2. 分割 ---
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 3. 学習 ---
# 評価指標(metric)も 'mape' にしておくとログが見やすいですが、
# Log変換してるので 'mae' や 'rmse' でも内部的にはパーセント誤差を減らす動きになります
model = lgb.LGBMRegressor(random_state=42)
model.fit(X_train, y_train)

# --- 4. 予測＆評価 ---
# 予測結果は「ログ」で出てくるので、現実の価格に戻します (exp)
pred_log = model.predict(X_valid)
pred_price_per_m2 = np.expm1(pred_log)

# 正解データもログから戻す
actual_price_per_m2 = np.expm1(y_valid)

# 単価 × 面積 ＝ 総額
pred_total = pred_price_per_m2 * X_valid['unit_area']
actual_total = actual_price_per_m2 * X_valid['unit_area']

# --- 5. スコア計算（MAPE） ---
def get_mape(true, pred):
    return np.mean(np.abs((true - pred) / true)) * 100

score_mape = get_mape(actual_total, pred_total)
print(f"誤差率 (MAPE): {score_mape:.2f} %")

KeyError: "['building_month'] not in index"