In [258]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error


# Render up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

# Base directory that the input/ and output/ file paths are built from.
root = "../"

In [259]:
def convertDummies(df, col, drop=True):
    """
    One-hot encode the given columns and append the indicator columns.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame to process. It is not modified; a new frame is returned.
    col : list of str or str
        Name(s) of the columns to one-hot encode. A single column name may
        be passed as a plain string.
    drop : bool, default True
        If True, each source column is removed after its indicator columns
        have been appended.

    Returns
    -------
    pd.DataFrame
        Data frame with the indicator columns appended on the right. The
        indicator columns are named after the category values themselves
        (no prefix is added).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(col, str):
        col = [col]

    for c in col:
        df = pd.concat([df, pd.get_dummies(df[c])], axis=1)
        if drop:
            df = df.drop([c], axis=1)

    return df

In [260]:
# Load the list-page and detail-page CSVs, discarding exact duplicate rows.
l = pd.read_csv(root+"input/list_page_data.csv").drop_duplicates()
d = pd.read_csv(root+"input/detail_page_data.csv").drop_duplicates()

## リストページ

In [261]:
# The three most frequent detail URLs; a count above 1 means the URL is repeated.
l["detail_url"].value_counts().head(3)

https://suumo.jp/chintai/jnc_000068802081/?bc=100256104423    2
https://suumo.jp/chintai/jnc_000057853373/?bc=100196884781    2
https://suumo.jp/chintai/jnc_000068632335/?bc=100252192140    2
Name: detail_url, dtype: int64

In [262]:
# Remove every row whose detail_url occurs more than once (no copy of a
# repeated URL is kept). Rows with a missing detail_url are retained, which
# matches value_counts() ignoring NaN.
detail_urls = l["detail_url"]
is_repeated = detail_urls.notna() & detail_urls.duplicated(keep=False)

l = l[~is_repeated]

In [None]:
# Number of list-page rows left after removing repeated URLs.
l.shape[0]

In [None]:
# Peek at the first three list-page rows.
l.iloc[:3]

## 詳細ページ

In [None]:
# Occurrence count of each detail-page URL.
d["url"].value_counts()

In [None]:
# Number of detail-page rows.
d.shape[0]

In [267]:
# d.head(1)

## 結合

In [268]:
# Distinct URLs on each side of the join, kept for set-difference checks.
temp1, temp2 = set(l["detail_url"]), set(d["url"])

In [269]:
# temp1 - temp2

In [270]:
# temp2 - temp1

In [271]:
# Inner-join list rows to detail rows on the detail-page URL, then discard
# the URL columns, which are only needed as join keys.
train = l.merge(d, left_on="detail_url", right_on="url", how="inner")
train = train.drop(columns=["url_x", "url_y", "detail_url"])

In [None]:
# List the columns of the merged table.
train.columns

In [None]:
# Show a single merged row as a sample of the raw values.
train.iloc[:1]

## 前処理

In [274]:
def preprocessing(df):
    """
    Convert the merged listing table into a feature table with a ``target`` column.

    The steps run in a fixed order: build the target, parse the raw string
    columns, one-hot encode the categorical ones (via ``convertDummies``),
    and finally drop the columns that are not used.

    Parameters
    ----------
    df : pd.DataFrame
        Merged list/detail data. The columns read here are ``yatin``,
        ``kanrihi``, ``madori``, ``menseki``, ``ekitoho``, ``kaidate``,
        ``kozo``, ``madori_detail``, ``syozaiti`` and ``tikunengetu``, plus
        the columns removed by ``drop_columns``.

    Returns
    -------
    pd.DataFrame
        A new data frame. The input frame is not modified.
    """
    def make_target(df):
        """Build ``target`` = yatin (number before "万" x 10000) + kanrihi (integer before "円")."""
        # A missing or "-" kanrihi is treated as zero.
        df["kanrihi"] = df["kanrihi"].fillna("0円").map(lambda d: "0円" if d=="-" else d)
        yatin = df["yatin"].map(lambda d: float(d.split("万")[0])*10000)
        kanrihi = df["kanrihi"].map(lambda d: int(d.split("円")[0]))
        df["target"] = yatin + kanrihi
        df = df.drop(["kanrihi", "yatin"], axis=1)
        return df

    def preprocess_madori(df):
        """Split ``madori`` into a leading digit (``madori_num``) and a one-hot encoded remainder."""
        # Values that do not start with a digit get 0 for both the number and the kind.
        df["madori_num"] = df["madori"].map(lambda d: int(d[0]) if d[0].isnumeric() else 0)
        df["madori_kind"] = df["madori"].map(lambda d: d[1:] if d[0].isnumeric() else 0)
        df = convertDummies(df, ["madori_kind"], drop=True)
        df = df.drop(["madori"], axis=1)
        return df

    def preprocess_menseki(df):
        """Parse ``menseki`` as a float after stripping its last character."""
        df["menseki"] = df["menseki"].map(lambda d: float(d[:-1]))
        return df

    def preprocess_ekitoho(df):
        """Split ``ekitoho`` into ``eki_sen``, ``eki`` (both one-hot encoded) and integer ``toho``."""
        # Expected layout: "<eki_sen>/<eki> <toho text>".
        df["eki_sen"] = df["ekitoho"].map(lambda d: d.split("/")[0])
        df["eki"] = df["ekitoho"].map(lambda d: d.split("/")[1].split(" ")[0])
        df["toho"] = df["ekitoho"].map(lambda d: d.split("/")[1].split(" ")[1])
        # Keep the text before "分", then take the integer after "歩" if present, otherwise after "バス".
        df["toho"] = df["toho"].map(lambda d: d.split("分")[0])
        df["toho"] = df["toho"].map(lambda d: int(d.split("歩")[1]) if "歩" in d else int(d.split("バス")[1]))
        df = convertDummies(df, ["eki_sen", "eki"], drop=True)

        df = df.drop(["ekitoho"], axis=1)
        return df

    def preprocess_kaidate(df):
        """Parse ``kaidate`` ("<a>階/<b>階") into integers ``kaidate_1`` and ``kaidate_2``."""
        # kaidate_1: text before "/" up to "階"; "0" when there is no "/".
        df["kaidate_1"] = df["kaidate"].map(lambda d: d.split("/")[0].split("階")[0] if "/" in d else "0")

        # For a range such as "1-2" keep the part before "-"; non-numeric values become 0.
        df["kaidate_1"] = df["kaidate_1"].map(lambda d: d.split("-")[0] if "-" in d else d)
        df["kaidate_1"] = df["kaidate_1"].map(lambda d: int(d) if d[0].isnumeric() else 0)

        # kaidate_2: text after "/" up to "階"; -1 when missing or non-numeric.
        df["kaidate_2"] = df["kaidate"].map(lambda d: d.split("/")[1].split("階")[0] if "/" in d else "-1")
        df["kaidate_2"] = df["kaidate_2"].map(lambda d: int(d) if d[0].isnumeric() else -1)

        df = df.drop(["kaidate"], axis=1)
        return df

    def preprocess_kozo(df):
        """Group four listed ``kozo`` values into "minority_kozo", then one-hot encode."""
        converter = {"その他": "minority_kozo", "プレコン": "minority_kozo", "鉄骨プレ": "minority_kozo", "ブロック": "minority_kozo"}
        df["kozo"] = df["kozo"].map(lambda d: converter.get(d, d))
        df = convertDummies(df, ["kozo"], drop=True)
        return df

    def madori_detail(df):
        """Drop ``madori_detail``."""
        # Parsing this column is laborious, so drop it for now.
        df = df.drop(["madori_detail"], axis=1)
        return df

    def preprocess_syozaiti(df):
        """Reduce ``syozaiti`` to the piece after the first "区", pool rare values, one-hot encode."""
        area = df["syozaiti"].map(lambda d: d.split("区")[1])
        # Count on the frame being processed rather than on a notebook-level global,
        # so the function also works for frames other than `train`.
        check = area.value_counts()
        # Values seen fewer than 10 times are pooled into one category.
        df["syozaiti"] = area.map(lambda d: "minority_syozaiti" if check[d] < 10 else d)
        df = convertDummies(df, ["syozaiti"], drop=True)
        return df

    def preprocess_tikunengetu(df):
        """Keep the first four characters of ``tikunengetu`` as an integer."""
        df["tikunengetu"] = df["tikunengetu"].map(lambda d: int(d[:4]))
        return df

    def drop_columns(df):
        """Remove the columns that are not used as features."""
        rm_cols = [
            "reikin", "shikikin", "keiyaku_kikan", "nyukyo", "others", "sonpo", "syohiyo", "syokihi",
            "title", "tyukai_tesuryo", "zyoken", "tyusyazyo"
        ]
        return df.drop(rm_cols, axis=1)

    # Work on a copy: make_target assigns columns in place, which would otherwise
    # leak `kanrihi`/`target` changes into the caller's frame.
    df = df.copy()

    df = make_target(df)
    df = preprocess_madori(df)
    df = preprocess_menseki(df)
    df = preprocess_ekitoho(df)
    df = preprocess_kaidate(df)
    df = preprocess_kozo(df)
    df = madori_detail(df)
    df = preprocess_syozaiti(df)
    df = preprocess_tikunengetu(df)
    df = drop_columns(df)

    return df

In [None]:
# Preview the distribution of the target produced by the preprocessing pipeline.
temp = preprocessing(train)
temp["target"].plot.hist()

In [275]:
# Build the feature table, then hold out 25% of the rows as a test set (fixed seed).
processed = preprocessing(train)
train_part, test_part = train_test_split(
    processed,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Separate the target column from the features on both sides of the split.
y, y_test = train_part["target"], test_part["target"]
X = train_part.drop(columns="target")
X_test = test_part.drop(columns="target")

In [276]:
# lightGBM
def lgbGridSearch(X, y, kf=KFold(n_splits=5), scoring="neg_mean_squared_error"):
    """
    Grid-search LightGBM regressor hyper-parameters with cross-validation.

    Parameters
    ----------
    X
        Training features passed to ``GridSearchCV.fit``.
    y
        Training targets passed to ``GridSearchCV.fit``.
    kf
        Cross-validation splitter handed to ``GridSearchCV`` as ``cv``.
    scoring : str
        Scoring name handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, best_params_, best_score_)``.
    """
    # n_estimators and max_depth are fixed; num_leaves and learning_rate are searched.
    param_grid = {
        "n_estimators": [5000],
        "max_depth": [8],
        "num_leaves": [15, 30, 50, 80, 120],
        "learning_rate": [0.01, 0.1],
    }

    estimator = lgb.LGBMRegressor(n_jobs=-1, random_state=42, metric="MSE")
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=kf,
        n_jobs=-1,
        verbose=1,
        scoring=scoring,
    )
    search.fit(X, y)

    best_params, best_score = search.best_params_, search.best_score_
    print("Best parameters: ", best_params)
    print("Best cross-validation score: ", best_score)

    print("---------------")
    return search, best_params, best_score

In [None]:
# First five rows of the training features.
X.iloc[:5]

In [None]:
# Run the grid search on the training split and keep the search object,
# the best parameter set and the best cross-validation score.
(
    lgbC_grid_search,
    lgbC_grid_search_best_params,
    lgbC_grid_search_best_score,
) = lgbGridSearch(X, y)

In [278]:
# best_estimator_ is the estimator selected by the grid search.
model = lgbC_grid_search.best_estimator_
# Predict on the held-out test features.
predict = model.predict(X_test)

In [279]:
# Pair each test prediction with its true value; the answer index is reset
# so both columns line up positionally.
lgb_predicts = pd.DataFrame({
    "predict": predict,
    "answer": y_test.reset_index(drop=True),
})

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


plt.scatter(
    lgb_predicts["answer"], 
    lgb_predicts["predict"], 
    alpha=0.5,
    label=("answer", 'predict'), 
    color='navy',
)

m = min(lgb_predicts["predict"])
M = max(lgb_predicts["predict"])
l = len(lgb_predicts)

plt.plot(
    np.linspace(m, M, l), 
    np.linspace(m, M, l), 
    color="orange",
    linewidth=4,
)

plt.title("predict:answer", {"fontsize": 15})
plt.xlabel("answer", {"fontsize": 15})
plt.ylabel("predict", {"fontsize": 15})
plt.tick_params(labelsize=15)
plt.legend(prop={"size": 15}, loc="best")


# ----- 赤で塗りつぶす場合
plt.fill_between(np.linspace(m-5000, M+5000, l), np.linspace(m-5000, M+5000, l), facecolor='red', alpha=0.125)
plt.xlim((m-5000, M+5000))
plt.ylim((m-5000, M+5000))
# -----

# plt.show()

plt.grid()

In [183]:
# Display the prediction/answer table (long frames are truncated in the rendered output).
lgb_predicts

Unnamed: 0,predict,answer
0,116892.579127,100000.0
1,109469.831542,108500.0
2,65470.844381,70000.0
3,78293.687364,79500.0
4,113147.215044,112000.0
...,...,...
1293,81545.370270,83000.0
1294,113412.981175,114000.0
1295,67112.400045,62000.0
1296,58733.357067,56000.0


In [None]:
# Mean absolute error of the grid-searched model on the test split.
mean_absolute_error(
    y_true=lgb_predicts["answer"],
    y_pred=lgb_predicts["predict"],
)

In [222]:
# lgb_predicts.to_csv(root+"output/predicts_lgb.csv")

---

In [295]:
# LightGBM regressor with a fixed hyper-parameter set.
model = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=8,
    num_leaves=30,
    metric="MSE",
    random_state=42,
    n_jobs=-1,
)

In [296]:
def predict_cv(model, train_x, train_y, test_x, n_splits=5, random_state=42):
    """
    Produce out-of-fold predictions for the training data and fold-averaged
    predictions for the test data.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit once
        per fold, so on return it holds the fit from the last fold.
    train_x : pd.DataFrame
        Training features (indexed positionally with ``.iloc``).
    train_y : pd.Series
        Training targets (indexed positionally with ``.iloc``).
    test_x
        Test features, passed unchanged to ``model.predict``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    pred_train : np.ndarray
        Out-of-fold predictions, ordered like the rows of ``train_x``.
    preds_test : np.ndarray
        Mean of the per-fold predictions on ``test_x``.
    """
    preds = []
    preds_test = []
    va_idxes = []

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Fit on each training fold; keep the validation predictions together with
    # their row positions, and one set of test predictions per fold.
    for tr_idx, va_idx in kf.split(train_x):
        tr_x = np.array(train_x.iloc[tr_idx])
        va_x = np.array(train_x.iloc[va_idx])
        tr_y = np.array(train_y.iloc[tr_idx])

        model.fit(tr_x, tr_y)
        preds.append(model.predict(va_x))
        preds_test.append(model.predict(test_x))
        va_idxes.append(va_idx)

    # Concatenate the validation predictions, then restore the original row order.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]

    # Average the test predictions across folds.
    preds_test = np.mean(preds_test, axis=0)

    return pred_train, preds_test

In [297]:
# Out-of-fold predictions for the training split and fold-averaged predictions for the test split.
pred_train, pred_test = predict_cv(
    model=model,
    train_x=X,
    train_y=y,
    test_x=X_test,
)

In [298]:
# Pair predictions with true values for both splits; the answer index is
# reset so the two columns line up positionally.
lgb_predicts_train = pd.DataFrame({
    "predict": pred_train,
    "answer": y.reset_index(drop=True),
})

lgb_predicts_test = pd.DataFrame({
    "predict": pred_test,
    "answer": y_test.reset_index(drop=True),
})

In [300]:
# Write both prediction tables to the output directory without the index column.
for _frame, _path in (
    (lgb_predicts_train, root+"output/lgb_predicts_train.csv"),
    (lgb_predicts_test, root+"output/lgb_predicts_test.csv"),
):
    _frame.to_csv(_path, index=False)

In [None]:
# Display the fold-averaged test predictions next to the true values.
lgb_predicts_test

In [305]:
# Mean absolute error of the fold-averaged predictions on the test split.
mean_absolute_error(
    y_true=lgb_predicts_test["answer"],
    y_pred=lgb_predicts_test["predict"],
)

3614.8358227548983

---

### nlpで補正した予測値のスコア計算

In [311]:
# Independent copy of the test predictions, used when scoring the corrected values.
lgb_result = lgb_predicts_test.copy(deep=True)

In [312]:
# Load the NLP model's predictions and add the LightGBM prediction to each
# of them (aligned on the row index).
nlp_result = pd.read_csv(root+"output/nlp_predicts_house_price_othres.csv")
nlp_result["predict"] = nlp_result["predict"].add(lgb_result["predict"])

In [None]:
# Mean absolute error of the NLP-corrected predictions against the true test values.
mean_absolute_error(
    y_true=lgb_result["answer"],
    y_pred=nlp_result["predict"],
)