In [276]:
import pandas as pd
import numpy as np

# Notebook-wide configuration: show up to 100 columns when rendering frames,
# and resolve all input/output paths relative to the parent directory.
pd.options.display.max_columns = 100

root = "../"

In [277]:
# Load the list-page and detail-page CSVs, dropping fully identical rows from each.
l = pd.read_csv(root + "input/list_page_data.csv").drop_duplicates()
d = pd.read_csv(root + "input/detail_page_data.csv").drop_duplicates()

## リストページ

In [None]:
# How often does each detail URL appear on the list page?
l["detail_url"].value_counts()

In [279]:
# Remove every list-page row whose detail_url occurs more than once (all
# occurrences are dropped, not just the extras), so each remaining URL
# identifies exactly one row.
# The per-row occurrence count is computed in one vectorised pass instead of
# one full-column comparison per duplicated key. Rows with a missing
# detail_url map to NaN, compare False against "> 1", and are therefore kept,
# matching value_counts(), which ignores NaN.
url_occurrences = l["detail_url"].map(l["detail_url"].value_counts())
l = l[~(url_occurrences > 1)]

In [None]:
# Number of list-page rows left after removing ambiguous URLs.
l.shape[0]

In [None]:
# Preview the first five list-page rows.
l.head(n=5)

## 詳細ページ

In [None]:
# How often does each URL appear in the detail-page data?
d["url"].value_counts()

In [None]:
# Number of detail-page rows.
d.shape[0]

In [None]:
# Preview a single detail-page row.
d.head(n=1)

## 結合

In [285]:
# Distinct join keys on each side, for checking which URLs would not match.
temp1, temp2 = set(l["detail_url"]), set(d["url"])

In [286]:
# temp1 - temp2

In [287]:
# temp2 - temp1

In [288]:
# Inner-join list rows to detail rows on the detail-page URL, then discard
# the URL columns, which are no longer needed once the join is done.
train = (
    l.merge(d, left_on="detail_url", right_on="url", how="inner")
     .drop(columns=["url_x", "url_y", "detail_url"])
)

In [None]:
# Inspect the column names of the merged frame.
train.columns

In [None]:
# Preview a single merged row.
train.head(n=1)

## 自然言語処理用

In [294]:
def make_target(df):
    """Add a numeric ``target`` column: rent plus management fee.

    ``kanrihi`` (management fee) is expected to hold strings with a single
    trailing unit character, e.g. "5000円"; missing values and "-" are
    normalised to "0円". ``yatin`` (rent) is expected to hold strings with a
    two-character trailing unit (presumably "万円"); the numeric part is
    multiplied by 10000.

    Args:
        df: DataFrame with string columns "kanrihi" and "yatin".

    Returns:
        The same ``df`` object, modified in place: "kanrihi" is normalised and
        "target" (= fee + rent * 10000) is added.

    Raises:
        ValueError: if a fee or rent string does not parse as a number once
            its unit suffix is stripped.
    """
    # Work on `df` itself. The previous version read the notebook-global
    # `train` here, which threw away the fillna() result and made the function
    # unusable on any other frame.
    df["kanrihi"] = (
        df["kanrihi"]
        .fillna("0円")
        .map(lambda fee: "0円" if fee == "-" else fee)
    )

    kanrihi = df["kanrihi"].map(lambda fee: int(fee[:-1]))          # strip 1-char unit
    yatin = df["yatin"].map(lambda rent: float(rent[:-2]) * 10000)  # strip 2-char unit

    df["target"] = kanrihi + yatin
    return df

In [304]:
def make_others(df):
    """Append derived text tokens to each row's ``others`` string.

    Eleven string tokens are derived from the listing columns and appended, in
    a fixed order and separated by "、", to a copy of the row's "others" text.
    The result is stored in a new "others_add_features" column; the "others"
    column itself is not modified.

    Args:
        df: DataFrame with string columns "others", "madori", "madori_detail",
            "kozo", "kaidate", "menseki", "ekitoho", "syozaiti", "tikunengetu".

    Returns:
        The same ``df`` object with "others_add_features" added in place.
    """
    combined = np.array(df["others"])

    kaidate = df["kaidate"]
    ekitoho = df["ekitoho"]

    # The order of this list is the order in which tokens are appended.
    token_columns = [
        df["madori"],
        df["madori_detail"].str.replace(" ", ""),
        df["kozo"],
        # kaidate looks like "x階/y階建": emit "x階" and "y階建" separately.
        # Without a "/", the floor token is "<S>" and the whole value is
        # used as the building token.
        kaidate.map(lambda s: s.split("/")[0] if "/" in s else "<S>"),
        kaidate.map(lambda s: s.split("/")[1] if "/" in s else s),
        # menseki looks like "30.21m": keep the integer part -> "30m".
        df["menseki"].map(lambda s: s.split(".")[0] + "m" if "." in s else s),
        # ekitoho looks like "xx線/yy駅 歩zz分": emit line, station and walk time.
        ekitoho.map(lambda s: s.split("/")[0]),
        ekitoho.map(lambda s: s.split("/")[1].split(" ")[0]),
        ekitoho.map(lambda s: s.split("/")[1].split(" ")[1]),
        # syozaiti looks like "xx県yy市zz区aa": keep the part after "区".
        df["syozaiti"].map(lambda s: s.split("区")[1]),
        # tikunengetu looks like "xxxx年yy月": keep the year only -> "xxxx年".
        df["tikunengetu"].map(lambda s: s.split("年")[0] + "年"),
    ]
    token_arrays = [np.array(column) for column in token_columns]

    # Walk the rows once; each row's tokens arrive already in append order.
    for row, tokens in enumerate(zip(*token_arrays)):
        for token in tokens:
            combined[row] = combined[row] + "、" + token

    df["others_add_features"] = combined
    return df

In [305]:
# "target" is selected from `train` further down, but nothing else in the
# notebook calls make_target(); without this the notebook only works when the
# column is left over from an earlier kernel session.
if "target" not in train.columns:
    train = make_target(train)
train = make_others(train)

In [None]:
# Keep only the columns needed for the NLP experiments. The explicit copy
# makes `data` an independent frame, so adding columns to it later neither
# triggers SettingWithCopyWarning nor depends on whether pandas returned a
# view of `train`.
data = train[["others", "target", "others_add_features"]].copy()
data.head(2)

In [None]:
# Tokenise the combined text on the "、" separator, one list per row.
data["others_add_features_list"] = [
    text.split("、") for text in data["others_add_features"]
]

In [None]:
# Preview the first three rows including the new token-list column.
data.head(n=3)

In [311]:
def print_others(df, n=5):
    """Print the three text representations of ``n`` randomly chosen rows.

    For each sampled row, prints "others", "others_add_features" and
    "others_add_features_list", separated by " ----- " lines and followed by
    a blank line. Rows are sampled with replacement.

    Args:
        df: DataFrame containing the three columns named above.
        n: Number of rows to print.
    """
    if len(df) == 0:
        # Nothing to sample from.
        return

    for _ in range(n):
        # Sample a row *position* and read it with .iloc. The previous
        # label-based lookup (df["others"][r]) raised KeyError whenever the
        # index was not exactly 0..len(df)-1, e.g. after filtering rows.
        pos = np.random.randint(0, len(df))
        print(" ----- ")
        print(df["others"].iloc[pos])
        print(" ----- ")
        print(df["others_add_features"].iloc[pos])
        print(" ----- ")
        print(df["others_add_features_list"].iloc[pos])
        print(" ----- ")
        print()


In [None]:
# Print 10 randomly chosen rows for a visual check of the generated text.
print_others(data, n=10)

In [314]:
# Persist the NLP dataset as CSV; the frame's index is written as the first column.
data.to_csv(path_or_buf=root + "output/house_price.csv")

In [None]:
def print_data_info(df, row=100):
    """Print the row count, display the head, and print one row's "others" text.

    Args:
        df: DataFrame containing an "others" column.
        row: Index label of the example row to print (default 100).

    Raises:
        KeyError: if ``row`` is not a label in ``df``'s index.
    """
    print("データ数: ", len(df))
    display(df.head())  # `display` is provided by the IPython/Jupyter session
    print("others: ")
    # Read from the frame that was passed in; the previous version read the
    # notebook-global `data` and silently ignored `df` here.
    print(df.loc[row, "others"])

print_data_info(data[["others", "target"]])


In [None]:
def print_data_info(df, row=100):
    """Print the row count, display the head, and print one row's text columns.

    Prints both the "others" text and the "others_add_features" text of the
    chosen row.

    Args:
        df: DataFrame containing "others" and "others_add_features" columns.
        row: Index label of the example row to print (default 100).

    Raises:
        KeyError: if ``row`` is not a label in ``df``'s index.
    """
    print("データ数: ", len(df))
    display(df.head())  # `display` is provided by the IPython/Jupyter session
    print("others: ")
    # Read from the frame that was passed in; the previous version read the
    # notebook-global `data` and silently ignored `df` here.
    print(df.loc[row, "others"])
    print()
    print("others add features: ")
    print(df.loc[row, "others_add_features"])


print_data_info(data[["others", "target", "others_add_features"]])
