In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import requests
import json
import re

In [8]:
data_from_csv = pd.read_csv("13_Tokyo_20171_20184.csv", encoding='cp932')
data_used_apartment = data_from_csv.query('種類 == "中古マンション等"')

columns_name_list = ["最寄駅：距離（分）", "間取り", "面積（㎡）", "建築年", "建物の構造", "建ぺい率（％）", "容積率（％）", "市区町村名", "取引価格（総額）"]

data_selected = data_used_apartment[columns_name_list]
data_selected_dropna = data_selected.dropna(how='any')  # 1つでもNaNを含む行を削除

data_selected_dropna = data_selected_dropna[data_selected_dropna["建築年"].str.match('^平成|昭和')]

wareki_to_seireki = {'昭和': 1926 - 1, '平成': 1989 - 1}

building_year_list = data_selected_dropna["建築年"]

building_age_list = []
for building_year in building_year_list:
    # 昭和○年 → 昭和, ○ に変換、平成○年 → 平成, ○ に変換
    building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)
    # 西暦に変換
    seireki = wareki_to_seireki[building_year_split.groups()[0]] + int(building_year_split.groups()[1])
    
    building_age = 2019 - seireki  # 築年数に変換
    building_age_list.append(building_age)
    
data_selected_dropna["築年数"] = building_age_list # 新しく、築年数列を追加

# もう使わないので、建築年列は削除
data_added_building_age = data_selected_dropna.drop("建築年", axis=1)

In [9]:
# ダミー変数化しないもののリスト
columns_name_list = ["最寄駅：距離（分）", "面積（㎡）", "築年数", "建ぺい率（％）", "容積率（％）", "取引価格（総額）"]

# ダミー変数リスト
dummy_list = ["間取り", "建物の構造", "市区町村名"]

# ダミー変数を追加
data_added_dummies = pd.concat([data_added_building_age[columns_name_list], pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)], axis=1)

# 文字列を数値化
data_added_dummies["面積（㎡）"] = data_added_dummies["面積（㎡）"].astype(float)
data_added_dummies = data_added_dummies[~data_added_dummies['最寄駅：距離（分）'].str.contains('\?')]
data_added_dummies["最寄駅：距離（分）"] = data_added_dummies["最寄駅：距離（分）"].astype(float)

# 6000万円以下のデータのみ抽出
data_added_dummies = data_added_dummies[data_added_dummies["取引価格（総額）"] < 60000000]

In [11]:
x = data_added_dummies.drop("取引価格（総額）", axis=1)
y = data_added_dummies["取引価格（総額）"]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [12]:
lr_multi = LinearRegression()

lr_multi.fit(X_train, y_train)
print(lr_multi.coef_)
print(lr_multi.intercept_)

[-3.08890021e+05  3.63664295e+05 -4.50523212e+05 -3.61024675e+04
  4.96570532e+03  4.82843203e+06  4.74587519e+06  1.82982647e+06
  1.40164556e+06  1.36758899e-07  8.89298270e+06  1.09187674e+07
  1.41544026e+06  7.61122609e+06  8.45903123e+06  6.58954354e+06
  5.58793545e-09  1.08028672e+07  7.91000489e+06  6.70032410e+06
  6.13508746e-08  6.09063084e+06  8.97867348e+06 -5.21540642e-08
  9.81432410e+06  5.18560277e+06  1.11928359e+06  8.39339684e+06
  7.66901074e+06 -7.53513850e+06  2.30629659e+06  7.07897283e+06
  6.98926088e+06  2.30465648e+07  7.02800562e+06  3.53143961e+06
  7.81268610e+06  7.14900804e+06  9.81799179e+06  1.01935001e+07
  8.16702059e+06 -1.04358453e+07  4.02429901e+06  1.05679344e+07
  5.23601929e+06  9.18170309e+06 -2.79556572e+04 -1.04999190e+06
  4.73523003e+06 -7.83505425e+06  5.53750709e+06 -6.94800294e+06
  2.99141014e+06 -2.80713749e+06  9.38521783e+06  9.87504053e+06
 -7.20383318e+06 -1.23667505e+07  7.92950779e+06 -6.11645982e+06
 -1.19865596e+07 -1.04631

In [13]:
y_pred_lr = lr_multi.predict(X_test)

# 残差
print(y_pred_lr - y_test)

# MAE
print(mean_absolute_error(y_pred_lr, y_test))

34465    7.449126e+06
10303    7.864877e+05
2091    -1.673773e+07
31095   -1.061075e+06
13357    1.088434e+07
             ...     
32570   -2.078108e+06
824     -5.012588e+06
57869   -5.397972e+06
12332   -8.463118e+05
40798   -7.170558e+06
Name: 取引価格（総額）, Length: 7584, dtype: float64
4582672.092457856


In [30]:
lasso = Lasso(alpha=1, normalize=True)
lasso.fit(X_train, y_train)

# print(lasso.coef_)
# print(lasso.intercept_)

  positive)


Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [31]:
y_pred_lasso = lasso.predict(X_test)

print(mean_absolute_error(y_pred_lasso, y_test))

4583006.341144557
