In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import requests
import json
import re

### APIでデータ読み込み

In [0]:
# Query the trade-list API; the query string selects from=20171, to=20185, area=13.
url_path = "https://www.land.mlit.go.jp/webland/api/TradeListSearch?from=20171&to=20185&area=13"
# A timeout keeps the cell from hanging indefinitely when the server does not answer.
request_result = requests.get(url_path, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
request_result.raise_for_status()
# The records live under the "data" key of the response body.
data_json = request_result.json()["data"]

In [0]:
# Number of records returned by the API (shown as the cell output).
len(data_json)

In [0]:
# Inspect the first record; a bare expression lets IPython pretty-print it.
data_json[0]

In [0]:
# Inspect another record further into the list for comparison with the first one.
data_json[1000]

In [0]:
# Flatten the list of JSON records into a DataFrame.
# pd.json_normalize is the public entry point (pandas >= 1.0); the older
# pd.io.json.json_normalize path is deprecated.
data_pd = pd.json_normalize(data_json)
print(data_pd.shape)

In [0]:
# Preview the first 10 rows; a bare expression renders the frame as a table.
data_pd.head(10)

In [0]:
# Count of missing values per column of the API data.
data_pd.isnull().sum()

### CSVファイルからデータ読み込み

In [0]:
# Load the transaction CSV; the file is decoded as cp932.
data_from_csv = pd.read_csv(
    "13_Tokyo_20171_20184.csv",
    encoding='cp932',
)

In [0]:
# (rows, columns) of the CSV data, shown as the cell output.
data_from_csv.shape

In [0]:
# Inspect every field of the first CSV row.
data_from_csv.iloc[0]

In [0]:
# Preview the first 10 CSV rows; a bare expression renders the frame as a table.
data_from_csv.head(10)

### データ整形

In [0]:
# Distinct values of the 種類 column, used to pick the category filtered on below.
data_from_csv["種類"].unique()

In [0]:
# Keep only the rows whose 種類 equals "中古マンション等".
data_used_apartment = data_from_csv.query('種類 == "中古マンション等"')

# Show the shape, the first rows and the first record of the filtered data.
for summary in (
    data_used_apartment.shape,
    data_used_apartment.head(),
    data_used_apartment.iloc[0],
):
    print(summary)

In [0]:
# Count of missing values per column of the filtered data.
data_used_apartment.isnull().sum()

In [0]:
# Columns kept for the analysis.
columns_name_list = [
    "最寄駅：距離（分）",
    "間取り",
    "面積（㎡）",
    "建築年",
    "建物の構造",
    "建ぺい率（％）",
    "容積率（％）",
    "市区町村名",
    "取引価格（総額）",
]

data_selected = data_used_apartment.loc[:, columns_name_list]
print(data_selected.shape)

# Drop every row that has a missing value in any of the selected columns.
data_selected_dropna = data_selected.dropna(axis=0, how='any')
print(data_selected_dropna.shape)
print(data_selected_dropna.iloc[0])

In [0]:
# Distinct construction-year strings, to see which formats must be parsed.
pd.unique(data_selected_dropna["建築年"])

In [0]:
# Keep only rows whose construction year starts with 平成 or 昭和; other values
# cannot be converted by the era table below.
# .copy() makes the filtered frame independent, so adding a column further down
# does not raise SettingWithCopyWarning.
data_selected_dropna = data_selected_dropna[
    data_selected_dropna["建築年"].str.match('^平成|昭和')
].copy()

# Offset to add to an era year to obtain the Gregorian year
# (era year 1 corresponds to 1926 for 昭和 and 1989 for 平成).
wareki_to_seireki = {'昭和': 1926-1, '平成': 1989-1}

building_year_list = data_selected_dropna["建築年"]

building_age_list = []
for building_year in building_year_list:
    # Split e.g. "昭和55年" into the era name and the era year.
    building_year_split = re.search(r'(.+?)([0-9]+|元)年', building_year)
    era_name, era_year_text = building_year_split.groups()
    # "元" denotes the first year of an era; int() cannot parse it, so map it to 1.
    era_year = 1 if era_year_text == '元' else int(era_year_text)
    # Convert to the Gregorian year.
    seireki = wareki_to_seireki[era_name] + era_year

    # Building age relative to 2018.
    building_age = 2018 - seireki
    building_age_list.append(building_age)

# Add the building age as a new column.
data_selected_dropna["築年数"] = building_age_list

# The construction-year column is no longer needed, so drop it.
data_added_building_age = data_selected_dropna.drop("建築年", axis=1)
data_added_building_age.head()

In [0]:
# Columns passed through unchanged (not dummy-encoded).
columns_name_list = [
    "最寄駅：距離（分）",
    "面積（㎡）",
    "築年数",
    "建ぺい率（％）",
    "容積率（％）",
    "取引価格（総額）",
]

# Columns to dummy-encode.
dummy_list = ["間取り", "建物の構造", "市区町村名"]

# Encode the categorical columns and append them to the right of the
# pass-through columns.
passthrough_part = data_added_building_age[columns_name_list]
dummy_part = pd.get_dummies(data_added_building_age[dummy_list], drop_first=True)
data_added_dummies = pd.concat([passthrough_part, dummy_part], axis=1)

print(data_added_dummies.shape)
print(data_added_dummies.iloc[0])

In [0]:
# Column dtypes before the numeric conversion in the next cell.
data_added_dummies.dtypes

In [0]:
# Convert the area column to float.
data_added_dummies["面積（㎡）"] = data_added_dummies["面積（㎡）"].astype(float)

# Rows whose station-distance value contains a literal '?' cannot be converted
# to float, so drop them.
# - astype(str) keeps the cell re-runnable: after the first run the column is
#   already float and the .str accessor would otherwise raise.
# - regex=False matches the '?' literally (the original '\?' was an invalid
#   escape sequence in a non-raw string).
# - .copy() avoids SettingWithCopyWarning on the assignment below.
data_added_dummies = data_added_dummies[
    ~data_added_dummies['最寄駅：距離（分）'].astype(str).str.contains('?', regex=False)
].copy()
data_added_dummies["最寄駅：距離（分）"] = data_added_dummies["最寄駅：距離（分）"].astype(float)

In [0]:
# Column dtypes after the numeric conversion, to confirm the result.
data_added_dummies.dtypes

## 可視化

In [0]:
price_column = "取引価格（総額）"

# Histogram of the total transaction price over all rows.
plt.hist(data_added_dummies[price_column])
plt.show()

# Same histogram restricted to prices below 60,000,000.
tmp_data = data_added_dummies.loc[data_added_dummies[price_column] < 60_000_000]
print(tmp_data.shape)
plt.hist(tmp_data[price_column])
plt.show()

In [0]:
# Keep only the rows whose total price is below 60,000,000 for the modelling below.
data_added_dummies = data_added_dummies.loc[
    data_added_dummies["取引価格（総額）"].lt(60_000_000)
]

## 線形回帰を実践

In [0]:
# Simple regression: explain the total price by the floor area only.
x_column_list = ['面積（㎡）']
y_column_list = ['取引価格（総額）']

x = data_added_dummies.loc[:, x_column_list]
y = data_added_dummies.loc[:, y_column_list]

lr = LinearRegression()
lr.fit(X=x, y=y)

In [0]:
# Fitted slope and intercept of the single-feature model, one per line.
print(lr.coef_, lr.intercept_, sep="\n")

### 重回帰分析

In [0]:
# Multiple regression: every column except the price is a feature.
target_column = "取引価格（総額）"
x = data_added_dummies.drop(columns=target_column)
y = data_added_dummies[target_column]

print(x.head(), y.head(), sep="\n")

In [0]:
# Fit on all features; fit() returns the estimator itself.
lr_multi = LinearRegression().fit(x, y)

print(lr_multi.coef_, lr_multi.intercept_, sep="\n")

In [0]:
# Pair each feature name with its fitted coefficient.
for column_name, coefficient in zip(x.columns, lr_multi.coef_):
    print(column_name, coefficient)

## 予測

In [0]:
x = data_added_dummies.drop("取引價格（総額）".replace("價", "価"), axis=1)
y = data_added_dummies["取引価格（総額）"]

# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [0]:
# Fit the multi-feature model on the training split only; fit() returns the estimator.
lr_multi2 = LinearRegression().fit(X_train, y_train)

print(lr_multi2.coef_, lr_multi2.intercept_, sep="\n")

In [0]:
# Predict prices for the held-out rows.
y_pred = lr_multi2.predict(X=X_test)
print(y_pred)

In [0]:
# Prediction error per held-out row (predicted minus actual price).
y_pred - y_test

In [0]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

## MAE

In [0]:
from sklearn.metrics import mean_absolute_error

# Single-feature model: total price explained by floor area only.
x_column_list = ['面積（㎡）']
y_column_list = ['取引価格（総額）']

# random_state pins the shuffle so the reported MAE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_added_dummies[x_column_list],
    data_added_dummies[y_column_list],
    test_size=0.3,
    random_state=0,
)

lr_single = LinearRegression()

lr_single.fit(X_train, y_train)
y_pred = lr_single.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order.
print(mean_absolute_error(y_test, y_pred))

In [0]:
# Multi-feature model: every column except the price is a feature.
x = data_added_dummies.drop("取引価格（総額）", axis=1)
y = data_added_dummies["取引価格（総額）"]

# Use the same 30% hold-out as the other splits in this notebook so the MAE is
# comparable with the single-feature model (the original used test_size=0.7 here).
# random_state pins the shuffle so the reported MAE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

lr_multi2 = LinearRegression()

lr_multi2.fit(X_train, y_train)
y_pred = lr_multi2.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order.
print(mean_absolute_error(y_test, y_pred))