In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [3]:
input_dir = '/content'
property_data = "v3.csv"

output_data = "output_data.csv"

# データを読み込む
df = pd.read_csv(os.path.join(input_dir, property_data), encoding="utf-8")

df.head()

Unnamed: 0,nearest station,nearest station distance_minutes,間取り,area_square,建物の構造,architectural year,total_price
0,飯田橋,3,１ＬＤＫ,60,ＲＣ,2,130000000
1,飯田橋,3,２ＬＤＫ,60,ＲＣ,2,150000000
2,飯田橋,3,１Ｋ,15,ＳＲＣ,38,13000000
3,飯田橋,2,１ＬＤＫ,45,ＳＲＣ,41,44000000
4,飯田橋,0,３ＬＤＫ,55,ＳＲＣ,39,70000000


In [4]:
# 欠損値除去
df = df.dropna(subset=["nearest station","nearest station distance_minutes","間取り","area_square",'建物の構造','architectural year','total_price'])

df = df.replace({'nearest station distance_minutes': {"1H?1H30": 60}})
df = df.replace({'nearest station distance_minutes': {"1H30?2H": 90}})
df = df.replace({'nearest station distance_minutes': {"2H?": 120}})
df = df.replace({'nearest station distance_minutes': {"30分?60分": 30}})

In [5]:
serial_num = pd.RangeIndex(start=1, stop=len(df.index) + 1, step=1)
df["ID"] = serial_num
df.to_csv(os.path.join(input_dir, output_data))
df.head()

Unnamed: 0,nearest station,nearest station distance_minutes,間取り,area_square,建物の構造,architectural year,total_price,ID
0,飯田橋,3,１ＬＤＫ,60,ＲＣ,2,130000000,1
1,飯田橋,3,２ＬＤＫ,60,ＲＣ,2,150000000,2
2,飯田橋,3,１Ｋ,15,ＳＲＣ,38,13000000,3
3,飯田橋,2,１ＬＤＫ,45,ＳＲＣ,41,44000000,4
4,飯田橋,0,３ＬＤＫ,55,ＳＲＣ,39,70000000,5


In [6]:
t = df["total_price"]

# 学習に必要なカラムを抽出
x = df.iloc[:,[0,1,2,3,4,5]]

x.head()

Unnamed: 0,nearest station,nearest station distance_minutes,間取り,area_square,建物の構造,architectural year
0,飯田橋,3,１ＬＤＫ,60,ＲＣ,2
1,飯田橋,3,２ＬＤＫ,60,ＲＣ,2
2,飯田橋,3,１Ｋ,15,ＳＲＣ,38
3,飯田橋,2,１ＬＤＫ,45,ＳＲＣ,41
4,飯田橋,0,３ＬＤＫ,55,ＳＲＣ,39


In [7]:
x = pd.get_dummies(x)
y = df['total_price']
x.head()

Unnamed: 0,area_square,architectural year,nearest station_お台場海浜公園,nearest station_お花茶屋,nearest station_すずかけ台,nearest station_つくし野,nearest station_つつじケ丘,nearest station_とうきょうスカイツリー,nearest station_ときわ台(東京),nearest station_ひばりケ丘(東京),...,間取り_４ＤＫ,間取り_４ＬＤＫ,間取り_５ＬＤＫ,間取り_６ＤＫ,間取り_６ＬＤＫ,建物の構造_木造,建物の構造_軽量鉄骨造,建物の構造_鉄骨造,建物の構造_ＲＣ,建物の構造_ＳＲＣ
0,60,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,60,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,15,38,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,45,41,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,55,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [9]:
pca = PCA(n_components=0.95)
x_pca = pca.fit_transform(x_scaled)

In [10]:
# Random Forest

x_train, x_val, y_train, y_val = train_test_split(x_scaled, y, test_size=0.3, random_state=0)

# ランダムフォレストモデルを作成する
rf_model = RandomForestRegressor(n_estimators=100, random_state=0)

# モデルを訓練する
rf_model.fit(x_train, y_train)

# トレーニングセットでのパフォーマンス
train_r2_score = rf_model.score(x_train, y_train)

In [11]:
# テストセットのスコアを計算する
y_val_pred = rf_model.predict(x_val)
test_r2_score = r2_score(y_val, y_val_pred)

# トレーニングセットとテストセットのスコアを表示する
print("Training set R^2 score:", train_r2_score)
print("Test set R^2 score:", test_r2_score)

Training set R^2 score: 0.9102706886933165
Test set R^2 score: 0.7104323633882877


In [12]:
# 価格推論
pred = rf_model.predict(x)

# 列名付与
y = pd.DataFrame(pred, columns=["predict"])

# 読み込んだ CSV の右端に推論結果を追加
results = pd.concat([df, y], axis=1)

# CSV 出力
output_dir = '/content/predict.csv'
results.to_csv(output_dir, index=False)

