## 決定木デフォルト

In [1]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.metrics import  mean_squared_error
from sklearn.model_selection import train_test_split

import graphviz

In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
train_pkl.shape

(55369, 13)

In [4]:
train_pkl.describe()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,host_response_rate,number_of_reviews,y,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,room_type_Private room,room_type_Shared room
count,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0,55369.0
mean,27785.124745,3.156297,1.194766,1.265654,-0.260742,0.204013,160.26383,0.25722,0.437302,0.001355,0.000235,0.41395,0.029114
std,16043.410949,2.154443,0.547776,0.850044,0.439043,0.752392,168.239385,0.437105,0.496058,0.03678,0.015321,0.492544,0.168127
min,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13889.0,2.0,1.0,1.0,-1.0,0.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27778.0,2.0,1.0,1.0,0.0,0.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,41679.0,4.0,1.0,1.0,0.0,0.0,185.0,1.0,1.0,0.0,0.0,1.0,0.0
max,55582.0,16.0,8.0,10.0,0.0,15.0,1999.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
train_pkl.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,host_response_rate,number_of_reviews,y,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,room_type_Private room,room_type_Shared room
0,0,6,2,1,-1,0,138.0,0,0,0,0,1,0
1,1,2,1,1,0,0,42.0,0,1,0,0,1,0
2,2,2,2,1,0,0,65.0,0,1,0,0,1,0
3,3,2,1,1,0,0,166.0,0,1,0,0,1,0
4,4,2,1,1,0,0,165.0,0,1,0,0,0,0


## 訓練データとテストデータに分割

In [6]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [7]:
# ターゲットと特徴量の分割
train_X_tmp = train.copy()
train_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
train_X = train_X_tmp.iloc[:].values
train_y = train.y.values

In [8]:
train_X.shape, train_y.shape, train_X_tmp.shape

((44295, 11), (44295,), (44295, 11))

## 訓練データで訓練

In [9]:
DT = tree.DecisionTreeRegressor(max_depth=5, random_state=42)

In [10]:
DT = DT.fit(train_X, train_y)

In [11]:
train.y.name

'y'

In [12]:
DT.feature_importances_

array([0.02860961, 0.32223458, 0.50957536, 0.02636854, 0.        ,
       0.0028995 , 0.        , 0.        , 0.00650685, 0.08331213,
       0.02049343])

In [13]:
# train_X.columns

In [14]:
# 特徴量の重要度が高い順に表示
print("特徴量の重要度が高い順：")
# sorted：reverse=True 降順
print(sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, [2, 3, 4, 5, 7, 8, 9, 10, 11]].columns),
    reverse=True))

特徴量の重要度が高い順：
[(0.51, 'host_response_rate'), (0.322, 'bedrooms'), (0.029, 'bathrooms'), (0.026, 'number_of_reviews'), (0.007, 'room_type_Private room'), (0.003, 'cancellation_policy_strict'), (0.0, 'cancellation_policy_super_strict_60'), (0.0, 'cancellation_policy_super_strict_30'), (0.0, 'cancellation_policy_moderate')]


In [15]:
# len(train.iloc[:, 1:].columns), train.iloc[:, 1:].columns

In [16]:
# 訓練済みの決定木を視覚化
# dot_data = tree.export_graphviz(DT, out_file=None,
#                                feature_names=train.iloc[:, 1:-1].columns,
#                                class_names=train.y.name,
#                                rounded=True,
#                                filled=True,
#                                special_characters=True)

In [17]:
# graph = graphviz.Source(dot_data)
# graph

In [18]:
# graph.write('.\DT.png')

In [19]:
type(train_X)

numpy.ndarray

In [21]:
sorted(
    zip(map(lambda x: round(x, 3), DT.feature_importances_), train.iloc[:, [2, 3, 4, 5, 7, 8, 9, 10, 11]].columns),
    reverse=True)

[(0.51, 'host_response_rate'),
 (0.322, 'bedrooms'),
 (0.029, 'bathrooms'),
 (0.026, 'number_of_reviews'),
 (0.007, 'room_type_Private room'),
 (0.003, 'cancellation_policy_strict'),
 (0.0, 'cancellation_policy_super_strict_60'),
 (0.0, 'cancellation_policy_super_strict_30'),
 (0.0, 'cancellation_policy_moderate')]

## テストデータで実行

In [39]:
# ターゲットと特徴量の分割
test_X_tmp = test.copy()
test_X_tmp.drop(['y'], axis=1, inplace=True)
test_x = test_X_tmp.iloc[2:].values
test_y = test.y.values

In [36]:
test_x.shape, test_y.shape

((11074, 11), (11074,))

In [37]:
pred_y = DT.predict(test_x)

In [38]:
pred_y.shape

(11074,)

In [26]:
np.sqrt(mean_squared_error(test_y, pred_y))

129.13976374120173

In [27]:
test.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,host_response_rate,number_of_reviews,y,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,room_type_Private room,room_type_Shared room
20499,20499,6,2,3,0,0,500.0,0,1,0,0,0,0
35590,35590,5,2,2,0,0,450.0,0,1,0,0,0,0
36928,36928,2,1,1,0,0,65.0,0,1,0,0,1,0
7213,7213,6,1,1,0,0,180.0,1,0,0,0,0,0
12771,12771,8,1,2,0,0,168.0,0,0,0,0,0,0


## 検証データで実行

In [28]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk3')

In [29]:
valid.shape

(18528, 12)

## 元ファイルと結果をマージ

In [32]:
train_pd = pd.read_csv('../000_data/train.csv')

In [41]:
test_X_tmp.head()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,host_response_rate,number_of_reviews,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,room_type_Private room,room_type_Shared room
20499,20499,6,2,3,0,0,0,1,0,0,0,0
35590,35590,5,2,2,0,0,0,1,0,0,0,0
36928,36928,2,1,1,0,0,0,1,0,0,1,0
7213,7213,6,1,1,0,0,1,0,0,0,0,0
12771,12771,8,1,2,0,0,0,0,0,0,0,0


In [94]:
train_val = pd.DataFrame(test_X_tmp.id)

In [95]:
train_val = train_val.reset_index()

In [96]:
train_val.head()

Unnamed: 0,index,id
0,20499,20499
1,35590,35590
2,36928,36928
3,7213,7213
4,12771,12771


In [111]:
pred_pd = pd.DataFrame(pred_y)

In [112]:
type(train_val), type(pred_pd)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [113]:
train_val = pd.concat([train_val, pred_pd], axis=1)

In [114]:
train_val.columns

Index(['index', 'id', 'y', 0], dtype='object')

In [115]:
train_val = train_val.rename(columns={0 : 'y_'})

In [116]:
train_val.columns

Index(['index', 'id', 'y', 'y_'], dtype='object')

In [118]:
result_test = pd.merge(train_pd, train_val[['id', 'y_']], on='id', how='inner')

In [119]:
result_test.head()

Unnamed: 0,id,accommodates,amenities,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,cleaning_fee,...,name,neighbourhood,number_of_reviews,property_type,review_scores_rating,room_type,thumbnail_url,zipcode,y,y_
0,4,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.0,Real Bed,1.0,1.0,strict,NYC,t,...,Charming 1-bedroom - UWS Manhattan,Upper West Side,5,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/92879730/5...,10024.0,165.0,149.676086
1,6,2,"{Internet,""Wireless Internet"",Kitchen,""Free pa...",1.5,Real Bed,1.0,1.0,moderate,NYC,t,...,Cozy Artist Bedroom less than 30 min to Manhattan,Bedford-Stuyvesant,65,House,91.0,Private room,https://a0.muscache.com/im/pictures/79595629/9...,11233.0,48.0,84.577645
2,31,4,"{TV,""Wireless Internet"",""Air conditioning"",""Wh...",1.0,Real Bed,1.0,2.0,strict,NYC,t,...,TriBeCa/City Hall 1 BR with view,Chinatown,2,Apartment,100.0,Entire home/apt,https://a0.muscache.com/im/pictures/7f3ca728-f...,10013.0,135.0,184.78319
3,34,16,"{Internet,""Wireless Internet"",""Air conditionin...",5.5,Real Bed,5.0,10.0,strict,LA,f,...,Beverly Hills Mansion,,0,House,,Entire home/apt,,90211.0,1995.0,943.857143
4,38,3,"{TV,Internet,""Wireless Internet"",Kitchen,""Indo...",1.5,Real Bed,1.0,1.0,strict,SF,t,...,"Gorgeous Mission Loft, Great Light!",Mission District,22,Loft,98.0,Entire home/apt,https://a0.muscache.com/im/pictures/105786272/...,94110.0,399.0,149.676086


In [120]:
result_test.describe()

Unnamed: 0,id,accommodates,bathrooms,bedrooms,beds,latitude,longitude,number_of_reviews,review_scores_rating,y,y_
count,11074.0,11074.0,11074.0,11074.0,11064.0,11074.0,11074.0,11074.0,8560.0,11074.0,11074.0
mean,27725.730269,3.167509,1.235326,1.263049,1.709689,38.433408,-92.610194,20.76955,94.099416,159.450515,160.969473
std,16047.460856,2.178113,0.587448,0.854477,1.270712,3.101306,21.752794,38.174252,7.739058,166.803644,111.841123
min,4.0,1.0,0.0,0.0,1.0,33.707014,-122.5115,0.0,20.0,1.0,51.773913
25%,13777.25,2.0,1.0,1.0,1.0,34.122174,-118.345062,1.0,92.0,75.0,84.577645
50%,27857.5,2.0,1.0,1.0,1.0,40.66294,-76.99882,5.0,96.0,112.0,149.676086
75%,41482.5,4.0,1.0,1.0,2.0,40.746812,-73.954129,23.0,100.0,185.0,184.78319
max,55570.0,16.0,8.0,10.0,16.0,42.389682,-71.001769,532.0,100.0,1995.0,1650.0


In [121]:
result_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11074 entries, 0 to 11073
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      11074 non-null  int64  
 1   accommodates            11074 non-null  int64  
 2   amenities               11074 non-null  object 
 3   bathrooms               11074 non-null  float64
 4   bed_type                11074 non-null  object 
 5   bedrooms                11074 non-null  float64
 6   beds                    11064 non-null  float64
 7   cancellation_policy     11074 non-null  object 
 8   city                    11074 non-null  object 
 9   cleaning_fee            11074 non-null  object 
 10  description             11074 non-null  object 
 11  first_review            8686 non-null   object 
 12  host_has_profile_pic    11041 non-null  object 
 13  host_identity_verified  11041 non-null  object 
 14  host_response_rate      8392 non-null 

In [123]:
result_test.columns

Index(['id', 'accommodates', 'amenities', 'bathrooms', 'bed_type', 'bedrooms',
       'beds', 'cancellation_policy', 'city', 'cleaning_fee', 'description',
       'first_review', 'host_has_profile_pic', 'host_identity_verified',
       'host_response_rate', 'host_since', 'instant_bookable', 'last_review',
       'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews',
       'property_type', 'review_scores_rating', 'room_type', 'thumbnail_url',
       'zipcode', 'y', 'y_'],
      dtype='object')

In [125]:
result_test = result_test.dropna(subset=['y'])

In [129]:
result_test['diff'] = result_test.y - result_test.y_

In [132]:
result_test['diff(abs)'] = np.abs(result_test['diff'])

In [133]:
result_test.shape

(11074, 32)

In [134]:
result_test.to_csv('test_result.csv')

In [136]:
result_test.to_excel('test_result.xlsx')

In [28]:
# ID の保存
valid_pass = valid.id.values

In [29]:
valid_X = valid.iloc[:, 1:].values

In [31]:
# valid_X.describe()

In [32]:
valid_X.shape, train_X.shape

((18528, 11), (44295, 11))

In [33]:
pred_valid_y = DT.predict(valid_X)

In [34]:
pred_valid_y.shape

(18528,)

In [35]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [36]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [37]:
result_df.head()

Unnamed: 0,y
0,266.6469
1,149.676086
2,84.577645
3,184.78319
4,149.676086


In [38]:
result_df.to_csv("./tree_3.csv", header=False)