In [9]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_log_error

In [10]:
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=[1])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=[1])

In [11]:
for df in train_df, test_df:
    df['timestamp_year'] = df['timestamp'].dt.year
    df['timestamp_month'] = df['timestamp'].dt.month
    df['timestamp_day'] = df['timestamp'].dt.day
    df.drop(labels='timestamp', axis=1, inplace=True)

In [12]:
cat_columns = train_df.select_dtypes(include='object').columns
train_df[cat_columns].apply(pd.unique)

product_type                                       [Investment, OwnerOccupier]
sub_area                     [Bibirevo, Nagatinskij Zaton, Tekstil'shhiki, ...
culture_objects_top_25                                               [no, yes]
thermal_power_plant_raion                                            [no, yes]
incineration_raion                                                   [no, yes]
oil_chemistry_raion                                                  [no, yes]
radiation_raion                                                      [no, yes]
railroad_terminal_raion                                              [no, yes]
big_market_raion                                                     [no, yes]
nuclear_reactor_raion                                                [no, yes]
detention_facility_raion                                             [no, yes]
water_1line                                                          [no, yes]
big_road1_1line                                     

In [13]:
train_df['product_type'] = train_df['product_type'].map({'Investment': 0, 'OwnerOccupier':1})
train_df[cat_columns[2:-1]] = train_df[cat_columns[2:-1]].applymap(lambda x: 0 if x=='no' else 1)

eco_map = {'no data': 0,
           'poor': 1,
           'satisfactory': 2,
           'good': 3,
           'excellent': 4,}

train_df['ecology'] = train_df['ecology'].map(eco_map)

In [14]:
train_df[cat_columns].apply(pd.unique)

product_type                                                            [0, 1]
sub_area                     [Bibirevo, Nagatinskij Zaton, Tekstil'shhiki, ...
culture_objects_top_25                                                  [0, 1]
thermal_power_plant_raion                                               [0, 1]
incineration_raion                                                      [0, 1]
oil_chemistry_raion                                                     [0, 1]
radiation_raion                                                         [0, 1]
railroad_terminal_raion                                                 [0, 1]
big_market_raion                                                        [0, 1]
nuclear_reactor_raion                                                   [0, 1]
detention_facility_raion                                                [0, 1]
water_1line                                                             [0, 1]
big_road1_1line                                     

In [15]:
train_df.head()

Unnamed: 0_level_0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc,timestamp_year,timestamp_month,timestamp_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,43,27.0,4.0,,,,,,,0,...,13,22,1,0,52,4,5850000,2011,8,20
2,34,19.0,3.0,,,,,,,0,...,15,29,1,10,66,14,6000000,2011,8,23
3,43,29.0,2.0,,,,,,,0,...,11,27,0,4,67,10,5700000,2011,8,27
4,89,50.0,9.0,,,,,,,0,...,4,4,0,0,26,3,13100000,2011,9,1
5,77,77.0,4.0,,,,,,,0,...,135,236,2,91,195,14,16331452,2011,9,5


In [16]:
test_df.head()

Unnamed: 0_level_0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,timestamp_year,timestamp_month,timestamp_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30474,39.0,20.7,2,9,1,1998.0,1,8.9,3.0,Investment,...,0,1,10,1,0,14,1,2015,7,1
30475,79.2,,8,17,1,0.0,3,1.0,1.0,OwnerOccupier,...,0,2,11,0,1,12,1,2015,7,1
30476,40.5,25.1,3,5,2,1960.0,2,4.8,2.0,Investment,...,0,10,21,0,10,71,11,2015,7,1
30477,62.8,36.0,17,17,1,2016.0,2,62.8,3.0,OwnerOccupier,...,0,0,10,0,0,2,0,2015,7,1
30478,40.0,40.0,17,17,1,0.0,1,1.0,1.0,OwnerOccupier,...,0,2,12,0,1,11,1,2015,7,1


Missing data

In [18]:
idx = train_df.index.tolist()
set(np.arange(idx[0], idx[-1])) - set(idx)

{1252, 1268}

In [19]:
train_df.loc[1251:1253]

Unnamed: 0_level_0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc,timestamp_year,timestamp_month,timestamp_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1251,30,20.0,1.0,,,,,,,0,...,8,30,1,9,63,11,5200000,2012,2,20
1253,50,30.0,8.0,,,,,,,0,...,2,10,1,0,23,3,4125000,2012,2,20


In [20]:
idx = test_df.index.tolist()
set(np.arange(idx[0], idx[-1])) - set(idx)

set()

In [21]:
X_train, y_train = train_df.values[:, 0:-1], train_df.values[:, -1]
X_train.shape, y_train.shape

((30471, 292), (30471,))

In [22]:
X_test= test_df.values
X_test.shape

(7662, 292)

In [None]:
X_train, y_train = train_df.drop(['sub_area'], axis=1).values[:, 0:-1], train_df.values[:, -1]



clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)