In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('train.csv')

In [4]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [5]:
data.shape

(10000, 20)

In [6]:
data = data.loc[data['Rooms'] < 10, :]
data = data.loc[data['Price'].between(30000, 600000), :]

In [7]:
data.shape

(9977, 20)

In [8]:
x_year = 1917
data['OldHouse'] = (data['HouseYear'] <= x_year).astype(int)

In [9]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'OldHouse'],
      dtype='object')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9977 entries, 0 to 9999
Data columns (total 21 columns):
Id               9977 non-null int64
DistrictId       9977 non-null int64
Rooms            9977 non-null float64
Square           9977 non-null float64
LifeSquare       7867 non-null float64
KitchenSquare    9977 non-null float64
Floor            9977 non-null int64
HouseFloor       9977 non-null float64
HouseYear        9977 non-null int64
Ecology_1        9977 non-null float64
Ecology_2        9977 non-null object
Ecology_3        9977 non-null object
Social_1         9977 non-null int64
Social_2         9977 non-null int64
Social_3         9977 non-null int64
Healthcare_1     5185 non-null float64
Helthcare_2      9977 non-null int64
Shops_1          9977 non-null int64
Shops_2          9977 non-null object
Price            9977 non-null float64
OldHouse         9977 non-null int64
dtypes: float64(8), int64(10), object(3)
memory usage: 1.7+ MB


In [11]:
square_mean_1 = data.loc[data['Rooms']<=1, 'Square'].mean()

In [12]:
square_mean_3 = data.loc[data['Rooms']==3, 'Square'].mean()

In [13]:
data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']<=1), 'Square'] = square_mean_1

data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']==3), 'Square'] = square_mean_3

data.loc[(data['Square'] > 15) & (data['LifeSquare'] < 15), 'LifeSquare'] = data['Square']

data.loc[data['Square'] < data['LifeSquare'], 'LifeSquare'] = data['Square']

data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9977 entries, 0 to 9999
Data columns (total 21 columns):
Id               9977 non-null int64
DistrictId       9977 non-null int64
Rooms            9977 non-null float64
Square           9977 non-null float64
LifeSquare       9977 non-null float64
KitchenSquare    9977 non-null float64
Floor            9977 non-null int64
HouseFloor       9977 non-null float64
HouseYear        9977 non-null int64
Ecology_1        9977 non-null float64
Ecology_2        9977 non-null object
Ecology_3        9977 non-null object
Social_1         9977 non-null int64
Social_2         9977 non-null int64
Social_3         9977 non-null int64
Healthcare_1     5185 non-null float64
Helthcare_2      9977 non-null int64
Shops_1          9977 non-null int64
Shops_2          9977 non-null object
Price            9977 non-null float64
OldHouse         9977 non-null int64
dtypes: float64(8), int64(10), object(3)
memory usage: 2.0+ MB


In [14]:
data = data.drop('Healthcare_1', axis=1)

In [15]:
data = pd.get_dummies(data)

In [16]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'OldHouse',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B'],
      dtype='object')

In [17]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [18]:
train.shape

(6983, 23)

In [19]:
valid.shape

(2994, 23)

In [20]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'OldHouse',
       'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A',
       'Shops_2_B'],
      dtype='object')

In [21]:
data['DistrictId'].nunique()

205

In [22]:
train['Square_2'] = train['Square'] ** 2
valid['Square_2'] = valid['Square'] ** 2

In [23]:
train['LifeSquare_2'] = train['LifeSquare'] ** 2
valid['LifeSquare_2'] = valid['LifeSquare'] ** 2

In [24]:
district_stat = train.groupby(['DistrictId', 'Rooms'])[['Price']].mean().reset_index().rename(columns={'Price':'mean_price'})

In [25]:
district_stat.head()

Unnamed: 0,DistrictId,Rooms,mean_price
0,0,1.0,157397.516309
1,0,2.0,201344.735483
2,0,3.0,263525.24692
3,1,1.0,147719.769156
4,1,2.0,200711.00362


In [26]:
district_stat.shape

(630, 3)

In [27]:
train = pd.merge(train, district_stat, on=['DistrictId', 'Rooms'], how='left')

In [28]:
train['mean_price'].isnull().sum()

0

In [29]:
valid = pd.merge(valid, district_stat, on=['DistrictId', 'Rooms'], how='left')

In [30]:
valid['mean_price'].isnull().sum()

51

In [31]:
room_stat = train.groupby(['Rooms'])[['Price']].mean().reset_index().rename(columns={'Price': 'mean_price2'})

In [32]:
train = pd.merge(train, room_stat, on=['Rooms'], how='left')

In [33]:
valid = pd.merge(valid, room_stat, on='Rooms', how='left')

In [34]:
train.loc[train['Rooms'] == 5, 'Price'].mean()

385104.7267524139

In [35]:
valid.loc[valid['mean_price2'].isnull(), :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,LifeSquare_2,mean_price,mean_price2
1499,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,...,0,1,0,1,0,1,3530.06306,1497.863724,,


In [36]:
valid.loc[valid['Rooms'] == 6, 'mean_price2'] = train.loc[train['Rooms'] == 5, 'Price'].mean()

In [37]:
valid['mean_price'] = valid['mean_price'].fillna(valid['mean_price2'])

In [38]:
valid.loc[valid['Rooms'] == 6, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_2,LifeSquare_2,mean_price,mean_price2
1499,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,...,0,1,0,1,0,1,3530.06306,1497.863724,385104.726752,385104.726752


In [39]:
fts = ['Square', 'LifeSquare', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Ecology_2_A',
       'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B', 
       'KitchenSquare', 'Floor', 'mean_price', 'Rooms', 'OldHouse']

In [40]:
from sklearn.ensemble import RandomForestRegressor as RF

In [41]:
max_r2_valid = 0
max_depth_best = None
for i in range(3, 17):
    print('max_depth = {}'.format(i))
    rf = RF(n_estimators=10, max_depth=i, min_samples_leaf=2, random_state=42)
    rf.fit(train.loc[:, fts], train['Price'])
    pred = rf.predict(train.loc[:, fts])
    print(r2(train['Price'], pred))
    pred_valid = rf.predict(valid.loc[:, fts])
    r2_valid = r2(valid['Price'], pred_valid)
    print(r2_valid)
    print()
    if r2_valid > max_r2_valid:
        max_r2_valid = r2_valid
        max_depth_best = i
        
print('max_depth_best: {}, max_r2_valid: {}'.format(max_depth_best, max_r2_valid))

max_depth = 3
0.6861136266508007
0.5760178219697683

max_depth = 4
0.7259363645593668
0.6064689001677606

max_depth = 5
0.7583180166879
0.626187428665237

max_depth = 6
0.7865527111325338
0.6423039318210199

max_depth = 7
0.8130751168043284
0.6541218519661383

max_depth = 8
0.8383353578672608
0.6612587079535297

max_depth = 9
0.8600166493324634
0.6657280462999102

max_depth = 10
0.8786747058617055
0.6699732108808931

max_depth = 11
0.8939362042527066
0.6688745057919756

max_depth = 12
0.9065022251151533
0.6697985195756543

max_depth = 13
0.9160701188959697
0.6688204038025984

max_depth = 14
0.9224905729674042
0.6698725021555113

max_depth = 15
0.9265279526900908
0.6690918648609563

max_depth = 16
0.9299509285403016
0.6667884115285516

max_depth_best: 10, max_r2_valid: 0.6699732108808931


#### RF

In [42]:
from sklearn.ensemble import RandomForestRegressor as RF

rf = RF(n_estimators=20, max_depth=10, min_samples_leaf=2, random_state=42)

In [43]:
rf.fit(train.loc[:, fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [44]:
pred = rf.predict(train.loc[:, fts])

In [45]:
pred

array([399072.7859086 , 243796.11905026, 350700.84553187, ...,
       243861.07932366, 165155.99812563, 161329.27386174])

In [46]:
r2(train['Price'], pred)

0.8840280102884747

In [47]:
pred_valid = rf.predict(valid.loc[:, fts])

In [48]:
pred_valid

array([203039.79526812, 162995.55865776, 251824.58570957, ...,
       182842.62499293, 230839.94089806, 131975.66711111])

In [49]:
r2(valid['Price'], pred_valid)

0.6722250006669932

#### Предсказание на тесте

In [50]:
test = pd.read_csv('test.csv')

In [51]:
test.shape

(5000, 19)

In [52]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [53]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [54]:
test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']<=1), 'Square'] = square_mean_1

test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']==3), 'Square'] = square_mean_3

test.loc[(test['Square'] > 15) & (test['LifeSquare'] < 15), 'LifeSquare'] = test['Square']

test.loc[test['Square'] < test['LifeSquare'], 'LifeSquare'] = test['Square']

test['LifeSquare'] = test['LifeSquare'].fillna(data['Square'])

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       4999 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [55]:
test['LifeSquare'] = test['LifeSquare'].fillna(data['LifeSquare'].mean())

In [56]:
test = pd.get_dummies(test)

In [57]:
pred_test = rf.predict(test.loc[:, fts])

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
pred_test

In [None]:
pred_test.shape

In [None]:
test['Price'] = pred_test

In [None]:
test.head()

In [None]:
#test.loc[:, ['Id', 'Price']].to_csv('output/SShirkin_predictions.csv', index=None)