In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
# Read the labelled training set; the path is relative to the notebook's working directory.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first rows to check column names and value formats.
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### Train / validation split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split repeatable.
train, valid = train_test_split(
    data,
    random_state=42,
    test_size=0.3,
)

In [6]:
# Summarise dtypes and non-null counts of the training split to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 9069 to 7270
Data columns (total 20 columns):
Id               7000 non-null int64
DistrictId       7000 non-null int64
Rooms            7000 non-null float64
Square           7000 non-null float64
LifeSquare       5514 non-null float64
KitchenSquare    7000 non-null float64
Floor            7000 non-null int64
HouseFloor       7000 non-null float64
HouseYear        7000 non-null int64
Ecology_1        7000 non-null float64
Ecology_2        7000 non-null object
Ecology_3        7000 non-null object
Social_1         7000 non-null int64
Social_2         7000 non-null int64
Social_3         7000 non-null int64
Healthcare_1     3642 non-null float64
Helthcare_2      7000 non-null int64
Shops_1          7000 non-null int64
Shops_2          7000 non-null object
Price            7000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.1+ MB


In [7]:
# Mean LifeSquare over training rows where Rooms equals 2 (missing values are skipped by mean()).
train.loc[train['Rooms'].eq(2), 'LifeSquare'].mean()

35.45968010086168

### Model

In [8]:
from sklearn.ensemble import RandomForestRegressor as RF

In [9]:
# Columns passed to the model as predictors, in the order they are fed to fit/predict.
feats = [
    'Rooms',
    'Square',
    'Floor',
    'HouseYear',
    'DistrictId',
    'HouseFloor',
]

In [10]:
# Random forest with 20 trees, each limited to depth 12; seeded so repeated fits agree.
model = RF(
    random_state=42,
    max_depth=12,
    n_estimators=20,
)

In [11]:
# Fit on the selected feature columns against the Price target.
# fit() returns the estimator, so its repr is what the cell displays.
model.fit(train[feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
# In-sample predictions, used later to compare against the validation score.
pred_train = model.predict(train[feats])

In [13]:
# Echo the training-set predictions to eyeball their scale.
pred_train

array([ 95178.38700789, 219245.59159369, 174882.6915718 , ...,
       238787.57937341, 192976.22752773, 385002.842161  ])

In [14]:
# Sanity check: there should be one prediction per training row.
pred_train.shape

(7000,)

In [15]:
# Predictions on the held-out validation rows, which the model did not see during fit.
pred_valid = model.predict(valid[feats])

In [16]:
# Echo the validation predictions to eyeball their scale.
pred_valid

array([204590.11857644, 352074.05992298, 217154.17621543, ...,
       213564.03005214, 146239.17833237, 290014.83848467])

In [17]:
# Sanity check: there should be one prediction per validation row.
pred_valid.shape

(3000,)

### Evaluate model

In [18]:
from sklearn.metrics import r2_score as r2

In [19]:
# R^2 on the data the model was fitted to (an optimistic, in-sample figure).
r2(y_true=train['Price'], y_pred=pred_train)

0.8812650983395305

In [20]:
# R^2 on the held-out rows; compare with the training score to gauge overfitting.
r2(y_true=valid['Price'], y_pred=pred_valid)

0.6581863311960889

### Test-set predictions

In [21]:
# Read the unlabelled test set; the path is relative to the notebook's working directory.
test = pd.read_csv('test.csv')

In [22]:
# Preview the first rows to confirm the test columns line up with the training columns.
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [23]:
# Row and column count of the test set.
test.shape

(5000, 19)

In [29]:
# Store the model's predictions on the test frame as a new Price column.
test['Price'] = model.predict(test[feats])

In [30]:
# Write the submission file containing only the Id and Price columns.
# to_csv documents `index` as a bool; index=False states the intent explicitly,
# whereas None suppresses the row index only because it happens to be falsy.
test.loc[:, ['Id', 'Price']].to_csv('MEgorkin_predictions.csv', index=False)