In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
def square(df):
    """Repair suspicious total-area values in place.

    A row is treated as suspicious when its ``Square`` is below 20 and also
    smaller than its ``LifeSquare``; for such rows ``Square`` is overwritten
    with ``LifeSquare``. The same (mutated) frame is returned.
    """
    total_area = df['Square']
    living_area = df['LifeSquare']
    suspicious = (total_area < 20) & (total_area < living_area)
    df.loc[suspicious, 'Square'] = living_area
    return df

In [3]:
def rooms(df):
    """Re-derive the room count from the area where it is 0 or above 5.

    Counts above 5 are first reset to 0. Every row whose count is 0 then gets
    a value from ``Square``: > 115 -> 5, > 88 -> 4, > 66 -> 3, > 48 -> 2,
    otherwise 1. The frame is modified in place and returned.
    """
    df.loc[df['Rooms'] > 5, 'Rooms'] = 0
    # Largest threshold first: once a row is filled it no longer equals 0,
    # so the smaller thresholds cannot overwrite it.
    for min_square, room_count in ((115, 5), (88, 4), (66, 3), (48, 2)):
        unknown = df['Rooms'] == 0
        df.loc[(df['Square'] > min_square) & unknown, 'Rooms'] = room_count
    df.loc[df['Rooms'] == 0, 'Rooms'] = 1
    return df

In [4]:
def house_floor(df):
    """Make ``HouseFloor`` consistent with ``Floor`` in place.

    Step 1: a ``HouseFloor`` of 0 is replaced by the flat's ``Floor``.
    Step 2: where ``Floor`` is still greater than ``HouseFloor``, the house
    height is raised to the flat's floor. Returns the same frame.
    """
    flat_floor = df['Floor']

    height_missing = df['HouseFloor'] == 0
    df.loc[height_missing, 'HouseFloor'] = flat_floor

    # Evaluated after step 1 so freshly filled heights are taken into account.
    above_top = flat_floor > df['HouseFloor']
    df.loc[above_top, 'HouseFloor'] = flat_floor
    return df

In [5]:
def house_year(df):
    """Replace the two known-bad ``HouseYear`` values in place.

    Years 2019 and 2020 are deliberately left alone (treated as flats that
    are planned for completion). Only the exact values 4968 and 20052011 are
    rewritten, to 1968 and 2008 respectively. Returns the same frame.
    """
    corrections = {4968: 1968, 20052011: 2008}
    for bad_year, fixed_year in corrections.items():
        df.loc[df['HouseYear'] == bad_year, 'HouseYear'] = fixed_year
    return df

In [6]:
def to_int(df):
    """Encode the 'A'/other flag columns as integers (1 for 'A', else 0).

    Handles ``Ecology_2``, ``Ecology_3`` and ``Shops_2`` in place and returns
    the same frame.

    A column that already has an integer dtype is left untouched. Without
    this guard a second call would compare the encoded 0/1 values with 'A'
    and silently turn every flag into 0.
    """
    for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
        if pd.api.types.is_integer_dtype(df[column]):
            # Already encoded by an earlier call; re-encoding would zero it.
            continue
        df[column] = (df[column] == 'A').astype(int)
    return df

In [7]:
def prepare(df):
    """Run all cleaning steps on ``df`` in a fixed order and return the result.

    Order: square -> rooms -> house_floor -> house_year -> to_int. ``rooms``
    reads ``Square``, so it runs after ``square`` has corrected that column.
    """
    for step in (square, rooms, house_floor, house_year, to_int):
        df = step(df)
    return df

In [8]:
# Read the training set and clean it in one step.
df = prepare(pd.read_csv('input/train.csv'))

In [9]:
# Hold out 25% of the rows as the final test set. Note that `df` is rebound
# to the remaining 75%, so re-running this cell on its own shrinks it again.
df, test = train_test_split(df, random_state=3, test_size=0.25)
# Split the remainder once more into the fitting and validation parts.
train, valid = train_test_split(df, random_state=344, test_size=0.25)
print(f'train = {train.shape}. valid = {valid.shape}. test = {test.shape}.')

train = (5625, 20). valid = (1875, 20). test = (2500, 20).


In [10]:
# Columns fed to the model. 'Helthcare_2' is spelled exactly as the column
# appears in the data.
feat = [
    'DistrictId',
    'Square',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'Ecology_1',
    'Social_2',
    'Social_3',
    'Helthcare_2',
    'Shops_1',
]

In [11]:
model = RandomForestRegressor(
    # ensemble size and number of features tried per split
    n_estimators=300, max_features=4,
    # limits on how far each tree may grow
    max_depth=24, min_samples_split=5, min_samples_leaf=1,
    min_impurity_decrease=0.1,
    # fixed seed; n_jobs=-1 as in the original call
    random_state=443, n_jobs=-1,
)

In [12]:
# Fit on the training part only, then print the features ordered by
# decreasing importance.
model.fit(train[feat], train['Price'])
print(
    pd.DataFrame(model.feature_importances_, index=feat, columns=['Важность'])
    .nlargest(len(feat), 'Важность')
)

             Важность
Square       0.458526
Social_2     0.128076
Social_3     0.086659
DistrictId   0.076359
HouseYear    0.062839
Ecology_1    0.051053
HouseFloor   0.040665
Floor        0.037108
Shops_1      0.033570
Helthcare_2  0.025145


In [13]:
# R^2 of the current model on each of the three parts, in a fixed order.
r2_train, r2_valid, r2_test = (
    r2_score(part['Price'], model.predict(part[feat]))
    for part in (train, valid, test)
)
print(f'train = {r2_train}. valid = {r2_valid}. test = {r2_test}.')

train = 0.9310147666903942. valid = 0.7506711299029404. test = 0.7531103312431794.


In [14]:
# Refit the same estimator object on `df` (train + valid rows); the earlier
# train-only fit is overwritten. Kept as the last expression so the fitted
# estimator is displayed.
model.fit(df[feat], df['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=24,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.1,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=-1, oob_score=False, random_state=443,
           verbose=0, warm_start=False)

In [15]:
# Scores after the refit on `df`. Both `train` and `valid` are subsets of
# `df`, so their scores are in-sample here; only `test` is still held out.
r2_train, r2_valid, r2_test = [
    r2_score(frame['Price'], model.predict(frame[feat]))
    for frame in (train, valid, test)
]
print(f'train = {r2_train}. valid = {r2_valid}. test = {r2_test}.')

train = 0.9323708483483609. valid = 0.9400401248266173. test = 0.7647775798645988.


In [18]:
# Read the file to predict on and apply the same cleaning as for training.
data = prepare(pd.read_csv('input/test.csv'))

In [19]:
# Show column dtypes and non-null counts of the cleaned prediction frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null int32
Ecology_3        5000 non-null int32
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null int32
dtypes: float64(7), int32(3), int64(9)
memory usage: 683.7 KB


In [20]:
# Predict with the refitted model and store the result as a new column.
data['Price'] = model.predict(data[feat])

In [21]:
# Preview the first rows, including the newly added Price column.
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,0,0,11,2748,1,,0,0,0,161695.725556
1,15856,74,2.0,69.263183,,1.0,6,6.0,1977,0.075779,0,0,6,1437,3,,0,2,0,233108.211406
2,5480,190,1.0,15.948246,15.948246,12.0,2,5.0,1909,0.0,0,0,30,7538,87,4702.0,5,5,0,289154.711984
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,0,0,23,4583,3,,3,3,0,353369.999284
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,0,0,2,629,1,,0,0,1,143921.820028


In [23]:
# Write the submission file with the Id and predicted Price columns.
# `index=False` is the documented boolean form for omitting the row index;
# the previous `index=None` only worked because None is falsy.
data[['Id', 'Price']].to_csv('YKhoroshylov_predictions.csv', index=False)