In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Show floats with three digits after the decimal point in DataFrame output.
pd.set_option("display.precision", 3)

#### Загрузим и исследуем исходные данные

In [2]:
# Read the raw training set and preview its first rows.
origin_train = pd.read_csv('train.csv')
origin_train.head(n=5)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.982,29.443,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.931
1,15053,41,3.0,65.684,40.05,8.0,7,9.0,1978,6.999e-05,B,B,46,10309,1,240.0,1,16,B,300009.45
2,4765,53,2.0,44.948,29.198,0.0,8,12.0,1968,0.04964,B,B,34,7759,0,229.0,1,3,B,220925.909
3,5809,58,2.0,53.353,52.732,9.0,8,17.0,1977,0.4379,B,B,23,5735,3,1084.0,0,5,B,175616.227
4,10783,99,1.0,39.649,23.776,7.0,11,12.0,1976,0.01234,B,B,35,5776,1,2078.0,2,4,B,150226.532


In [3]:
# Summary statistics for every column, one row per column.
origin_train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Id,10000.0,,,,8380.0,4860.0,0.0,4170.0,8390.0,12600.0,16800.0
DistrictId,10000.0,,,,50.4,43.6,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,,,,1.89,0.84,0.0,1.0,2.0,2.0,19.0
Square,10000.0,,,,56.3,21.1,1.14,41.8,52.5,65.9,641.0
LifeSquare,7890.0,,,,37.2,86.2,0.371,22.8,32.8,45.1,7480.0
KitchenSquare,10000.0,,,,6.27,28.6,0.0,1.0,6.0,9.0,2010.0
Floor,10000.0,,,,8.53,5.24,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,,,,12.6,6.78,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,,,,3990.0,201000.0,1910.0,1970.0,1980.0,2000.0,20100000.0
Ecology_1,10000.0,,,,0.119,0.119,0.0,0.0176,0.0754,0.196,0.522


In [5]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(
    origin_train,
    test_size=0.3,
    random_state=42,
)

#### Функции для обработки и обучения

In [6]:
def PreProcess(df, source_df = train):
    """Return a feature-engineered version of ``df`` built from ``source_df`` statistics.

    All reference statistics (mean price per district, median room count,
    mean square and mean living-area share per room count) are computed on
    ``source_df`` only, so the same fitted values can be applied to
    validation or test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to transform. The caller's frame is not modified: the first
        merge produces a new frame and all later edits are applied to it.
    source_df : pandas.DataFrame
        Frame the reference statistics come from; it must contain ``Price``.
        A copy is taken before helper columns are added. The default is the
        module-level ``train`` object as bound when this function was defined.

    Returns
    -------
    pandas.DataFrame
        ``df`` with cleaned ``Rooms``/``Square``, numeric ``Shops_2``,
        ``Ecology_2``, ``Ecology_3``, and the added columns
        ``mean_price_district``, ``mean_square``, ``part_life_square``,
        ``part_kitchen_square`` and ``mean_part_life_square``.
    """
    source_df = source_df.copy()

    # Encode the district by its mean price in source_df
    avg_price = source_df.groupby(['DistrictId'])['Price'].mean().reset_index()
    avg_price.columns = ['DistrictId', 'mean_price_district']
    df = pd.merge(df, avg_price, how='left', on=['DistrictId'])

    # Districts that do not occur in source_df get the overall mean price.
    # The value comes from source_df (not from a notebook global), and the
    # result is assigned back: fillna(inplace=True) on a column selection is
    # chained assignment and does not update df under pandas Copy-on-Write.
    df['mean_price_district'] = df['mean_price_district'].fillna(source_df['Price'].mean())

    # Replace room counts of 0 or above 5 with the median of source_df
    rooms_median = source_df['Rooms'].median()
    df.loc[(df['Rooms'] == 0) | (df['Rooms'] > 5), 'Rooms'] = rooms_median

    # A square of exactly 0 is replaced by the mean square for that room count.
    # NOTE(review): source_df itself is not cleaned here, so the per-Rooms
    # means below are computed over whatever Rooms values source_df contains.
    avg_square = source_df.groupby('Rooms')['Square'].mean().reset_index()
    avg_square.columns = ['Rooms', 'mean_square']
    df = pd.merge(df, avg_square, how='left', on=['Rooms'])
    df.loc[(df['Square'] == 0), 'Square'] = df['mean_square']

    # Derived features: living-area and kitchen-area share of the total square
    df['part_life_square'] = df.LifeSquare / df.Square
    df['part_kitchen_square'] = df.KitchenSquare / df.Square

    source_df['part_life_square'] = source_df.LifeSquare / source_df.Square
    source_df['part_kitchen_square'] = source_df.KitchenSquare / source_df.Square

    # Fill missing living-area share with the source_df mean for the same room count
    avg_part_life_square = source_df.groupby('Rooms')['part_life_square'].mean().reset_index()
    avg_part_life_square.columns = ['Rooms', 'mean_part_life_square']
    df = pd.merge(df, avg_part_life_square, how='left', on=['Rooms'])
    df.loc[df['part_life_square'].isnull(), 'part_life_square'] = df['mean_part_life_square']

    # Encode the two-level categorical columns as 1 ('A') / 0 ('B');
    # any other value is mapped to NaN by Series.map
    df['Shops_2'] = df['Shops_2'].map({'A':1, 'B':0})
    df['Ecology_2'] = df['Ecology_2'].map({'A':1, 'B':0})
    df['Ecology_3'] = df['Ecology_3'].map({'A':1, 'B':0})

    return df
    

def fit_model(df, fts, target = 'Price'):
    """Fit a random-forest regressor on the rows of ``df`` that pass basic sanity filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing every column in ``fts`` plus ``target``.
    fts : list of str
        Feature column names. Because the frame is narrowed to
        ``fts + [target]`` before filtering, ``fts`` must include
        ``HouseYear``, ``part_kitchen_square`` and ``part_life_square``.
    target : str, default 'Price'
        Name of the column to predict.

    Returns
    -------
    RandomForestRegressor
        The fitted model. The number of dropped rows and the R2 score on the
        retained training rows are printed as a side effect.
    """
    row_count = df.shape[0]

    # Keep only the model columns and drop incomplete rows. The target column
    # is selected by name so a non-default ``target`` is honoured (it used to
    # be hard-coded as 'Price', which broke any other target at fit time).
    df = df[fts + [target]].dropna()

    # Drop implausible rows: construction year in the future, kitchen at least
    # as large as the flat, living area at least 1.5x the total square
    df = df[ (df['HouseYear'] <= 2020) & \
             (df['part_kitchen_square'] < 1) & \
             (df['part_life_square'] < 1.5) ]

    print(f'Из обучения удалнено {row_count - df.shape[0]} строк')

    rf_model = RandomForestRegressor(  n_estimators= 500
                                     , min_samples_leaf=1
                                     , n_jobs=-1
                                     , random_state=42
                                     , max_features= 3 )

    rf_model.fit(df.loc[:, fts], df[target])

    # R2 on the same rows the model was fitted on (an optimistic estimate)
    pred_train_score = r2_score(df[target], rf_model.predict(df.loc[:, fts]))
    print(f'R2 on train = {pred_train_score}')

    return rf_model


#### Подбор модели

In [7]:
# Prepare data. Results are stored under new names so the raw split
# (train / valid) is left untouched and this cell can be re-run: assigning
# the output back to `train` would feed an already-processed frame into
# PreProcess on the next run.
train_prepared = PreProcess(train, train)
# The processed training frame is the statistics source for validation rows
valid_prepared = PreProcess(valid, train_prepared)

# Feature list for the model
fts = ['mean_price_district','Rooms','Square','part_life_square','part_kitchen_square','HouseYear'
      ,'Floor', 'HouseFloor','Shops_1','KitchenSquare','Ecology_1','Ecology_3'
      ,'Social_1', 'Social_2','Social_3','Helthcare_2','Shops_2']

# fit model
model = fit_model(train_prepared, fts, target = 'Price')

# predict on the hold-out rows
valid_pred = model.predict(valid_prepared[fts])

# Score
pred_valid_score = r2_score(valid_prepared['Price'], valid_pred)
print(f'R2 on valid = {pred_valid_score}')

Из обучения удалнено 11 строк
R2 on train = 0.9672559726924278
R2 on valid = 0.7303205312547274


In [8]:
# Feature importances of the fitted forest, least important first.
(
    pd.DataFrame({'imp': model.feature_importances_, 'name': fts})
    .sort_values(by='imp')
)

Unnamed: 0,imp,name
16,0.001,Shops_2
11,0.002,Ecology_3
15,0.021,Helthcare_2
8,0.026,Shops_1
7,0.028,HouseFloor
10,0.028,Ecology_1
6,0.029,Floor
3,0.035,part_life_square
5,0.041,HouseYear
14,0.047,Social_3


#### Итоговый расчет 

In [9]:
# Reload both raw datasets from disk
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Build features; every reference statistic is taken from the training data
train = PreProcess(df=train, source_df=train)
test = PreProcess(df=test, source_df=train)

# Fit on the full training set
model = fit_model(train, fts, target='Price')

# Predict prices for the test rows
test_pred = model.predict(test.loc[:, fts])

# Write the submission file: one row per test Id with its predicted price
result = pd.DataFrame({'Id': test['Id'].values, 'Price': test_pred})
result.to_csv('IUtkin_preditions.csv', index=False)

Из обучения удалнено 16 строк
R2 on train = 0.966877012695406
