In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
matplotlib.rcParams.update({'font.size': 14})

#### Блок функций 

In [48]:
def output_unique_column_values(df, sign_list=[]):
    """
    Функция вывода уникальных значений признаков:
        df - Обязательный аргумент, перередавать DataFrame
        sign_list - Опцианальный аргумент, если аргумент не передается в функцию
            то выводяться уникальные значения всех признаком переданого датафрейма
        sign_list - В случаее передачи аргумента в виде строки str будут выведены
            уникальные значения данного признака в переданом датафрейме
        sign_list - В случае передачи списка признаков будут выведены уникальные значения 
            переданных признаков
    """
    
    if sign_list ==[]:
        sign_list = df.columns.tolist() 
        for sign in sign_list:
            print(f'{str(sign)}\n\n{str(df[sign].value_counts())}\n\n'
                  f'Length: {len(df[sign].value_counts())}\n{"*" * 100}\n')
    elif type(sign_list) is str: 
        print(f'{str(sign_list)}\n\n{str(df[sign_list].value_counts())}\n\n'
              f'Length: {len(df[sign_list].value_counts())}\n{"*" * 100}\n')
    else:
        for sign in sign_list:
            print(f'{str(sign)}\n\n{str(df[sign].value_counts())}\n\n'
                  f'Length: {len(df[sign].value_counts())}\n{"*" * 100}\n')
            
def quantile_data_cropping(df, sign_list, quantile_start=.01, quantile_stop=.99):
    """
    Функция обрезки данных по квантилям:
        df - Обязательный аргумент, перередавать DataFrame
        quantile_start - Опцианальный аргумент, начало квантили  default - quantile_start=.01, перередавать float 
        quantile_stop - Опцианальный аргумент, default - quantile_stop=.99, перередавать float 
        sign_list - Обязательный аргумент, [передать список признаков] данные будут скоректированы по переданным квантилям
    """
    for sign in sign_list:     
        df = df[df[sign].isnull() |
                (df[sign] < df[sign].quantile(quantile_stop)) &
                (df[sign] > df[sign].quantile(quantile_start))]
    return df
    


def preparation_signs_floor_housefloor(df, max_housefloor=55):
    """
    Функция подготовки признаков ['Floor'],['HouseFloor']:
        - type(df): pandas.core.frame.DataFrame
        - этаж (['Floor'] = 0) =1
        - этаж больше этажности дома ['Floor'] > ['HouseFloor']
        (поменять местами ['Floor'], ['HouseFloor'] = ['HouseFloor'], ['Floor'])
        - этажность дома (['HouseFloor'] = 0)|(['HouseFloor'] > 55) =['Floor']
        - этаж (['Floor'] > 55) =['HouseFloor']
        
    """
    df.loc[df.Floor==0,'Floor'] = 1
    id_row_0max_f = df.loc[(df.Floor > max_housefloor)].index .tolist()   
    df.loc[id_row_0max_f,'Floor'] = df.loc[id_row_0max_f,'HouseFloor']
    id_row_0max_hf = df.loc[(df.HouseFloor==0) | (df.HouseFloor > max_housefloor)].index.tolist() 
    df.loc[id_row_0max_hf,'HouseFloor'] = df.loc[id_row_0max_hf,'Floor']
    id_row_fhf = df.loc[df.Floor > df.HouseFloor].index.tolist() 
    df.loc[id_row_fhf,'Floor'], df.loc[id_row_fhf,'HouseFloor'] = df.loc[id_row_fhf,'HouseFloor'], df.loc[id_row_fhf,'Floor']    
    return df


def preparation_signs_lfsquare(df, min_ksq=10):
    """
    Функция подготовки признаков ['LifeSquare']:
    """
    df.loc[df.LifeSquare < min_ksq, 'LifeSquare'] = min_ksq
    square_med_diff = (df.loc[df.LifeSquare.notnull(), 'Square'] - \
                      df.loc[df.LifeSquare.notnull(), 'LifeSquare'] - \
                      df.loc[df.LifeSquare.notnull(), 'KitchenSquare']).median()
    df.loc[df.LifeSquare.isnull(), 'LifeSquare'] = (df.loc[df.LifeSquare.isnull(), 'Square'] - \
                                                   df.loc[df.LifeSquare.isnull(), 'KitchenSquare'] - \
                                                   square_med_diff)
    
    return df


def preparation_signs_healthcare_1(df):
    """
    Функция подготовки признаков ['Healthcare_1']:
    """
    dist_hc1 = df.groupby(['DistrictId'])[['Healthcare_1']].median()
    dist_hc1['Healthcare_1'] = dist_hc1['Healthcare_1'].fillna(dist_hc1['Healthcare_1'].mean())
    for idx in dist_hc1.index:
        df.loc[(pd.isna(df['Healthcare_1'])) & (df['DistrictId'] == idx),'Healthcare_1'] = df_hc_1.loc[idx,'Healthcare_1']
    return df




def preparation_signs_ksquare(df, min_ksq=3, max_ksq=25):
    df.loc[df.KitchenSquare > max_ksq, 'KitchenSquare'] = df.KitchenSquare.median()
    df.loc[df.KitchenSquare < min_ksq, 'KitchenSquare'] = min_ksq
    return df



def preparation_signs_rooms(df, max_rooms=5):
    """
    Функция подготовки признаков ['Rooms']:
        lsq_room - оношение жилой площади к количеству комнат
    """
    lsq_room = (df.LifeSquare/df.Rooms).mean()    
    id_row = df.loc[(df.Rooms == 0)|(df.Rooms > max_rooms)].index.tolist()   
    df.loc[id_row, 'Rooms'] = round(df.loc[id_row,'LifeSquare']/lsq_room)
    return df
 


### Блок загрузки и обзор описания данных

In [3]:
TRAIN = 'train.csv'
TEST = 'test.csv'

In [4]:
train_df = pd.read_csv(TRAIN)
train_df.tail()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
9995,77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.13565,B,B,46,7960,6,350.0,3,11,B,196684.31604
9996,6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.0,B,B,30,5562,0,,0,5,A,189050.289571
9997,5123,27,1.0,47.939008,,1.0,12,16.0,2015,0.072158,B,B,2,629,1,,0,0,A,159143.80537
9998,5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,B,A,30,5048,9,325.0,2,5,B,181595.339808
9999,6306,128,1.0,38.666645,21.157874,8.0,7,17.0,1990,0.0,B,B,27,4798,0,30.0,2,8,B,218714.077615


In [5]:
test_df = pd.read_csv(TEST)
test_df.tail()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
4995,8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.00017,B,B,36,5992,0,,1,1,B
4996,4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,B,B,1,264,0,,0,1,B
4997,5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,B,B,74,19083,2,,5,15,B
4998,4780,62,2.0,81.305222,,0.0,4,0.0,1977,0.072158,B,B,2,629,1,,0,0,A
4999,12504,30,2.0,60.555693,,1.0,10,17.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B


In [6]:
train_df.shape[1] == test_df.shape[1]+1

True

### Обзор информации о данных
#### Приведение типов

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

Замена типов данных признаков Id, DistrictId на str (категориальные данные) для данных из выборок train_df, test_df

In [8]:
train_df.Id = train_df.Id.astype(str)
train_df.DistrictId = train_df.DistrictId.astype(str)
test_df.Id = train_df.Id.astype(str)
test_df.DistrictId = train_df.DistrictId.astype(str)

In [9]:
train_df.dtypes.value_counts()

float64    8
int64      7
object     5
dtype: int64

In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             5000 non-null   object 
 1   DistrictId     5000 non-null   object 
 2   Rooms          5000 non-null   float64
 3   Square         5000 non-null   float64
 4   LifeSquare     3959 non-null   float64
 5   KitchenSquare  5000 non-null   float64
 6   Floor          5000 non-null   int64  
 7   HouseFloor     5000 non-null   float64
 8   HouseYear      5000 non-null   int64  
 9   Ecology_1      5000 non-null   float64
 10  Ecology_2      5000 non-null   object 
 11  Ecology_3      5000 non-null   object 
 12  Social_1       5000 non-null   int64  
 13  Social_2       5000 non-null   int64  
 14  Social_3       5000 non-null   int64  
 15  Healthcare_1   2623 non-null   float64
 16  Helthcare_2    5000 non-null   int64  
 17  Shops_1        5000 non-null   int64  
 18  Shops_2 

In [11]:
test_df.dtypes.value_counts()

int64      7
float64    7
object     5
dtype: int64

#### Описание датасета по уникальным значениям признаков
 - Идентификационный номер квартиры 
* Id, Length: 10000
 - Идентификационный номер района
* DistrictId, Length: 205 
 - Количество комнат
* Rooms, Length: 9 
 - Площадь 
* Square, Length: 10000 
 - Жилая площадь
* LifeSquare, Length: 7887 
 - Площадь кухни
* KitchenSquare, Length: 58 
 - Этаж
* Floor, Length: 33 
 - Количество этажей в доме
* HouseFloor, Length: 44 
 - Год постройки дома
* HouseYear, Length: 97 
 - Экологические показатели местности
*     Ecology_1, Length: 129
*     Ecology_2, Length: 2
*     Ecology_3, Length: 2
 - Социальные показатели местности
*     Social_1, Length: 51
*     Social_2, Length: 142
*     Social_3, Length: 30
 - Показатели местности, связанные с охраной здоровья
*     Healthcare_1, Length: 79
*     Helthcare_2, Length: 7
 - Показатели, связанные с наличием магазинов, торговых центров
* Shops_1, Length: 16
* Shops_2, Length: 2
 - Цена квартиры
* Price, Length: 10000 

In [12]:
# output_unique_column_values(train_df)

### Подготовка данных

Обработка признака ['HouseYear']: 'Год постройки дома'

In [13]:
train_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [14]:
test_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428
std,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365
min,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


Скоректировать данные по квантилям (.01, .99)

In [15]:
train_df = quantile_data_cropping(train_df, ['Square', 'LifeSquare', 'KitchenSquare'])
test_df = quantile_data_cropping(test_df, ['Square', 'LifeSquare', 'KitchenSquare'])

In [16]:
train_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,8881.0,8881.0,7296.0,8881.0,8881.0,8881.0,8881.0,8881.0,8881.0,8881.0,8881.0,4799.0,8881.0,8881.0,8881.0
mean,1.888076,54.809487,34.830304,6.075329,8.446684,12.826934,4242.532,0.11962,26.071163,5630.480014,8.403783,1156.072932,1.392636,4.438577,214939.479049
std,0.826691,16.225349,14.280535,3.503611,5.133763,6.369206,212757.1,0.119654,17.269698,3977.90281,24.295214,1032.814421,1.492092,4.83734,89654.209673
min,0.0,29.719934,2.941437,1.0,1.0,0.0,1914.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,1.0,41.637555,22.863805,1.0,4.0,9.0,1972.0,0.017647,10.0,2370.0,0.0,325.0,0.0,1.0,156934.440088
50%,2.0,51.656012,32.459931,6.0,7.0,14.0,1977.0,0.075779,25.0,5562.0,2.0,990.0,1.0,3.0,194240.534503
75%,2.0,64.720916,44.119769,9.0,12.0,17.0,2000.0,0.194489,36.0,7614.0,5.0,1548.0,3.0,6.0,250381.58078
max,19.0,117.855969,88.381356,14.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [17]:
test_df.describe()

Unnamed: 0,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,4447.0,4447.0,3652.0,4447.0,4447.0,4447.0,4447.0,4447.0,4447.0,4447.0,4447.0,2408.0,4447.0,4447.0
mean,1.908702,55.150882,35.040202,6.063638,8.554981,12.777828,1984.030358,0.120233,26.145042,5644.405892,8.627839,1161.46304,1.389926,4.431527
std,0.822662,16.344318,14.377073,3.431909,5.405931,6.336609,18.520131,0.119908,17.301028,3998.0224,24.352123,1053.126231,1.481773,4.811164
min,0.0,30.062518,2.819415,1.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,1.0,41.77492,23.190911,4.0,4.0,9.0,1972.0,0.019509,11.0,2370.0,0.0,325.0,0.0,1.0
50%,2.0,52.034775,32.631811,6.0,7.0,12.0,1977.0,0.075779,25.0,5562.0,2.0,900.0,1.0,3.0
75%,2.0,65.322366,44.306318,9.0,12.0,17.0,1999.0,0.195781,37.0,7614.0,5.0,1575.0,3.0,6.0
max,17.0,115.008223,85.621397,13.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [18]:
train_df.loc[(train_df.HouseYear > 2020)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,262.0,3,6,B,254084.534396
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,2857.0,5,8,B,243028.603096


In [19]:
train_df.loc[1497, 'HouseYear'] = 2011
train_df.loc[4189, 'HouseYear'] = 1968
train_df.loc[[1497, 4189]]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,2011,0.13633,B,B,30,6141,10,262.0,3,6,B,254084.534396
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,1968,0.319809,B,B,25,4756,16,2857.0,5,8,B,243028.603096


In [20]:
test_df.loc[(test_df.HouseYear > 2020)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2


Обработка признака ['HouseFloor'], ['Floor']: 'Количество этажей в доме', 'Этаж'

In [21]:
train_df.loc[train_df.Floor > train_df.HouseFloor]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
14,10953,27,1.0,53.769097,52.408027,1.0,5,4.0,1977,0.072158,B,B,2,629,1,,0,0,A,140647.565937
16,2119,27,2.0,49.360648,31.993964,5.0,6,5.0,1983,0.051815,B,B,5,1227,0,,0,0,B,117000.381287
21,11935,27,2.0,64.711835,,1.0,15,1.0,1977,0.211401,B,B,9,1892,0,,0,1,B,127200.026511
35,6486,200,3.0,85.280389,58.447967,9.0,6,5.0,1960,0.000000,B,B,33,7425,1,,2,5,B,402871.916317
51,10103,94,1.0,35.280894,23.354176,6.0,11,9.0,1971,0.282798,B,B,33,8667,2,,0,6,B,148862.210174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9984,2237,52,3.0,64.074977,47.633428,6.0,11,9.0,1983,0.371149,B,B,34,7065,1,750.0,2,5,B,226434.880156
9985,16159,27,1.0,38.968338,,1.0,10,1.0,1977,0.211401,B,B,9,1892,0,,0,1,B,95529.569405
9989,4145,96,3.0,90.055233,47.860231,11.0,26,24.0,2005,0.041116,B,B,53,14892,4,,1,4,B,441108.911237
9991,11553,3,3.0,101.622794,51.628780,12.0,18,17.0,2000,0.265089,B,B,37,5288,0,1937.0,3,2,B,212883.501504


In [22]:
test_df.loc[test_df.Floor > test_df.HouseFloor]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
1,15053,41,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
10,15370,28,3.0,64.543658,42.651541,7.0,10,9.0,1981,0.300323,B,B,52,10311,6,,1,9,B
14,10953,27,1.0,37.555197,,1.0,5,1.0,1977,0.211401,B,B,9,1892,0,,0,1,B
27,7550,23,1.0,48.610661,48.752502,1.0,4,3.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B
36,11703,23,1.0,37.480811,16.851795,8.0,6,5.0,1933,0.169091,B,B,19,3856,10,,2,5,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4968,9544,46,4.0,104.664209,66.903563,1.0,4,1.0,2016,0.017647,B,B,2,469,0,,0,0,B
4972,13436,47,1.0,38.376234,37.003311,1.0,2,1.0,1977,0.000078,B,B,22,6398,141,1046.0,3,23,B
4989,8861,61,2.0,44.203592,30.191691,6.0,10,9.0,1972,0.428826,B,B,25,5011,8,730.0,0,2,B
4993,7726,41,2.0,46.627882,27.058739,5.0,6,5.0,1969,0.307467,B,A,30,5048,9,325.0,2,5,B


In [23]:
train_df.loc[(train_df.HouseFloor==0) | (train_df.HouseFloor > 55)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1724,16033,27,2.0,62.794699,36.25991,11.0,9,0.0,1977,0.017647,B,B,2,469,0,,0,0,B,187244.010969
1788,684,27,1.0,39.225233,18.97195,11.0,5,0.0,1977,0.017647,B,B,2,469,0,,0,0,B,139470.380358
2205,11960,34,1.0,38.232839,35.143077,10.0,19,0.0,1977,0.069753,B,B,53,13670,4,,1,11,B,190443.723654
2206,8417,27,2.0,60.452265,36.626949,10.0,5,0.0,1977,0.072158,B,B,2,629,1,,0,0,A,210750.386216
2566,15333,88,2.0,66.328163,,11.0,12,0.0,1977,0.127376,B,B,43,8429,3,,3,9,B,290791.329939
2763,9947,23,1.0,29.879036,31.264387,1.0,12,0.0,1977,0.034656,B,B,0,168,0,,0,0,B,67898.291266
4133,630,6,1.0,40.953789,42.562967,1.0,11,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,118505.473583
5118,9256,74,3.0,113.250848,,1.0,14,0.0,1977,0.075779,B,B,6,1437,3,,0,2,B,372244.403093
5222,4644,27,1.0,45.970814,,9.0,7,0.0,1977,0.072158,B,B,2,629,1,,0,0,A,167988.155442
5730,2417,6,3.0,79.643412,,12.0,8,0.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B,224767.945373


In [24]:
test_df.loc[(test_df.HouseFloor==0) | (test_df.HouseFloor > 55)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
320,11173,6,1.0,45.109346,38.510017,11.0,13,0.0,1977,0.017647,B,B,2,469,0,,0,0,B
383,5085,111,1.0,47.769551,,12.0,11,0.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B
481,11467,119,3.0,76.729191,,1.0,16,0.0,1977,0.007122,B,B,1,264,0,,0,1,B
651,6988,9,3.0,47.722835,47.098813,9.0,18,99.0,1977,0.072158,B,B,2,629,1,,0,0,A
966,8571,31,2.0,54.209252,31.161955,8.0,6,0.0,1998,0.092291,B,B,21,4346,2,165.0,1,2,B
1602,7518,84,3.0,89.360253,52.70285,11.0,13,0.0,1977,0.072158,B,B,2,629,1,,0,0,A
2524,6725,134,0.0,76.345154,42.820796,12.0,14,0.0,1977,0.017647,B,B,2,469,0,,0,0,B
2555,13812,1,1.0,38.658713,41.832877,1.0,5,0.0,1977,0.007122,B,B,1,264,0,,0,1,B
3041,14022,41,1.0,49.112916,,1.0,5,0.0,1977,0.007122,B,B,1,264,0,,0,1,B
3530,3273,89,2.0,64.437824,,7.0,8,0.0,1977,0.017647,B,B,2,469,0,,0,0,B


In [25]:
train_df.loc[(train_df.Floor ==0)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price


In [26]:
train_df.loc[(train_df.Floor > 55)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price


In [27]:
test_df.loc[(test_df.Floor > 55)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
4698,14330,138,2.0,57.60187,37.744743,10.0,78,22.0,1989,0.0,B,B,25,5027,4,46.0,1,1,B


In [28]:
train_df = preparation_signs_floor_housefloor(train_df)
test_df = preparation_signs_floor_housefloor(test_df)

Обработка признака ['KitchenSquare'], ['LifeSquare']: 'Площадь кухни', 'Жилая площадь'

In [29]:
print(f'train_df.LifeSquare.median: {train_df.LifeSquare.median()}\n'
      f'test_df.LifeSquare.median: {test_df.LifeSquare.median()}\n'
      f'train_df.KitchenSquare.median: {train_df.KitchenSquare.median()}\n'
      f'test_df.KitchenSquare.median: {test_df.KitchenSquare.median()}')

train_df.LifeSquare.median: 32.459930868446605
test_df.LifeSquare.median: 32.631810662086764
train_df.KitchenSquare.median: 6.0
test_df.KitchenSquare.median: 6.0


In [30]:
train_df = preparation_signs_ksquare(train_df)
train_df = preparation_signs_lfsquare(train_df)

In [31]:
test_df = preparation_signs_ksquare(test_df)
ptest_df = preparation_signs_lfsquare(test_df)

In [32]:
print(f'train_df.LifeSquare.median: {train_df.LifeSquare.median()}\n'
      f'test_df.LifeSquare.median: {test_df.LifeSquare.median()}\n'
      f'train_df.KitchenSquare.median: {train_df.KitchenSquare.median()}\n'
      f'test_df.KitchenSquare.median: {test_df.KitchenSquare.median()}')

train_df.LifeSquare.median: 33.273699979715275
test_df.LifeSquare.median: 33.587619525656585
train_df.KitchenSquare.median: 6.0
test_df.KitchenSquare.median: 6.0


Обработка признака ['Rooms'], ['Square']: 'Количество комнат', 'Площадь '

In [44]:
train_df.loc[(train_df.Rooms == 0)|(train_df.Rooms > 5)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,0.0,59.056975,36.223072,10.0,22.0,22.0,2002,0.090799,B,B,74,19083,2,,5,15,B,317265.323792
1454,8491,1,0.0,42.006046,21.779288,7.0,17.0,17.0,2014,0.007122,B,B,1,264,0,,0,1,B,78364.616704
2170,14003,99,0.0,59.414334,38.702244,6.0,7.0,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B,229661.964416
6149,3159,88,0.0,38.697117,19.345131,9.0,9.0,16.0,1982,0.127376,B,B,43,8429,3,,3,9,B,158998.110646
8849,14865,9,0.0,60.871266,38.420681,10.0,2.0,3.0,1994,0.161532,B,B,25,5648,1,30.0,2,4,B,172329.270863


In [50]:
train_df = preparation_signs_rooms(train_df)

In [51]:
train_df.loc[(train_df.Rooms == 0)|(train_df.Rooms > 5)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,0.0,59.056975,36.223072,10.0,22.0,22.0,2002,0.090799,B,B,74,19083,2,,5,15,B,317265.323792
1454,8491,1,0.0,42.006046,21.779288,7.0,17.0,17.0,2014,0.007122,B,B,1,264,0,,0,1,B,78364.616704
2170,14003,99,0.0,59.414334,38.702244,6.0,7.0,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B,229661.964416
6149,3159,88,0.0,38.697117,19.345131,9.0,9.0,16.0,1982,0.127376,B,B,43,8429,3,,3,9,B,158998.110646
8849,14865,9,0.0,60.871266,38.420681,10.0,2.0,3.0,1994,0.161532,B,B,25,5648,1,30.0,2,4,B,172329.270863


In [52]:
test_df.loc[(test_df.Rooms == 0)|(test_df.Rooms > 5)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2524,6725,134,0.0,76.345154,42.820796,12.0,14.0,14.0,1977,0.017647,B,B,2,469,0,,0,0,B
3398,8371,57,0.0,52.866107,32.528342,8.0,15.0,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


In [53]:
test_df = preparation_signs_rooms(test_df)

In [54]:
test_df.loc[(test_df.Rooms == 0)|(test_df.Rooms > 5)]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
2524,6725,134,0.0,76.345154,42.820796,12.0,14.0,14.0,1977,0.017647,B,B,2,469,0,,0,0,B
3398,8371,57,0.0,52.866107,32.528342,8.0,15.0,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


Создание дополнительных признаков

#### Создание новых признаков на основе признаков с одинаковым количеством уникальных значений

Ecology_2, Ecology_3, Shops_2 --> Eco2_Eco3_Shops2 
с заменой категорий: 'B','A' = 0, 1

In [None]:
output_unique_column_values(train_df, ['Ecology_2', 'Ecology_3', 'Shops_2'])

In [None]:
train_df['Ecology_2_bin'] = train_df['Ecology_2'].replace({'A':bin(1), 'B':bin(0)})
train_df['Ecology_3_bin'] = train_df['Ecology_3'].replace({'A':bin(1), 'B':bin(0)})
train_df['Shops_2_bin'] = train_df['Shops_2'].replace({'A':bin(1), 'B':bin(0)})

In [None]:
train_df["Eco2_Eco3_Shops2"] = train_df.Ecology_2 + train_df.Ecology_3 + train_df.Shops_2
train_df["Eco2_Eco3_Shops2_bin"] = train_df.Ecology_2_bin + train_df.Ecology_3_bin + train_df.Shops_2_bin

In [None]:
output_unique_column_values(train_df, sign_list="Eco2_Eco3_Shops2")

In [None]:
output_unique_column_values(train_df, sign_list="Eco2_Eco3_Shops2_bin")

In [None]:
train_df.tail()

### модели

In [None]:
X = train_df.drop('Price', axis=1)
y = pd.DataFrame(train_df.Price)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)