# Черновик - содержит различные варианты

In [767]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 100

### 1) Читаем данные

In [768]:
data = pd.read_csv('input/train.csv')

In [769]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [770]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### 2) Разбиваем на трейн и валид

In [771]:
from sklearn.model_selection import train_test_split

In [772]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [773]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price'],
      dtype='object')

### 3) Пишем функции для обработки данных

In [774]:
def clean_kitchen(df):
    df.loc[df['KitchenSquare'] > df['Square'] * 0.4, 'KitchenSquare'] = df['Square'] * 0.15
    return df

In [775]:
def clean_rooms(df):
    df.loc[df['Rooms'] > 5, 'Rooms'] = 5
    return df

In [776]:
def clean_square(df):
    df.loc[df['Square'] < 15, 'Square'] = 15
    df['LifeSquare'] = df['LifeSquare'].fillna(df['Square'] - df['KitchenSquare'])
    df.loc[df['Square'] < df['LifeSquare'] + df['KitchenSquare'], 'Square'] = df['LifeSquare'] + df['KitchenSquare']
    return df

In [777]:
def clean_year(df):
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = 2020
    return df

In [778]:
stat0 = train.groupby(['DistrictId', 'Rooms', 'Social_2'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_drs'})
stat1 = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_dr'})
stat2 = train.groupby(['Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_r'})
mean_price = train['Price'].mean()

In [779]:
def add_mean_price(df, stat0, stat1, stat2, mean_price):
    df = pd.merge(df, stat0, on=['DistrictId', 'Rooms', 'Social_2'], how='left')
    df = pd.merge(df, stat1, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, stat2, on=['Rooms'], how='left')
    df['mean_price_r'] = df['mean_price_r'].fillna(mean_price)
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    df['mean_price_drs'] = df['mean_price_drs'].fillna(df['mean_price_dr'])
    return df

In [780]:
def do_label(df, feature):
    lab_enc = preprocessing.LabelEncoder()
    df.loc[:, feature] = lab_enc.fit_transform(df.loc[:, feature])
    return df
def dummies_handling(df):
    df = pd.get_dummies(df, columns = ['Ecology_2', 'Ecology_3', 'Shops_2'])
    #df = do_label(df, 'Social_2')
    return df

In [781]:
def floors_checking(df):
    df['HouseFloor'] = df['HouseFloor'].astype(int)
    df.loc[df['HouseFloor'] <= 0, 'HouseFloor'] = df['Floor'] + 1 # Не на последнем
    to_change = df['Floor'] > df['HouseFloor']
    df.loc[to_change, 'Floor'], df.loc[to_change, 'HouseFloor'] = df.loc[to_change, 'HouseFloor'], df.loc[to_change,'Floor']
    df['LastFloor'] = 0
    df.loc[df['Floor'] == df['HouseFloor'], 'LastFloor'] = 1
    df.loc[df['Floor'] == 1, 'LastFloor'] = 1
    return df

In [782]:
hc_stat1 = train.groupby(['DistrictId'], as_index=False)[['Healthcare_1']].mean().rename(columns={'Healthcare_1':'mean_hc_1'})
mean_hc_1 = train['Healthcare_1'].mean()

In [783]:
def clean_healthcare(df, hc_stat1, mean_hc_1):
    df['Helthcare_2'] = df['Helthcare_2'].astype(int)
    df = pd.merge(df, hc_stat1, on=['DistrictId'], how='left')
    df['mean_hc_1'] = df['mean_hc_1'].fillna(mean_hc_1)
    df['Healthcare_1'] = df['Healthcare_1'].fillna(df['mean_hc_1'])    
    return df

In [784]:
def prepare_data(df, stat1=stat1, stat2=stat2, mean_price=mean_price, hc_stat1=hc_stat1, mean_hc_1=mean_hc_1):
    df = clean_kitchen(df)
    df = clean_rooms(df)
    df = clean_square(df)
    df = clean_year(df)
    df = dummies_handling(df)
    df = add_mean_price(df, stat0, stat1, stat2, mean_price)
    df = floors_checking(df)
    df = clean_healthcare(df, hc_stat1, mean_hc_1)
    return df

### 4) Чистка данных

Я решил не удалять строки - всё равно из Теста их нельзя удалять

### 5) Применяем функции на трейне и валиде

In [785]:
train = prepare_data(train)
valid = prepare_data(valid)

In [766]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,mean_price_drs,mean_price_dr,mean_price_r,LastFloor
0,14604,23,1.0,41.68138,22.796166,8.0,14,17,2015,0.075779,6,1437,3,,0,2,88504.384965,0,1,0,1,0,1,116016.381624,102427.030975,160134.810901,0
1,5621,23,3.0,173.504222,161.504222,12.0,3,5,1977,0.014073,2,475,0,,0,0,207007.956663,0,1,0,1,0,1,185940.043115,165911.1297,290867.452543,0
2,235,87,1.0,39.710131,19.538663,8.0,4,17,1986,0.100456,43,7227,0,,1,6,182126.280899,0,1,0,1,1,0,169596.630515,169596.630515,160134.810901,0
3,16258,48,3.0,99.152802,98.152802,1.0,1,15,2017,0.041125,46,9515,5,,1,10,524365.550705,0,1,0,1,0,1,382424.639356,382424.639356,290867.452543,1
4,10773,77,3.0,79.195825,44.421062,10.0,16,17,1984,0.298205,16,4048,3,,1,3,322048.43399,0,1,0,1,0,1,322048.43399,251751.766701,290867.452543,0


In [600]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 0 to 6999
Data columns (total 27 columns):
Id                7000 non-null int64
DistrictId        7000 non-null int64
Rooms             7000 non-null float64
Square            7000 non-null float64
LifeSquare        7000 non-null float64
KitchenSquare     7000 non-null float64
Floor             7000 non-null int64
HouseFloor        7000 non-null int64
HouseYear         7000 non-null int64
Ecology_1         7000 non-null float64
Social_1          7000 non-null int64
Social_2          7000 non-null int64
Social_3          7000 non-null int64
Healthcare_1      3642 non-null float64
Helthcare_2       7000 non-null int32
Shops_1           7000 non-null int64
Price             7000 non-null float64
Ecology_2_A       7000 non-null uint8
Ecology_2_B       7000 non-null uint8
Ecology_3_A       7000 non-null uint8
Ecology_3_B       7000 non-null uint8
Shops_2_A         7000 non-null uint8
Shops_2_B         7000 non-null uint8
mean_

### 6) Строим модель, делаем предсказание на трейне и валиде

In [786]:
from sklearn.ensemble import RandomForestRegressor as RF

In [787]:
model = RF(n_estimators=250, max_depth=10, random_state=42)

In [788]:
feats = ['Rooms', 'Square', 'HouseYear', 'mean_price_drs']

In [789]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

### 7) Проверяем качество модели

In [790]:
from sklearn.metrics import r2_score

In [791]:
def pred_evaluate_model(df, model=model, feats=feats, calculate_r2='Yes'):
    pred = model.predict(df.loc[:, feats])
    if calculate_r2 == 'Yes':
        r2 = r2_score(df['Price'], pred)
        print(f'R2: {r2}')
    return pred

In [792]:
def evaluate_model(df, model=model, feats=feats):
    pred = model.predict(df.loc[:, feats])
    return r2_score(df['Price'], pred)

In [793]:
pred_train = pred_evaluate_model(train)

R2: 0.892663836745316


In [794]:
pred_valid = pred_evaluate_model(valid)

R2: 0.6407381725746282


### 7.5) Исследование, что лучше подойдёт

#### Линейная регрессия

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [30]:
def check_regression(feats, train=train, valid=valid, do_scalar=True):
    lr = LinearRegression()
    scaler = StandardScaler()
    X_train = train.loc[:, feats]
    y_train = train['Price']
    X_valid = valid.loc[:, feats]
    y_valid = valid['Price']
    if do_scalar:
        X_train = pd.DataFrame(data=scaler.fit_transform(X_train), columns=feats)
        X_valid = pd.DataFrame(data=scaler.transform(X_valid), columns=feats)
    lr.fit(X_train, y_train)
    X_train['Price'] = y_train
    X_valid['Price'] = y_valid
    return f"{evaluate_model(X_train, model=lr, feats=feats)}" \
           f"\t{evaluate_model(X_valid, model=lr, feats=feats)}" \
           f"\t{feats}\t{do_scalar}"

#### Случайный лес

In [795]:
def check_forest(feats, n_estimators, max_depth, train=train, valid=valid):
    model = RF(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(train.loc[:, feats], train['Price'])
    return f"{evaluate_model(train, model=model, feats=feats)}" \
           f"\t{evaluate_model(valid, model=model, feats=feats)}" \
           f"\t{n_estimators}\t{max_depth}"

In [808]:
def check_forests(feats, train=train, valid=valid, est_s=5, est_e=50, \
                  est_step=5, depth_s=5, depth_e=50, depth_step=5, random_state=42):
    for i in range(est_s, est_e, est_step):
        for j in range(depth_s, depth_e, depth_step):
            try:
                model = RF(n_estimators=i, max_depth=j, random_state=random_state)
                model.fit(train.loc[:, feats], train['Price'])
                print(f"{evaluate_model(train, model=model, feats=feats)}" \
                      f"\t{evaluate_model(valid, model=model, feats=feats)}" \
                      f"\t{i}\t{j}")
            except:
                print(f"Cant\t{i}\t{j}")    

#### Случайный лес с подбором фич

In [797]:
feats = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', \
         'HouseYear', 'Social_1', 'Ecology_2_B', 'Ecology_3_A', 'Shops_2_B', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=275, est_e=330, depth_s=11, depth_e=13, depth_step=1)

Excel RFIter

In [None]:
check_forests(feats, est_s=325, est_e=375, depth_s=11, depth_e=13, depth_step=1)

Excel RFIter (added below and sorted)

In [None]:
check_forests(feats, est_s=338, est_e=343, est_step=1, depth_s=12, depth_e=14, depth_step=1)

Excel RFIter (added below and sorted)

In [None]:
check_forests(feats, est_s=341, est_e=346, est_step=1, depth_s=12, depth_e=18, depth_step=1)

Excel RFIter (added below and sorted)

In [98]:
check_forests(feats, est_s=345, est_e=375, est_step=5, depth_s=15, depth_e=16, depth_step=1)

0.9531728110398567	0.6805549349908417	345	15
0.9531752881920362	0.6804439376253606	350	15
0.9531524511138972	0.6803394084849284	355	15
0.9531621562685387	0.6803443987485123	360	15
0.9531458671186916	0.6805329306402912	365	15
0.9530667472602377	0.6804382260325099	370	15


Excel RFIter (added below and sorted)

In [99]:
feats = ['DistrictId', 'Rooms', 'Square', 'KitchenSquare', \
         'Social_1', 'Ecology_3_A', 'Shops_2_B', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=300, est_e=375, est_step=2, depth_s=11, depth_e=18, depth_step=1)

Excel RFIter2

In [None]:
check_forests(feats, est_s=100, est_e=325, est_step=25, depth_s=11, depth_e=14, depth_step=1)

Excel RFIter2 (added below and sorted)

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=4, depth_e=14, depth_step=2)

Excel RFIter2 (added below and sorted)

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=8, depth_e=17, depth_step=1)

Excel RFIter2 (added below and sorted)

In [None]:
check_forests(feats, est_s=125, est_e=175, est_step=2, depth_s=10, depth_e=13, depth_step=1)

Excel RFIter2 (added below and sorted)

In [106]:
feats = ['DistrictId', 'Rooms', 'Square', 'KitchenSquare', \
         'Social_2', 'Ecology_3_A', 'Shops_1', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=10, depth_s=8, depth_e=17, depth_step=1)

Excel RFIter3

In [108]:
feats = ['Square', 'KitchenSquare', \
         'Social_2', 'Ecology_3_A', 'Shops_1', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=8, depth_e=17, depth_step=2)

Excel RFIter4

In [110]:
feats = ['Square', 'KitchenSquare', 'Helthcare_2', \
         'Social_2', 'Ecology_3_A', 'Shops_1', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=8, depth_e=17, depth_step=2)

Excel RFIter4

In [137]:
train['Helthcare_2'] = train['Helthcare_2'].astype(int)
valid['Helthcare_2'] = valid['Helthcare_2'].astype(int)

In [142]:
feats = ['DistrictId', 'Rooms', 'Floor', 'HouseFloor', 'Square', 'KitchenSquare', 'Helthcare_2',\
         'Social_2', 'Ecology_3_A', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=8, depth_e=17, depth_step=2)

Excel RFIter5

In [146]:
feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1',\
         'Social_1', 'Social_2', 'Ecology_3_A', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=25, est_e=375, est_step=25, depth_s=8, depth_e=17, depth_step=2)

Excel RFIter6

In [433]:
feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1',\
         'Social_1', 'Social_2', 'Ecology_3_A', 'LastFloor', 'mean_price_dr']

In [None]:
check_forests(feats, est_s=146, est_e=155, est_step=1, depth_s=7, depth_e=11, depth_step=1)

Excel RFIter7

In [437]:
feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1',\
         'Social_1', 'Social_2', 'Ecology_3_A', 'LastFloor', 'mean_price_dr', 'KitchenSquare']

In [None]:
check_forests(feats, est_s=148, est_e=153, est_step=1, depth_s=9, depth_e=15, depth_step=1)

Excel RFIter8

In [798]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 0 to 6999
Data columns (total 28 columns):
Id                7000 non-null int64
DistrictId        7000 non-null int64
Rooms             7000 non-null float64
Square            7000 non-null float64
LifeSquare        7000 non-null float64
KitchenSquare     7000 non-null float64
Floor             7000 non-null int64
HouseFloor        7000 non-null int64
HouseYear         7000 non-null int64
Ecology_1         7000 non-null float64
Social_1          7000 non-null int64
Social_2          7000 non-null int64
Social_3          7000 non-null int64
Healthcare_1      7000 non-null float64
Helthcare_2       7000 non-null int32
Shops_1           7000 non-null int64
Price             7000 non-null float64
Ecology_2_A       7000 non-null uint8
Ecology_2_B       7000 non-null uint8
Ecology_3_A       7000 non-null uint8
Ecology_3_B       7000 non-null uint8
Shops_2_A         7000 non-null uint8
Shops_2_B         7000 non-null uint8
mean_

In [835]:
feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_A',\
         'Social_1', 'Social_3', 'Ecology_1',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear', 'Healthcare_1']

In [None]:
check_forests(feats, est_s=175, est_e=276, est_step=25, depth_s=11, depth_e=16, depth_step=1)

Excel RFIter9

In [None]:
check_forests(feats, est_s=175, est_e=225, est_step=5, depth_s=15, depth_e=18, depth_step=1)

Excel RFIter9

In [None]:
check_forests(feats, est_s=214, est_e=217, est_step=1, depth_s=17, depth_e=19, depth_step=1)

Excel RFIter9

#### Ручной подбор фич

feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_A',\
         'Social_1', 'Social_3', 'Ecology_1',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear', 'Healthcare_1']

In [836]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8734208878058171	0.713591705977626	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_A',\
         'Social_1', 'Social_3', 'Ecology_1', 'DistrictId',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear', 'Healthcare_1']

In [824]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8789640025604452	0.7134197255553563	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1', 'DistrictId', 'Social_3',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear', 'Healthcare_1']

In [800]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8813056049408263	0.7134198205050545	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1', 'DistrictId', 'Social_3',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear']

In [524]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8816980999545266	0.710457715214928	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1', 'DistrictId',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear']

In [522]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8662344089565438	0.6961740933000424	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1',\
         'KitchenSquare', 'HouseFloor', 'Floor', 'HouseYear']

In [518]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8513517738282208	0.693852472161564	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1',\
         'KitchenSquare', 'HouseFloor', 'Floor']

In [516]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8492888951425899	0.6912579590926415	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1',\
         'KitchenSquare', 'HouseFloor']

In [510]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8471006566800421	0.6908330804932807	150	11


feats = ['Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1',\
         'KitchenSquare']

In [504]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8442271978116513	0.688933380618957	150	11


feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'Ecology_1',\
         'KitchenSquare']

In [502]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.856922951759735	0.6870310827791447	150	11


feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'LastFloor', 'Ecology_1',\
         'KitchenSquare']

In [500]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8575149055570301	0.6862194160166801	150	11


feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1', 'Shops_2_B',\
         'Social_1', 'Social_2', 'LastFloor', 'Ecology_1',\
         'mean_price_dr', 'KitchenSquare']

In [490]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.9004239803768026	0.6786840246044776	150	11


feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1',\
         'Social_1', 'Social_2', 'LastFloor', 'Ecology_1',\
         'mean_price_dr', 'KitchenSquare']

In [470]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.9003135920111034	0.6786419691139831	150	11


feats = ['DistrictId', 'Rooms', 'Square', 'Helthcare_2', 'Shops_1',\
         'Social_1', 'Social_2', 'LastFloor', 'Ecology_3_B', 'Ecology_2_B', 'Ecology_1',\
         'mean_price_dr', 'KitchenSquare']

In [468]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.9001535612485546	0.677772636674417	150	11


In [460]:
check_forests(feats, est_s=150, est_e=151, est_step=1, depth_s=11, depth_e=12, depth_step=1)

0.8993802647178427	0.6767204545801306	150	11


In [None]:
check_forests(feats, est_s=148, est_e=153, est_step=1, depth_s=9, depth_e=15, depth_step=1)

In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [58]:
parameters = [{'n_estimators': [150, 200, 250], 
               'max_features': np.arange(5, 9),
               'max_depth': np.arange(5, 10)}]

In [66]:
clf = GridSearchCV(estimator=RF(random_state=100), 
                   param_grid=parameters,
                   cv=5)

In [67]:
lab_enc = preprocessing.LabelEncoder()
training_scores_encoded = lab_enc.fit_transform(train.loc[:, 'Price'])

In [68]:
clf.fit(train.drop(columns='Price'), training_scores_encoded)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [150, 200, 250], 'max_features': array([5, 6, 7, 8]), 'max_depth': array([5, 6, 7, 8, 9])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [69]:
clf.best_params_

{'max_depth': 9, 'max_features': 8, 'n_estimators': 250}

In [70]:
parameters = [{'n_estimators': [250, 300, 350], 
               'max_features': np.arange(8, 11),
               'max_depth': np.arange(9, 12)}]

In [73]:
clf2 = GridSearchCV(estimator=RF(random_state=100), 
                   param_grid=parameters,
                   cv=5)

In [74]:
clf2.fit(train.drop(columns='Price'), training_scores_encoded)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [250, 300, 350], 'max_features': array([ 8,  9, 10]), 'max_depth': array([ 9, 10, 11])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [75]:
clf2.best_params_

{'max_depth': 11, 'max_features': 9, 'n_estimators': 300}

In [78]:
parameters3 = [{'n_estimators': [275, 300, 325], 
               'max_features': [9],
               'max_depth': np.arange(11, 15)}]

In [79]:
clf3 = GridSearchCV(estimator=RF(random_state=100), 
                   param_grid=parameters3,
                   cv=5)

In [80]:
clf3.fit(train.drop(columns='Price'), training_scores_encoded)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [275, 300, 325], 'max_features': [9], 'max_depth': array([11, 12, 13, 14])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [81]:
clf3.best_params_

{'max_depth': 11, 'max_features': 9, 'n_estimators': 300}

In [89]:
clf3.scorer_()

TypeError: _passthrough_scorer() missing 1 required positional argument: 'estimator'

### Подбор фич

In [36]:
possible_feats = list(train.columns.values)
possible_feats.remove('Price')
possible_feats.remove('Id')

In [105]:
train.corr()['Price']

Id               0.019332
DistrictId       0.264105
Rooms            0.570405
Square           0.107811
LifeSquare       0.068150
KitchenSquare    0.020301
Floor            0.114788
HouseFloor       0.109997
HouseYear        0.035236
Ecology_1       -0.064129
Social_1         0.265278
Social_2         0.240089
Social_3         0.076234
Helthcare_2      0.253601
Shops_1          0.183545
Price            1.000000
mean_price_dr    0.813599
mean_price_r     0.575864
Ecology_2_A     -0.030568
Ecology_2_B      0.030568
Ecology_3_A      0.051992
Ecology_3_B     -0.051992
Shops_2_A       -0.061129
Shops_2_B        0.061129
LastFloor       -0.005087
Name: Price, dtype: float64

In [39]:
possible_feats = train.corr()['Price'][abs(train.corr()['Price']) > 0.25].index
possible_feats = list(possible_feats)
possible_feats.remove('Price')

In [40]:
print(possible_feats)

['DistrictId', 'Rooms', 'Social_1', 'Helthcare_2', 'mean_price_dr', 'mean_price_r']


In [43]:
possible_feats.append('Square')
possible_feats.append('LifeSquare')
possible_feats.append('Floor')
possible_feats.append('LastFloor')

#### PCA

In [47]:
from sklearn.decomposition import PCA

In [48]:
def do_pca(pca_comp, train=train, valid=valid):
    pca_feats = list('abcdefghijklmnopqrstuvwxyz'[0:pca_comp])
    pca = PCA(n_components=pca_comp)
    fit = pca.fit(train.drop(columns='Price'))
    X_train_pca = fit.transform(train.drop(columns='Price'))
    X_train_pca = pd.DataFrame(X_train_pca, columns=pca_feats)
    X_train_pca['Price'] = train['Price']
    X_valid_pca = fit.transform(valid.drop(columns='Price'))
    X_valid_pca = pd.DataFrame(X_valid_pca, columns=pca_feats)
    X_valid_pca['Price'] = valid.loc[:, 'Price']
    check_forests(pca_feats, train=X_train_pca, valid=X_valid_pca)    

In [None]:
for i in range(5, 15, 2):
    print(f"{i} - is current number of components")
    #do_pca(i)

### 8) Применяем функции обработки данных, чистим тестовые данные

In [None]:
test = pd.read_csv('input/test.csv')
test = prepare_data(test)

### 9) Делаем предсказание на тесте

In [None]:
test['Price'] = pred_evaluate_model(test, calculate_r2='No')

### 10) Записываем предсказания в файл

In [None]:
test.loc[:, ['Id', 'Price']].to_csv('VVashchenkov_predictions.csv', index=None)