In [5]:
import pandas as pd
import numpy as np
import json 
import os
import requests
import seaborn as sns

from pprint import pprint
from bs4 import BeautifulSoup 

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate, learning_curve
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.metrics import mean_absolute_error
# from sklearn.metrics import mean_absolute_percentage_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [6]:
%pylab inline

pd.set_option('display.max_rows', 70) # выведем больше строк
pd.set_option('display.max_columns', 30) # выведем больше колонок

Populating the interactive namespace from numpy and matplotlib


In [20]:
def num_opt(row):
    if row['sample']==1:
        return train_num_opt(row.Комплектация)
    if row['sample']==0:
        return test_num_opt(row.Комплектация)

def test_num_opt(config):
    decod_config = json.loads(config[2:-2]) if config != '[]' else []
    return sum([len(opt_cat['values']) for opt_cat in decod_config])

def train_num_opt(config):
    point = "'available_options': "
    start = config.find(point)+len(point)+1
    finish = config.find("]",start)
    return len(config[start:finish].split(", "))



def preproc_sfdata(data):
    import re
    
    df = data.copy().drop(columns=['id','start_date', 'hidden', 'model','engineDisplacement'])
    
    cat_cols = ['bodyType','color','fuelType','name','numberOfDoors','vehicleConfiguration',
            'vehicleTransmission','Привод','Владельцы','ПТС']
    
    # bodyType
    df.bodyType = df.bodyType.dropna().apply(lambda x: x.split()[0]).str.lower()
    
    # brand
    df.drop(columns = ['brand'],inplace=True)
    
    # color
    color_codes = {
    '040001': 'чёрный',
    'FAFBFB': 'белый', 
    '0000CC': 'синий', 
    '200204': 'коричневый', 
    'EE1D19': 'красный', 
    'CACECB': 'серый',
    'C49648': 'бежевый', 
    '97948F': 'серебристый', 
    'FFD600': 'золотистый', 
    'FF8649': 'оранжевый', 
    '22A0F8': 'голубой',
    'FFC0CB': 'пурпурный', 
    'DEA522': 'жёлтый', 
    '007F00': 'зелёный', 
    '660099': 'фиолетовый',
    '4A2197': 'фиолетовый'}
    df.color = df.color.map(color_codes)
    
    # fuelType
    to_drop = df.index[df.fuelType=='универсал']
    df.drop(index=to_drop,inplace=True)
    
    # modelDate
    df.modelDate = df.modelDate.dropna().astype(int)
    
    # name
    df.name = df.name.apply(lambda x: x[:x.find(' ')])
    
    # numberOfDoors
    df.numberOfDoors = df.numberOfDoors.astype(int)
    
    # productionDate
    data.productionDate = data.productionDate.astype(int)

    
    # vehicleConfiguration
    df.vehicleConfiguration = df.vehicleConfiguration.str.split().apply(lambda x:x[0])
    
    # vehicleTransmission
    transmission_dict = {
    'автоматическая':'автоматическая',
    'механическая':'механическая',
    'роботизированная':'роботизированная',
    'MECHANICAL':'механическая', 
    'AUTOMATIC':'автоматическая', 
    'ROBOT':'роботизированная', 
    'VARIATOR':'автоматическая'}
    df.vehicleTransmission = df.vehicleTransmission.map(transmission_dict)
    
    # engineDisplacement
#     df.engineDisplacement = df.engineDisplacement.apply(lambda x: x.split()[0])
    
    # enginePower
    df.enginePower = df.enginePower.apply(lambda x: int(x[:-4]) if type(x)==str else int(x)) 

    # description
    df.description = df.description.str.len().fillna(-999)
    
    # mileage
    df.mileage = df.mileage 
    
    # Комплектация
    df.Комплектация = df.apply(num_opt,axis=1)
    
    # Привод
#     df.Привод = pd.get_dummies(df.Привод)
    
    # Руль
    df.drop(columns = ['Руль'],inplace=True)
    
    # Состояние
    df.drop(columns = ['Состояние'],inplace=True)
    
    # Владельцы
    df.Владельцы = df.Владельцы.fillna(0).apply(lambda x: int(x[0]) if type(x) == str else int(x))
    
    # ПТС
    owner_dict = {
    'Оригинал':'Оригинал',
    'Дубликат':'Дубликат',
    'ORIGINAL':'Оригинал',
    'DUPLICATE':'Дубликат'}
    df.ПТС = df.ПТС.fillna('Оригинал').map(owner_dict)
    
    # Таможня
    df.drop(columns = ['Таможня'],inplace=True)
    
    # Владение
    df['ownrshp_know'] = df.Владение.notna().astype(int)
    pattern_yaer = re.compile('\d+(?= (?:год|лет))')
    pattern_month = re.compile('\d+(?= мес)')
    year = lambda x: int(pattern_yaer.search(x).group(0)) if pattern_yaer.search(x) else 0
    month = lambda x: int(pattern_month.search(x).group(0)) if pattern_month.search(x) else 0
    df.Владение = df.Владение.apply(lambda x: year(x)*12 + month(x) if x==x else x).fillna(-999.)
    
    # dummy-кодирование категориальных признаков
    df = pd.get_dummies(df,columns=cat_cols)
    
    return df

def validation(X,y,model):
    '''
    Валидация модели
    '''
    model = model
#     cv_results = cross_validate(model, X, y, scoring='neg_mean_absolute_percentage_error', cv=3,)
    cv_results = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=3,)    
    return -cv_results['test_score'].mean()

In [21]:
RANDOM_SEED = 42

In [22]:
train = pd.read_csv('./Project_5_data/all_auto_ru_09_09_2020.csv')
test = pd.read_csv('./Project_5_data/test.csv')

# y = train.price
# train.drop(columns = ['price'])

# удаление отброшенных наблюдений
train.dropna(subset=['price'],inplace=True)
train.dropna(subset=['bodyType'],inplace=True)

train['sample'] = 1  # помечаем где у нас трейн
test['sample'] = 0   # помечаем где у нас тест
test['price'] = np.NaN  # в тесте нет значения price -  пока просто заполняем np.NaN

train_reduced = train.loc[train.brand=='BMW']

data = test.append(train_reduced, sort=False).reset_index(drop=True) # объединяем
# big_data = test.append(train, sort=False).reset_index(drop=True) # объединяем

In [23]:
train.columns

Index(['bodyType', 'brand', 'color', 'fuelType', 'modelDate', 'name',
       'numberOfDoors', 'productionDate', 'vehicleConfiguration',
       'vehicleTransmission', 'engineDisplacement', 'enginePower',
       'description', 'mileage', 'Комплектация', 'Привод', 'Руль', 'Состояние',
       'Владельцы', 'ПТС', 'Таможня', 'Владение', 'price', 'start_date',
       'hidden', 'model', 'sample'],
      dtype='object')

In [24]:
X = preproc_sfdata(data)

### Baseline
Обучим на подготовленных данных модель LogisticRegression() спараметрами по умолчанию и получим значение метрики. Мы будем рассматривать эту модель как первую итерацию, первое приближение которое нужно улучшить.

In [25]:
X_train = X[X['sample']==1].drop(columns=['sample','price'])
X_test = X[X['sample']==0].drop(columns=['sample','price'])

y_train = X[X['sample']==1].price

In [26]:
validation(X_train, y_train, RandomForestRegressor())



461089.21286554536

In [27]:
validation(X_train, y_train, GradientBoostingRegressor())

436497.98194508004

In [28]:
validation(X_train, y_train, AdaBoostRegressor())

898941.7227620971

In [29]:
validation(X_train, y_train, XGBRegressor())

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




438209.2355100136

In [30]:
validation(X_train, y_train, CatBoostRegressor())

Learning rate set to 0.05181
0:	learn: 2288889.7973948	total: 1.77ms	remaining: 1.77s
1:	learn: 2190146.7420624	total: 3.35ms	remaining: 1.67s
2:	learn: 2100442.8263745	total: 4.9ms	remaining: 1.63s
3:	learn: 2013183.3489542	total: 6.46ms	remaining: 1.61s
4:	learn: 1932589.4355999	total: 8.01ms	remaining: 1.59s
5:	learn: 1856350.1965514	total: 9.54ms	remaining: 1.58s
6:	learn: 1779487.4572632	total: 11.2ms	remaining: 1.59s
7:	learn: 1707371.4894954	total: 13ms	remaining: 1.62s
8:	learn: 1635709.7617404	total: 14.7ms	remaining: 1.61s
9:	learn: 1572755.8801832	total: 16.2ms	remaining: 1.6s
10:	learn: 1510699.0264275	total: 17.8ms	remaining: 1.6s
11:	learn: 1453000.8389328	total: 19.3ms	remaining: 1.59s
12:	learn: 1395619.6969706	total: 20.9ms	remaining: 1.59s
13:	learn: 1342481.5786235	total: 22.5ms	remaining: 1.58s
14:	learn: 1295204.7413103	total: 24ms	remaining: 1.58s
15:	learn: 1249786.9675634	total: 25.6ms	remaining: 1.58s
16:	learn: 1204734.2710009	total: 27.4ms	remaining: 1.58s
17

202:	learn: 375404.5457286	total: 328ms	remaining: 1.29s
203:	learn: 373556.1355494	total: 330ms	remaining: 1.29s
204:	learn: 373342.9411582	total: 331ms	remaining: 1.28s
205:	learn: 372688.9462671	total: 333ms	remaining: 1.28s
206:	learn: 372228.4438285	total: 335ms	remaining: 1.28s
207:	learn: 371751.1291426	total: 336ms	remaining: 1.28s
208:	learn: 371492.1393729	total: 338ms	remaining: 1.28s
209:	learn: 370322.2015809	total: 340ms	remaining: 1.28s
210:	learn: 369543.1566783	total: 342ms	remaining: 1.28s
211:	learn: 368678.0753215	total: 343ms	remaining: 1.28s
212:	learn: 368267.0509798	total: 345ms	remaining: 1.27s
213:	learn: 367873.8544636	total: 347ms	remaining: 1.27s
214:	learn: 366475.0400955	total: 348ms	remaining: 1.27s
215:	learn: 366115.5322980	total: 350ms	remaining: 1.27s
216:	learn: 365643.6950984	total: 352ms	remaining: 1.27s
217:	learn: 364809.8712246	total: 353ms	remaining: 1.27s
218:	learn: 364561.8348551	total: 357ms	remaining: 1.27s
219:	learn: 364160.3553566	tota

406:	learn: 300618.7290730	total: 662ms	remaining: 964ms
407:	learn: 300453.2630770	total: 663ms	remaining: 962ms
408:	learn: 300198.0323706	total: 665ms	remaining: 961ms
409:	learn: 299751.9891827	total: 667ms	remaining: 959ms
410:	learn: 299687.5862261	total: 668ms	remaining: 958ms
411:	learn: 299433.8260612	total: 670ms	remaining: 956ms
412:	learn: 299310.0999700	total: 671ms	remaining: 954ms
413:	learn: 299120.0541532	total: 673ms	remaining: 953ms
414:	learn: 299076.5309961	total: 675ms	remaining: 951ms
415:	learn: 298606.1376755	total: 676ms	remaining: 949ms
416:	learn: 297941.7140938	total: 678ms	remaining: 948ms
417:	learn: 297544.2614718	total: 697ms	remaining: 971ms
418:	learn: 297382.8147158	total: 705ms	remaining: 978ms
419:	learn: 297270.9987766	total: 710ms	remaining: 981ms
420:	learn: 297087.1904211	total: 714ms	remaining: 983ms
421:	learn: 296535.0482383	total: 718ms	remaining: 984ms
422:	learn: 296452.3249778	total: 721ms	remaining: 984ms
423:	learn: 295948.5864774	tota

587:	learn: 265436.6279926	total: 995ms	remaining: 697ms
588:	learn: 265337.4605417	total: 997ms	remaining: 696ms
589:	learn: 265284.5948973	total: 998ms	remaining: 694ms
590:	learn: 265147.3396373	total: 1000ms	remaining: 692ms
591:	learn: 265078.0323957	total: 1s	remaining: 690ms
592:	learn: 265035.0656484	total: 1s	remaining: 688ms
593:	learn: 264966.9348241	total: 1s	remaining: 687ms
594:	learn: 264825.9013321	total: 1.01s	remaining: 685ms
595:	learn: 264749.5980796	total: 1.01s	remaining: 684ms
596:	learn: 264329.9508138	total: 1.01s	remaining: 683ms
597:	learn: 263957.9037918	total: 1.01s	remaining: 681ms
598:	learn: 263750.2036491	total: 1.01s	remaining: 679ms
599:	learn: 263657.1915453	total: 1.02s	remaining: 678ms
600:	learn: 263583.2769479	total: 1.02s	remaining: 676ms
601:	learn: 263565.3463527	total: 1.02s	remaining: 674ms
602:	learn: 263523.5675352	total: 1.02s	remaining: 672ms
603:	learn: 263389.0113134	total: 1.02s	remaining: 671ms
604:	learn: 263163.5977895	total: 1.02s

789:	learn: 241186.5909987	total: 1.32s	remaining: 352ms
790:	learn: 241177.0802221	total: 1.33s	remaining: 350ms
791:	learn: 241122.7996920	total: 1.33s	remaining: 349ms
792:	learn: 240952.8773041	total: 1.33s	remaining: 347ms
793:	learn: 240887.5578443	total: 1.33s	remaining: 345ms
794:	learn: 240728.3618897	total: 1.33s	remaining: 344ms
795:	learn: 240719.5362010	total: 1.33s	remaining: 342ms
796:	learn: 240675.4754776	total: 1.33s	remaining: 340ms
797:	learn: 240626.3087718	total: 1.34s	remaining: 339ms
798:	learn: 240561.0718924	total: 1.34s	remaining: 337ms
799:	learn: 240523.1319528	total: 1.34s	remaining: 335ms
800:	learn: 240303.6283366	total: 1.34s	remaining: 334ms
801:	learn: 240282.4073279	total: 1.34s	remaining: 332ms
802:	learn: 240177.6503400	total: 1.35s	remaining: 330ms
803:	learn: 240069.1706948	total: 1.35s	remaining: 329ms
804:	learn: 239635.0687116	total: 1.35s	remaining: 327ms
805:	learn: 239554.7463243	total: 1.35s	remaining: 325ms
806:	learn: 239479.3366351	tota

994:	learn: 222007.7363887	total: 1.66s	remaining: 8.33ms
995:	learn: 221879.6151057	total: 1.66s	remaining: 6.66ms
996:	learn: 221841.0569453	total: 1.66s	remaining: 5ms
997:	learn: 221833.3746657	total: 1.66s	remaining: 3.33ms
998:	learn: 221778.0789974	total: 1.66s	remaining: 1.67ms
999:	learn: 221751.7438170	total: 1.67s	remaining: 0us
Learning rate set to 0.05181
0:	learn: 2048604.0377758	total: 1.91ms	remaining: 1.91s
1:	learn: 1963210.8227706	total: 3.68ms	remaining: 1.83s
2:	learn: 1880664.6205089	total: 5.22ms	remaining: 1.73s
3:	learn: 1801019.5878975	total: 7.04ms	remaining: 1.75s
4:	learn: 1724919.5994568	total: 8.73ms	remaining: 1.74s
5:	learn: 1650021.6596789	total: 10.3ms	remaining: 1.71s
6:	learn: 1579576.6604840	total: 11.9ms	remaining: 1.68s
7:	learn: 1512966.5521007	total: 13.5ms	remaining: 1.67s
8:	learn: 1450808.2480741	total: 15.1ms	remaining: 1.66s
9:	learn: 1391265.4729824	total: 16.6ms	remaining: 1.65s
10:	learn: 1333513.2737418	total: 18.4ms	remaining: 1.65s
1

163:	learn: 291585.8453399	total: 265ms	remaining: 1.35s
164:	learn: 291378.8266269	total: 266ms	remaining: 1.35s
165:	learn: 290896.7357698	total: 268ms	remaining: 1.34s
166:	learn: 290559.4483214	total: 269ms	remaining: 1.34s
167:	learn: 290297.1407350	total: 271ms	remaining: 1.34s
168:	learn: 290052.7609267	total: 273ms	remaining: 1.34s
169:	learn: 289618.2278900	total: 275ms	remaining: 1.34s
170:	learn: 289433.1225279	total: 277ms	remaining: 1.34s
171:	learn: 289115.1672524	total: 278ms	remaining: 1.34s
172:	learn: 288936.3875932	total: 280ms	remaining: 1.34s
173:	learn: 288708.3373498	total: 281ms	remaining: 1.33s
174:	learn: 288507.3157692	total: 283ms	remaining: 1.33s
175:	learn: 288256.3074349	total: 284ms	remaining: 1.33s
176:	learn: 287567.6325860	total: 286ms	remaining: 1.33s
177:	learn: 286575.2500175	total: 288ms	remaining: 1.33s
178:	learn: 285880.4003511	total: 289ms	remaining: 1.32s
179:	learn: 285450.4742024	total: 291ms	remaining: 1.32s
180:	learn: 285277.0359149	tota

372:	learn: 235508.6300436	total: 595ms	remaining: 1s
373:	learn: 235396.1001270	total: 597ms	remaining: 1000ms
374:	learn: 235181.3159962	total: 599ms	remaining: 998ms
375:	learn: 235137.1184064	total: 601ms	remaining: 997ms
376:	learn: 234973.1161717	total: 602ms	remaining: 995ms
377:	learn: 234838.7808318	total: 604ms	remaining: 994ms
378:	learn: 234449.8320116	total: 606ms	remaining: 993ms
379:	learn: 234275.1517296	total: 608ms	remaining: 992ms
380:	learn: 234237.1709562	total: 609ms	remaining: 990ms
381:	learn: 234110.7025435	total: 611ms	remaining: 988ms
382:	learn: 233927.1734281	total: 613ms	remaining: 987ms
383:	learn: 233742.1851311	total: 614ms	remaining: 985ms
384:	learn: 233621.2817540	total: 616ms	remaining: 984ms
385:	learn: 233296.6884113	total: 618ms	remaining: 983ms
386:	learn: 233153.9918966	total: 619ms	remaining: 981ms
387:	learn: 233016.0241032	total: 621ms	remaining: 979ms
388:	learn: 232753.9104459	total: 622ms	remaining: 978ms
389:	learn: 232717.6973040	total:

579:	learn: 208752.2852212	total: 929ms	remaining: 673ms
580:	learn: 208654.9941820	total: 931ms	remaining: 672ms
581:	learn: 208617.5154791	total: 933ms	remaining: 670ms
582:	learn: 208258.7068222	total: 934ms	remaining: 668ms
583:	learn: 208060.0663216	total: 936ms	remaining: 667ms
584:	learn: 207877.3834142	total: 937ms	remaining: 665ms
585:	learn: 207684.2132058	total: 940ms	remaining: 664ms
586:	learn: 207446.6603544	total: 941ms	remaining: 662ms
587:	learn: 207437.0398736	total: 943ms	remaining: 661ms
588:	learn: 207380.8979634	total: 945ms	remaining: 659ms
589:	learn: 207365.9085365	total: 947ms	remaining: 658ms
590:	learn: 207145.1391226	total: 949ms	remaining: 656ms
591:	learn: 206983.9223966	total: 950ms	remaining: 655ms
592:	learn: 206642.3123560	total: 952ms	remaining: 653ms
593:	learn: 206632.9359568	total: 954ms	remaining: 652ms
594:	learn: 206495.1753560	total: 955ms	remaining: 650ms
595:	learn: 206487.3377739	total: 957ms	remaining: 649ms
596:	learn: 206398.4097849	tota

780:	learn: 190793.8336602	total: 1.26s	remaining: 354ms
781:	learn: 190784.6933176	total: 1.26s	remaining: 352ms
782:	learn: 190685.3842201	total: 1.26s	remaining: 351ms
783:	learn: 190567.3767027	total: 1.27s	remaining: 349ms
784:	learn: 190527.1631003	total: 1.27s	remaining: 348ms
785:	learn: 190462.4626414	total: 1.27s	remaining: 346ms
786:	learn: 190356.2842943	total: 1.27s	remaining: 344ms
787:	learn: 190337.2258784	total: 1.27s	remaining: 343ms
788:	learn: 190250.8056466	total: 1.27s	remaining: 341ms
789:	learn: 190246.4906976	total: 1.28s	remaining: 340ms
790:	learn: 190233.7220706	total: 1.28s	remaining: 338ms
791:	learn: 190128.6803939	total: 1.28s	remaining: 336ms
792:	learn: 190076.8944059	total: 1.28s	remaining: 335ms
793:	learn: 190044.9252264	total: 1.28s	remaining: 333ms
794:	learn: 189989.6284809	total: 1.28s	remaining: 332ms
795:	learn: 189901.0269597	total: 1.29s	remaining: 330ms
796:	learn: 189896.8679798	total: 1.29s	remaining: 328ms
797:	learn: 189808.3084056	tota

986:	learn: 177270.6295091	total: 1.59s	remaining: 20.9ms
987:	learn: 177250.7443803	total: 1.59s	remaining: 19.3ms
988:	learn: 177234.3264700	total: 1.59s	remaining: 17.7ms
989:	learn: 177134.0959256	total: 1.59s	remaining: 16.1ms
990:	learn: 177103.5195061	total: 1.6s	remaining: 14.5ms
991:	learn: 177100.0797801	total: 1.6s	remaining: 12.9ms
992:	learn: 177042.7224635	total: 1.6s	remaining: 11.3ms
993:	learn: 177031.5053941	total: 1.6s	remaining: 9.67ms
994:	learn: 177006.2194066	total: 1.6s	remaining: 8.06ms
995:	learn: 176960.5045040	total: 1.6s	remaining: 6.45ms
996:	learn: 176932.5085202	total: 1.61s	remaining: 4.83ms
997:	learn: 176905.1503681	total: 1.61s	remaining: 3.22ms
998:	learn: 176860.3028551	total: 1.61s	remaining: 1.61ms
999:	learn: 176848.7730741	total: 1.61s	remaining: 0us
Learning rate set to 0.05181
0:	learn: 1678223.3381652	total: 2.58ms	remaining: 2.58s
1:	learn: 1614629.4497089	total: 4.85ms	remaining: 2.42s
2:	learn: 1553640.3027014	total: 7.27ms	remaining: 2.4

135:	learn: 337447.0679523	total: 228ms	remaining: 1.45s
136:	learn: 336835.3182745	total: 229ms	remaining: 1.45s
137:	learn: 335869.6211283	total: 231ms	remaining: 1.44s
138:	learn: 335215.6839267	total: 235ms	remaining: 1.45s
139:	learn: 334608.3637942	total: 236ms	remaining: 1.45s
140:	learn: 334430.9308954	total: 238ms	remaining: 1.45s
141:	learn: 333022.2711584	total: 240ms	remaining: 1.45s
142:	learn: 332716.0163292	total: 241ms	remaining: 1.45s
143:	learn: 331201.4318921	total: 243ms	remaining: 1.45s
144:	learn: 330416.0348917	total: 245ms	remaining: 1.44s
145:	learn: 330068.8793470	total: 247ms	remaining: 1.44s
146:	learn: 329646.6273874	total: 248ms	remaining: 1.44s
147:	learn: 328981.5801582	total: 250ms	remaining: 1.44s
148:	learn: 328289.6341760	total: 251ms	remaining: 1.44s
149:	learn: 327702.4933324	total: 253ms	remaining: 1.43s
150:	learn: 327123.1853451	total: 255ms	remaining: 1.43s
151:	learn: 326666.8600773	total: 256ms	remaining: 1.43s
152:	learn: 325803.4941219	tota

333:	learn: 248029.2835458	total: 561ms	remaining: 1.12s
334:	learn: 247866.0430682	total: 563ms	remaining: 1.12s
335:	learn: 247471.5861138	total: 564ms	remaining: 1.11s
336:	learn: 247275.8994895	total: 566ms	remaining: 1.11s
337:	learn: 246935.1661172	total: 568ms	remaining: 1.11s
338:	learn: 246647.0540218	total: 569ms	remaining: 1.11s
339:	learn: 246475.7634371	total: 571ms	remaining: 1.11s
340:	learn: 245975.3933602	total: 573ms	remaining: 1.11s
341:	learn: 245491.6000515	total: 575ms	remaining: 1.1s
342:	learn: 245251.6464275	total: 577ms	remaining: 1.1s
343:	learn: 245116.9652214	total: 578ms	remaining: 1.1s
344:	learn: 244672.4281471	total: 580ms	remaining: 1.1s
345:	learn: 244375.2772274	total: 582ms	remaining: 1.1s
346:	learn: 244189.4573484	total: 583ms	remaining: 1.1s
347:	learn: 244015.6638645	total: 586ms	remaining: 1.1s
348:	learn: 243828.1178820	total: 588ms	remaining: 1.1s
349:	learn: 243738.2957522	total: 589ms	remaining: 1.09s
350:	learn: 243611.3497284	total: 591ms

510:	learn: 217999.7367422	total: 887ms	remaining: 849ms
511:	learn: 217744.8645148	total: 889ms	remaining: 847ms
512:	learn: 217695.6758662	total: 890ms	remaining: 845ms
513:	learn: 217630.4410965	total: 892ms	remaining: 844ms
514:	learn: 217611.2508277	total: 894ms	remaining: 842ms
515:	learn: 217552.7643346	total: 896ms	remaining: 840ms
516:	learn: 217497.3867320	total: 898ms	remaining: 839ms
517:	learn: 217442.3100097	total: 899ms	remaining: 837ms
518:	learn: 217389.4448827	total: 901ms	remaining: 835ms
519:	learn: 217254.4862048	total: 903ms	remaining: 833ms
520:	learn: 217210.0956030	total: 904ms	remaining: 831ms
521:	learn: 217180.3048171	total: 906ms	remaining: 829ms
522:	learn: 217122.3307257	total: 908ms	remaining: 828ms
523:	learn: 216760.5832931	total: 910ms	remaining: 827ms
524:	learn: 216743.0629730	total: 912ms	remaining: 825ms
525:	learn: 216611.8337466	total: 913ms	remaining: 823ms
526:	learn: 216555.2689483	total: 915ms	remaining: 821ms
527:	learn: 216266.7457766	tota

711:	learn: 194205.6821897	total: 1.22s	remaining: 494ms
712:	learn: 194125.0559239	total: 1.22s	remaining: 492ms
713:	learn: 194073.5654574	total: 1.22s	remaining: 490ms
714:	learn: 194063.7164060	total: 1.23s	remaining: 489ms
715:	learn: 193994.7942422	total: 1.23s	remaining: 487ms
716:	learn: 193901.5486151	total: 1.23s	remaining: 485ms
717:	learn: 193796.9503414	total: 1.23s	remaining: 483ms
718:	learn: 193788.2278053	total: 1.23s	remaining: 482ms
719:	learn: 193695.7178232	total: 1.23s	remaining: 480ms
720:	learn: 193630.4659429	total: 1.24s	remaining: 478ms
721:	learn: 193499.6481691	total: 1.24s	remaining: 477ms
722:	learn: 193390.7981077	total: 1.24s	remaining: 475ms
723:	learn: 193347.7594405	total: 1.24s	remaining: 474ms
724:	learn: 193235.9662208	total: 1.24s	remaining: 472ms
725:	learn: 193114.8702283	total: 1.25s	remaining: 470ms
726:	learn: 193077.3690432	total: 1.25s	remaining: 469ms
727:	learn: 192978.0197515	total: 1.25s	remaining: 467ms
728:	learn: 192831.4444333	tota

901:	learn: 179281.6042083	total: 1.55s	remaining: 169ms
902:	learn: 179256.1822810	total: 1.55s	remaining: 167ms
903:	learn: 179212.3167662	total: 1.56s	remaining: 165ms
904:	learn: 179174.2608201	total: 1.56s	remaining: 164ms
905:	learn: 179076.4139531	total: 1.56s	remaining: 162ms
906:	learn: 178968.9504433	total: 1.56s	remaining: 160ms
907:	learn: 178956.9132339	total: 1.56s	remaining: 158ms
908:	learn: 178883.5233976	total: 1.56s	remaining: 157ms
909:	learn: 178834.2471298	total: 1.57s	remaining: 155ms
910:	learn: 178673.4268936	total: 1.57s	remaining: 153ms
911:	learn: 178583.0733100	total: 1.57s	remaining: 152ms
912:	learn: 178543.4698947	total: 1.57s	remaining: 150ms
913:	learn: 178428.2907486	total: 1.57s	remaining: 148ms
914:	learn: 178404.3877697	total: 1.58s	remaining: 147ms
915:	learn: 178373.5969157	total: 1.58s	remaining: 145ms
916:	learn: 178326.7539951	total: 1.58s	remaining: 143ms
917:	learn: 178221.1916381	total: 1.58s	remaining: 141ms
918:	learn: 178175.9733501	tota

445848.88166745304

## Отбор признаков
Благодаря созданию новых признаков и их дальнейшей обработке в наборе уже несколько десятков признаков. Наверное уже имеет смысл попробовать отбросить те из них, что неполезны для модели - это вероятно улучшит качество и точно ускорит обучение.

Прибегнем к отбору при помощи Lasso . Обучим эту модель на обогащенном по признакам и оптимизированном по количеству наблюдений наборе данных.

In [31]:
'''
from sklearn.linear_model import Lasso

selector = Lasso(alpha=0.0001)
selector.fit(X_train,y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

Теперь получим список наименее полезных с точки зрения модели признков

sorted_by_Lasso = abs(pd.Series(selector.coef_,index = X_train.columns)).sort_values()

unuseful_features_LR = sorted_by_Lasso[sorted_by_Lasso<0.5].index

unuseful_features_LR

sorted_by_Lasso
''';

### PCA

In [32]:
scaler = StandardScaler()

In [33]:
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.fit_transform(X_test)


In [34]:
# pca = PCA(n_components = 1)
pca = PCA(0.95)
# XPCAreduced = pca.fit_transform(transpose(X))

In [35]:
%%time
X_train_pca = pca.fit_transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)


Wall time: 130 ms


In [36]:
pca.n_components_

172

In [37]:
validation(X_train_pca, y_train, RandomForestRegressor())



914990.1340453388

In [38]:
validation(X_train_pca, y_train, GradientBoostingRegressor())

856840.338493982

In [39]:
validation(X_train_pca, y_train, AdaBoostRegressor())

1413691.2935284644

## Подбор гиперпараметров

In [40]:
logspace(-6,-1,6)

array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01])

In [41]:
hyperparameters = {
#                    'loss' : ['ls', 'lad', 'huber', 'quantile'], 
                   'learning_rate ': [0.01,0.1]
#                    'n_estimators ': [100, 300, 500], 
#                    'max_depth':  [3,4,5],
#                    'min_samples_split ': [2,3,5],
#                    'min_samples_leaf  ': [1,2,3,5],
#                    'subsample' : [1],
#                    'max_features  ': ['auto', 'sqrt', 'log2']
}

In [42]:
hyperparameters

{'learning_rate ': [0.01, 0.1]}

In [43]:
%%time
n_iter = 2
search = pd.Series()

model = XGBRegressor()
clf = RandomizedSearchCV(model, hyperparameters, n_iter=n_iter, 
                         scoring='neg_mean_absolute_error', cv=3, random_state=RANDOM_SEED)

Wall time: 961 µs


In [44]:
search = clf.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [45]:
search.best_score_

-438209.2355100136

## Подсобное

## ПОДВАЛ

RandomForestRegressor
GradientBoostingRegressor
AdaBoostRegressor

In [12]:
pca

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [13]:
pca_2

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)