In [1]:
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool
from sklearn.calibration import calibration_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('./fake_job_postings.csv')

In [3]:
data

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [4]:
data.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

Обработаем колонку salary_range

In [5]:
data['salary_range'][~data['salary_range'].isna()]

6          20000-28000
10       100000-120000
15       120000-150000
23       100000-120000
31         50000-65000
             ...      
17844              0-0
17849     80000-100000
17865      18000-20000
17867      18000-19000
17874     80000-100000
Name: salary_range, Length: 2868, dtype: object

В данных бывают зарплаты 0-0, скорее всего это значит, что работа бесплатная.

In [6]:
def get_salary_bound_aggregator(aggregator, error_value=np.nan):
    def get_bounds_aggregated(salary_range):
        if pd.isnull(salary_range):
            return np.nan
        # check if salary_range has format 'low'-'max'
        if not '-' in salary_range:
            bounds = [salary_range, salary_range]
        else:
            bounds = salary_range.split('-')
            assert len(bounds) == 2, f"salary_range={salary_range} is not formatted"
        
        # Try converting bounds to ints
        try:
            bounds = list(map(int, bounds))
        except ValueError:
            print(f'{salary_range} does not contain integers. Substituted both bounds with NaN')
            bounds = [np.nan, np.nan]
        return aggregator(bounds)
        
    return get_bounds_aggregated

In [7]:
data['min_salary'] = data['salary_range'].apply(get_salary_bound_aggregator(min))
data['max_salary'] = data['salary_range'].apply(get_salary_bound_aggregator(max))
data.drop(columns='salary_range', inplace=True)

9-Dec does not contain integers. Substituted both bounds with NaN
3-Apr does not contain integers. Substituted both bounds with NaN
4-Apr does not contain integers. Substituted both bounds with NaN
Oct-15 does not contain integers. Substituted both bounds with NaN
8-Sep does not contain integers. Substituted both bounds with NaN
4-Jun does not contain integers. Substituted both bounds with NaN
10-Oct does not contain integers. Substituted both bounds with NaN
Oct-20 does not contain integers. Substituted both bounds with NaN
Jun-18 does not contain integers. Substituted both bounds with NaN
10-Oct does not contain integers. Substituted both bounds with NaN
11-Nov does not contain integers. Substituted both bounds with NaN
10-Nov does not contain integers. Substituted both bounds with NaN
10-Oct does not contain integers. Substituted both bounds with NaN
10-Nov does not contain integers. Substituted both bounds with NaN
10-Oct does not contain integers. Substituted both bounds with NaN


Теперь разобьем все фичи на типы. Их нужно будет обрабатывать по-разному.

In [8]:
num_features = [
    'min_salary',
    'max_salary'
]

text_features = [
    'title',
    'company_profile',
    'description',
    'requirements',
    'benefits'
]

cat_features = [
    'function',
    'industry',
    'required_education',
    'required_experience',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'employment_type',
    'location',
    'department'
]

target = 'fraudulent'

# NaN'ы

Для категориальных фичей наны надо заменить на отдельные значения 'NaN', для численных делать этого не нужно, т.к. catboost умеет их обрабатывать

In [9]:
data[cat_features] = data[cat_features].fillna('NaN')

In [10]:
data[cat_features].isna().sum()

function               0
industry               0
required_education     0
required_experience    0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
location               0
department             0
dtype: int64

# Создание фичей

Численных фичей немного и обрабатывать их не нжуно, потому что мы будем использовать catboost. Для решающих деревьев нормировать признаки не нужно.

Категориальные фичи лучше всего кодировать с помощью target_encoding, который уже реализован в catboost'е, так что их тоже обрабатывать дополнительно не нужно. Можно было бы исправить опечатки, если они есть, но это сложно и скорее всего опечатки тоже несут свою информацию, так что лучше их оставить.

Текстовые фичи можно обработать по-разному, я думаю, что лучше всего использовать Tf-Idf, потому что сделать что-то более сложное не получится из-за малого количества данных. Другой вариант - использовать Bag of embeddings, но это чуть сложнее и стоит сначала посмотреть на качество, которое дает более простой вариант. Текст лучше всего объединить в одно поле.

In [11]:
# Чтобы слова не склеивались при взятии суммы строк
data[text_features] += ' '
data['text_merged'] = data[text_features].fillna('NaN').sum(axis=1)

Так как сейчас мы начнем применять трансформации, которые задействуют весь датасет, а не отдельные строчки, то нужно сделать деление на train/test, чтобы не было data leak.

In [12]:
data_train, data_test = train_test_split(data, test_size=0.3)

In [13]:
data_train.reset_index(inplace=True)
data_test.reset_index(inplace=True)

Теперь продолжим обрабатывать датасет

In [14]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=100)
tfidf.fit(data_train['text_merged'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=100,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [15]:
tfidf_column_names = sorted(tfidf.vocabulary_.keys(), key=lambda word: tfidf.vocabulary_[word])

In [16]:
data_train_text = pd.DataFrame(data=tfidf.transform(data_train['text_merged']).todense(), 
                               columns=tfidf_column_names)
data_test_text = pd.DataFrame(data=tfidf.transform(data_test['text_merged']).todense(), 
                              columns=tfidf_column_names)

In [17]:
data_train_merged = pd.concat([data_train ,data_train_text], axis=1)
data_test_merged = pd.concat([data_test ,data_test_text], axis=1)

In [18]:
data_train_merged

Unnamed: 0,index,job_id,title,location,department,company_profile,description,requirements,benefits,telecommuting,...,will be,with,with the,work,working,world,years,you,you will,your
0,6327,6328,Graduates: English Teacher Abroad (Conversatio...,"US, KY, Highland Heights",,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it :-)Love travel...",University degree required. TEFL / TESOL / CEL...,See job description,0,...,0.000000,0.144153,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,10522,10523,Account Executive - Inside Sales,"US, PA, Philadelphia",,"Founded in 2009 by early LinkedIn employees, P...",PeopleLinx is recruiting someone who is enthus...,Bachelor’s degree and 2+ years of SaaS sales a...,Competitive compensation with uncapped commiss...,0,...,0.039246,0.199013,0.040363,0.054611,0.036235,0.046151,0.069673,0.087978,0.044207,0.103906
2,10159,10160,Customer Service Representative,"CA, ON, London",Schools/Charters,Voyageur is one of Ontario's leading transport...,Together we’re going places! Voyageur Transpo...,Candidates should ideally demonstrate the foll...,Voyageur prides itself on its family-run tradi...,0,...,0.000000,0.101869,0.000000,0.041930,0.000000,0.000000,0.053495,0.000000,0.000000,0.000000
3,13967,13968,Senior JavaScript and C# Developer,"NZ, N, Auckland",Development,HSAGlobal specialises in solutions enabling co...,HSAGlobal are a New Zealand based business spe...,,,0,...,0.082978,0.093507,0.042670,0.028866,0.114920,0.048789,0.036828,0.248020,0.140204,0.000000
4,6512,6513,Project Manager,"US, TX, Dallas",,,"Yes, this is a commission job in which you hav...",,,0,...,0.000000,0.035250,0.000000,0.087057,0.000000,0.000000,0.000000,0.233747,0.070472,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12511,14088,14089,Factory Integration Software Team Leader,"NL, ,",,ClarusApex is an international recruiting comp...,Is software development your passion and would...,"Requirements:MSc, PDEng or PhD in computer sci...",,0,...,0.000000,0.140817,0.028560,0.057962,0.025639,0.032655,0.049299,0.145253,0.031280,0.049014
12512,2081,2082,Sales Team Leader,"PH, 40, Cavite or Alabang area",Sales,If working in a cubical seems like your idea o...,If working in a cubical seems like your idea o...,,What's in it for you?We are not a call center....,0,...,0.086299,0.064833,0.000000,0.020014,0.053120,0.101484,0.000000,0.386920,0.097210,0.152323
12513,8226,8227,Automotive Controls Engineer,"US, MN, Minneapolis",,We Provide Full Time Permanent Positions for m...,(We have more than 1500+ Job openings in our w...,,,0,...,0.000000,0.059126,0.000000,0.073011,0.000000,0.000000,0.093148,0.156828,0.118205,0.000000
12514,7512,7513,English Teacher Abroad (Conversational),"US, CA, Chico",,We help teachers get safe &amp; secure jobs ab...,"Play with kids, get paid for it.Vacancies in A...",University degree required. TEFL / TESOL / CEL...,See job description,0,...,0.000000,0.162647,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Обучение

Теперь все готово и мы можем обучить catboost

Так как датасет несбалансированный, то в качестве метрики для моделей лучше всего использовать ROC-AUC. Порог, по которому мы сможем определять, плохая ли вакансия, можно будет установить позже, выбрав precision/recall tradeoff, который нас устроит.

In [28]:
cat = CatBoostClassifier(2000, cat_features=cat_features, eval_metric='AUC')
val_ds = Pool(data_test_merged[tfidf_column_names + cat_features + num_features], 
              data_test_merged[target], 
              cat_features=cat_features)
train_ds = Pool(data_train_merged[tfidf_column_names + cat_features + num_features], 
                data_train_merged[target],
                cat_features=cat_features)

In [29]:
cat.fit(train_ds, eval_set=val_ds)

Learning rate set to 0.043756
0:	test: 0.8126803	best: 0.8126803 (0)	total: 86.3ms	remaining: 2m 52s
1:	test: 0.8470429	best: 0.8470429 (1)	total: 123ms	remaining: 2m 2s
2:	test: 0.8448674	best: 0.8470429 (1)	total: 150ms	remaining: 1m 39s
3:	test: 0.8617430	best: 0.8617430 (3)	total: 175ms	remaining: 1m 27s
4:	test: 0.8641925	best: 0.8641925 (4)	total: 207ms	remaining: 1m 22s
5:	test: 0.8625717	best: 0.8641925 (4)	total: 237ms	remaining: 1m 18s
6:	test: 0.8619318	best: 0.8641925 (4)	total: 270ms	remaining: 1m 16s
7:	test: 0.8713644	best: 0.8713644 (7)	total: 302ms	remaining: 1m 15s
8:	test: 0.8730449	best: 0.8730449 (8)	total: 322ms	remaining: 1m 11s
9:	test: 0.8746719	best: 0.8746719 (9)	total: 347ms	remaining: 1m 9s
10:	test: 0.8765557	best: 0.8765557 (10)	total: 372ms	remaining: 1m 7s
11:	test: 0.8844267	best: 0.8844267 (11)	total: 397ms	remaining: 1m 5s
12:	test: 0.8897695	best: 0.8897695 (12)	total: 430ms	remaining: 1m 5s
13:	test: 0.8961517	best: 0.8961517 (13)	total: 458ms	rema

122:	test: 0.9723200	best: 0.9723496 (120)	total: 3.53s	remaining: 53.9s
123:	test: 0.9724590	best: 0.9724590 (123)	total: 3.56s	remaining: 53.8s
124:	test: 0.9725948	best: 0.9725948 (124)	total: 3.58s	remaining: 53.7s
125:	test: 0.9726170	best: 0.9726170 (125)	total: 3.62s	remaining: 53.9s
126:	test: 0.9726088	best: 0.9726170 (125)	total: 3.65s	remaining: 53.8s
127:	test: 0.9726014	best: 0.9726170 (125)	total: 3.67s	remaining: 53.7s
128:	test: 0.9726187	best: 0.9726187 (128)	total: 3.7s	remaining: 53.6s
129:	test: 0.9726788	best: 0.9726788 (129)	total: 3.72s	remaining: 53.6s
130:	test: 0.9727191	best: 0.9727191 (130)	total: 3.76s	remaining: 53.6s
131:	test: 0.9728697	best: 0.9728697 (131)	total: 3.78s	remaining: 53.5s
132:	test: 0.9728393	best: 0.9728697 (131)	total: 3.81s	remaining: 53.4s
133:	test: 0.9730425	best: 0.9730425 (133)	total: 3.83s	remaining: 53.4s
134:	test: 0.9730516	best: 0.9730516 (134)	total: 3.86s	remaining: 53.3s
135:	test: 0.9730771	best: 0.9730771 (135)	total: 3.

236:	test: 0.9773418	best: 0.9773418 (236)	total: 6.85s	remaining: 51s
237:	test: 0.9773451	best: 0.9773451 (237)	total: 6.88s	remaining: 50.9s
238:	test: 0.9773368	best: 0.9773451 (237)	total: 6.9s	remaining: 50.9s
239:	test: 0.9773418	best: 0.9773451 (237)	total: 6.93s	remaining: 50.8s
240:	test: 0.9773804	best: 0.9773804 (240)	total: 6.96s	remaining: 50.8s
241:	test: 0.9773887	best: 0.9773887 (241)	total: 6.98s	remaining: 50.7s
242:	test: 0.9773739	best: 0.9773887 (241)	total: 7.01s	remaining: 50.7s
243:	test: 0.9772685	best: 0.9773887 (241)	total: 7.03s	remaining: 50.6s
244:	test: 0.9773195	best: 0.9773887 (241)	total: 7.07s	remaining: 50.6s
245:	test: 0.9774405	best: 0.9774405 (245)	total: 7.1s	remaining: 50.6s
246:	test: 0.9774669	best: 0.9774669 (246)	total: 7.12s	remaining: 50.5s
247:	test: 0.9775417	best: 0.9775417 (247)	total: 7.15s	remaining: 50.5s
248:	test: 0.9775261	best: 0.9775417 (247)	total: 7.17s	remaining: 50.4s
249:	test: 0.9775146	best: 0.9775417 (247)	total: 7.2s	

349:	test: 0.9794247	best: 0.9794725 (347)	total: 10.2s	remaining: 47.9s
350:	test: 0.9794239	best: 0.9794725 (347)	total: 10.2s	remaining: 47.9s
351:	test: 0.9794700	best: 0.9794725 (347)	total: 10.2s	remaining: 47.8s
352:	test: 0.9795054	best: 0.9795054 (352)	total: 10.2s	remaining: 47.8s
353:	test: 0.9796041	best: 0.9796041 (353)	total: 10.3s	remaining: 47.8s
354:	test: 0.9796873	best: 0.9796873 (354)	total: 10.3s	remaining: 47.7s
355:	test: 0.9796980	best: 0.9796980 (355)	total: 10.3s	remaining: 47.7s
356:	test: 0.9796897	best: 0.9796980 (355)	total: 10.4s	remaining: 47.6s
357:	test: 0.9797029	best: 0.9797029 (357)	total: 10.4s	remaining: 47.6s
358:	test: 0.9797465	best: 0.9797465 (358)	total: 10.4s	remaining: 47.6s
359:	test: 0.9797539	best: 0.9797539 (359)	total: 10.4s	remaining: 47.6s
360:	test: 0.9797794	best: 0.9797794 (360)	total: 10.5s	remaining: 47.5s
361:	test: 0.9797358	best: 0.9797794 (360)	total: 10.5s	remaining: 47.5s
362:	test: 0.9797383	best: 0.9797794 (360)	total: 1

469:	test: 0.9806641	best: 0.9806641 (469)	total: 13.8s	remaining: 44.9s
470:	test: 0.9806674	best: 0.9806674 (470)	total: 13.8s	remaining: 44.9s
471:	test: 0.9806649	best: 0.9806674 (470)	total: 13.8s	remaining: 44.8s
472:	test: 0.9806781	best: 0.9806781 (472)	total: 13.9s	remaining: 44.8s
473:	test: 0.9807127	best: 0.9807127 (473)	total: 13.9s	remaining: 44.7s
474:	test: 0.9806962	best: 0.9807127 (473)	total: 13.9s	remaining: 44.7s
475:	test: 0.9807094	best: 0.9807127 (473)	total: 13.9s	remaining: 44.6s
476:	test: 0.9807481	best: 0.9807481 (476)	total: 14s	remaining: 44.6s
477:	test: 0.9807678	best: 0.9807678 (477)	total: 14s	remaining: 44.6s
478:	test: 0.9808065	best: 0.9808065 (478)	total: 14s	remaining: 44.6s
479:	test: 0.9807826	best: 0.9808065 (478)	total: 14.1s	remaining: 44.5s
480:	test: 0.9807917	best: 0.9808065 (478)	total: 14.1s	remaining: 44.5s
481:	test: 0.9807818	best: 0.9808065 (478)	total: 14.1s	remaining: 44.5s
482:	test: 0.9807925	best: 0.9808065 (478)	total: 14.2s	r

589:	test: 0.9822418	best: 0.9822418 (589)	total: 17.3s	remaining: 41.4s
590:	test: 0.9822607	best: 0.9822607 (590)	total: 17.3s	remaining: 41.3s
591:	test: 0.9822566	best: 0.9822607 (590)	total: 17.4s	remaining: 41.3s
592:	test: 0.9822698	best: 0.9822698 (592)	total: 17.4s	remaining: 41.3s
593:	test: 0.9822862	best: 0.9822862 (593)	total: 17.4s	remaining: 41.2s
594:	test: 0.9822986	best: 0.9822986 (594)	total: 17.4s	remaining: 41.2s
595:	test: 0.9822492	best: 0.9822986 (594)	total: 17.5s	remaining: 41.1s
596:	test: 0.9822525	best: 0.9822986 (594)	total: 17.5s	remaining: 41.1s
597:	test: 0.9822434	best: 0.9822986 (594)	total: 17.5s	remaining: 41.1s
598:	test: 0.9822467	best: 0.9822986 (594)	total: 17.6s	remaining: 41.1s
599:	test: 0.9821949	best: 0.9822986 (594)	total: 17.6s	remaining: 41s
600:	test: 0.9821833	best: 0.9822986 (594)	total: 17.6s	remaining: 41s
601:	test: 0.9821809	best: 0.9822986 (594)	total: 17.6s	remaining: 41s
602:	test: 0.9821817	best: 0.9822986 (594)	total: 17.7s	r

708:	test: 0.9825792	best: 0.9826319 (693)	total: 20.6s	remaining: 37.5s
709:	test: 0.9825800	best: 0.9826319 (693)	total: 20.6s	remaining: 37.4s
710:	test: 0.9825932	best: 0.9826319 (693)	total: 20.6s	remaining: 37.4s
711:	test: 0.9826261	best: 0.9826319 (693)	total: 20.7s	remaining: 37.4s
712:	test: 0.9826648	best: 0.9826648 (712)	total: 20.7s	remaining: 37.3s
713:	test: 0.9826829	best: 0.9826829 (713)	total: 20.7s	remaining: 37.3s
714:	test: 0.9826771	best: 0.9826829 (713)	total: 20.7s	remaining: 37.3s
715:	test: 0.9826524	best: 0.9826829 (713)	total: 20.8s	remaining: 37.2s
716:	test: 0.9826582	best: 0.9826829 (713)	total: 20.8s	remaining: 37.2s
717:	test: 0.9826829	best: 0.9826829 (713)	total: 20.8s	remaining: 37.2s
718:	test: 0.9827010	best: 0.9827010 (718)	total: 20.9s	remaining: 37.1s
719:	test: 0.9826969	best: 0.9827010 (718)	total: 20.9s	remaining: 37.1s
720:	test: 0.9827306	best: 0.9827306 (720)	total: 20.9s	remaining: 37.1s
721:	test: 0.9827537	best: 0.9827537 (721)	total: 2

821:	test: 0.9827389	best: 0.9828319 (736)	total: 23.9s	remaining: 34.2s
822:	test: 0.9827413	best: 0.9828319 (736)	total: 23.9s	remaining: 34.2s
823:	test: 0.9827603	best: 0.9828319 (736)	total: 23.9s	remaining: 34.2s
824:	test: 0.9827965	best: 0.9828319 (736)	total: 24s	remaining: 34.1s
825:	test: 0.9827849	best: 0.9828319 (736)	total: 24s	remaining: 34.1s
826:	test: 0.9827956	best: 0.9828319 (736)	total: 24s	remaining: 34s
827:	test: 0.9827504	best: 0.9828319 (736)	total: 24s	remaining: 34s
828:	test: 0.9827520	best: 0.9828319 (736)	total: 24.1s	remaining: 34s
829:	test: 0.9827356	best: 0.9828319 (736)	total: 24.1s	remaining: 34s
830:	test: 0.9827421	best: 0.9828319 (736)	total: 24.1s	remaining: 33.9s
831:	test: 0.9827101	best: 0.9828319 (736)	total: 24.2s	remaining: 33.9s
832:	test: 0.9827166	best: 0.9828319 (736)	total: 24.2s	remaining: 33.9s
833:	test: 0.9827084	best: 0.9828319 (736)	total: 24.2s	remaining: 33.8s
834:	test: 0.9827125	best: 0.9828319 (736)	total: 24.2s	remaining: 

941:	test: 0.9829273	best: 0.9829750 (919)	total: 27.2s	remaining: 30.5s
942:	test: 0.9829109	best: 0.9829750 (919)	total: 27.2s	remaining: 30.5s
943:	test: 0.9829232	best: 0.9829750 (919)	total: 27.3s	remaining: 30.5s
944:	test: 0.9828829	best: 0.9829750 (919)	total: 27.3s	remaining: 30.5s
945:	test: 0.9828911	best: 0.9829750 (919)	total: 27.3s	remaining: 30.4s
946:	test: 0.9829002	best: 0.9829750 (919)	total: 27.3s	remaining: 30.4s
947:	test: 0.9829372	best: 0.9829750 (919)	total: 27.4s	remaining: 30.4s
948:	test: 0.9829882	best: 0.9829882 (948)	total: 27.4s	remaining: 30.3s
949:	test: 0.9829923	best: 0.9829923 (949)	total: 27.4s	remaining: 30.3s
950:	test: 0.9829783	best: 0.9829923 (949)	total: 27.4s	remaining: 30.3s
951:	test: 0.9829627	best: 0.9829923 (949)	total: 27.5s	remaining: 30.2s
952:	test: 0.9829833	best: 0.9829923 (949)	total: 27.5s	remaining: 30.2s
953:	test: 0.9829537	best: 0.9829923 (949)	total: 27.5s	remaining: 30.2s
954:	test: 0.9829849	best: 0.9829923 (949)	total: 2

1055:	test: 0.9826640	best: 0.9830458 (1018)	total: 30.6s	remaining: 27.3s
1056:	test: 0.9826590	best: 0.9830458 (1018)	total: 30.6s	remaining: 27.3s
1057:	test: 0.9826574	best: 0.9830458 (1018)	total: 30.6s	remaining: 27.3s
1058:	test: 0.9826590	best: 0.9830458 (1018)	total: 30.6s	remaining: 27.2s
1059:	test: 0.9826343	best: 0.9830458 (1018)	total: 30.7s	remaining: 27.2s
1060:	test: 0.9826047	best: 0.9830458 (1018)	total: 30.7s	remaining: 27.2s
1061:	test: 0.9826064	best: 0.9830458 (1018)	total: 30.7s	remaining: 27.1s
1062:	test: 0.9826911	best: 0.9830458 (1018)	total: 30.7s	remaining: 27.1s
1063:	test: 0.9826903	best: 0.9830458 (1018)	total: 30.8s	remaining: 27.1s
1064:	test: 0.9826903	best: 0.9830458 (1018)	total: 30.8s	remaining: 27.1s
1065:	test: 0.9826952	best: 0.9830458 (1018)	total: 30.8s	remaining: 27s
1066:	test: 0.9827076	best: 0.9830458 (1018)	total: 30.9s	remaining: 27s
1067:	test: 0.9826862	best: 0.9830458 (1018)	total: 30.9s	remaining: 27s
1068:	test: 0.9826747	best: 0.9

1169:	test: 0.9827471	best: 0.9830458 (1018)	total: 33.7s	remaining: 23.9s
1170:	test: 0.9827528	best: 0.9830458 (1018)	total: 33.7s	remaining: 23.9s
1171:	test: 0.9827314	best: 0.9830458 (1018)	total: 33.7s	remaining: 23.8s
1172:	test: 0.9827347	best: 0.9830458 (1018)	total: 33.8s	remaining: 23.8s
1173:	test: 0.9826631	best: 0.9830458 (1018)	total: 33.8s	remaining: 23.8s
1174:	test: 0.9826673	best: 0.9830458 (1018)	total: 33.8s	remaining: 23.7s
1175:	test: 0.9826845	best: 0.9830458 (1018)	total: 33.8s	remaining: 23.7s
1176:	test: 0.9826623	best: 0.9830458 (1018)	total: 33.9s	remaining: 23.7s
1177:	test: 0.9826212	best: 0.9830458 (1018)	total: 33.9s	remaining: 23.7s
1178:	test: 0.9825866	best: 0.9830458 (1018)	total: 33.9s	remaining: 23.6s
1179:	test: 0.9825899	best: 0.9830458 (1018)	total: 34s	remaining: 23.6s
1180:	test: 0.9825924	best: 0.9830458 (1018)	total: 34s	remaining: 23.6s
1181:	test: 0.9825932	best: 0.9830458 (1018)	total: 34s	remaining: 23.5s
1182:	test: 0.9825907	best: 0.9

1283:	test: 0.9825290	best: 0.9830458 (1018)	total: 38.6s	remaining: 21.5s
1284:	test: 0.9825438	best: 0.9830458 (1018)	total: 38.6s	remaining: 21.5s
1285:	test: 0.9825586	best: 0.9830458 (1018)	total: 38.6s	remaining: 21.4s
1286:	test: 0.9826006	best: 0.9830458 (1018)	total: 38.7s	remaining: 21.4s
1287:	test: 0.9825858	best: 0.9830458 (1018)	total: 38.7s	remaining: 21.4s
1288:	test: 0.9825874	best: 0.9830458 (1018)	total: 38.7s	remaining: 21.4s
1289:	test: 0.9826014	best: 0.9830458 (1018)	total: 38.7s	remaining: 21.3s
1290:	test: 0.9825874	best: 0.9830458 (1018)	total: 38.8s	remaining: 21.3s
1291:	test: 0.9826327	best: 0.9830458 (1018)	total: 38.9s	remaining: 21.3s
1292:	test: 0.9826409	best: 0.9830458 (1018)	total: 38.9s	remaining: 21.3s
1293:	test: 0.9826516	best: 0.9830458 (1018)	total: 39s	remaining: 21.3s
1294:	test: 0.9826426	best: 0.9830458 (1018)	total: 39s	remaining: 21.3s
1295:	test: 0.9826212	best: 0.9830458 (1018)	total: 39.1s	remaining: 21.2s
1296:	test: 0.9826319	best: 0

1393:	test: 0.9825282	best: 0.9830458 (1018)	total: 43s	remaining: 18.7s
1394:	test: 0.9825265	best: 0.9830458 (1018)	total: 43.1s	remaining: 18.7s
1395:	test: 0.9825191	best: 0.9830458 (1018)	total: 43.1s	remaining: 18.6s
1396:	test: 0.9825018	best: 0.9830458 (1018)	total: 43.1s	remaining: 18.6s
1397:	test: 0.9824985	best: 0.9830458 (1018)	total: 43.2s	remaining: 18.6s
1398:	test: 0.9824911	best: 0.9830458 (1018)	total: 43.2s	remaining: 18.6s
1399:	test: 0.9824977	best: 0.9830458 (1018)	total: 43.3s	remaining: 18.5s
1400:	test: 0.9824994	best: 0.9830458 (1018)	total: 43.3s	remaining: 18.5s
1401:	test: 0.9824903	best: 0.9830458 (1018)	total: 43.3s	remaining: 18.5s
1402:	test: 0.9824837	best: 0.9830458 (1018)	total: 43.3s	remaining: 18.4s
1403:	test: 0.9824739	best: 0.9830458 (1018)	total: 43.4s	remaining: 18.4s
1404:	test: 0.9824961	best: 0.9830458 (1018)	total: 43.4s	remaining: 18.4s
1405:	test: 0.9824977	best: 0.9830458 (1018)	total: 43.4s	remaining: 18.4s
1406:	test: 0.9825134	best:

1506:	test: 0.9822344	best: 0.9830458 (1018)	total: 48.1s	remaining: 15.7s
1507:	test: 0.9822204	best: 0.9830458 (1018)	total: 48.1s	remaining: 15.7s
1508:	test: 0.9822015	best: 0.9830458 (1018)	total: 48.2s	remaining: 15.7s
1509:	test: 0.9822352	best: 0.9830458 (1018)	total: 48.3s	remaining: 15.7s
1510:	test: 0.9822072	best: 0.9830458 (1018)	total: 48.3s	remaining: 15.6s
1511:	test: 0.9822303	best: 0.9830458 (1018)	total: 48.4s	remaining: 15.6s
1512:	test: 0.9822237	best: 0.9830458 (1018)	total: 48.4s	remaining: 15.6s
1513:	test: 0.9822278	best: 0.9830458 (1018)	total: 48.5s	remaining: 15.6s
1514:	test: 0.9822196	best: 0.9830458 (1018)	total: 48.5s	remaining: 15.5s
1515:	test: 0.9822525	best: 0.9830458 (1018)	total: 48.6s	remaining: 15.5s
1516:	test: 0.9822121	best: 0.9830458 (1018)	total: 48.6s	remaining: 15.5s
1517:	test: 0.9822031	best: 0.9830458 (1018)	total: 48.7s	remaining: 15.5s
1518:	test: 0.9822261	best: 0.9830458 (1018)	total: 48.8s	remaining: 15.5s
1519:	test: 0.9822525	bes

1617:	test: 0.9820163	best: 0.9830458 (1018)	total: 52.8s	remaining: 12.5s
1618:	test: 0.9820212	best: 0.9830458 (1018)	total: 52.8s	remaining: 12.4s
1619:	test: 0.9820187	best: 0.9830458 (1018)	total: 52.8s	remaining: 12.4s
1620:	test: 0.9820155	best: 0.9830458 (1018)	total: 52.9s	remaining: 12.4s
1621:	test: 0.9820031	best: 0.9830458 (1018)	total: 52.9s	remaining: 12.3s
1622:	test: 0.9819957	best: 0.9830458 (1018)	total: 53s	remaining: 12.3s
1623:	test: 0.9819965	best: 0.9830458 (1018)	total: 53s	remaining: 12.3s
1624:	test: 0.9819718	best: 0.9830458 (1018)	total: 53s	remaining: 12.2s
1625:	test: 0.9819628	best: 0.9830458 (1018)	total: 53.1s	remaining: 12.2s
1626:	test: 0.9819562	best: 0.9830458 (1018)	total: 53.1s	remaining: 12.2s
1627:	test: 0.9819488	best: 0.9830458 (1018)	total: 53.1s	remaining: 12.1s
1628:	test: 0.9819677	best: 0.9830458 (1018)	total: 53.2s	remaining: 12.1s
1629:	test: 0.9819710	best: 0.9830458 (1018)	total: 53.2s	remaining: 12.1s
1630:	test: 0.9819653	best: 0.9

1727:	test: 0.9819101	best: 0.9830458 (1018)	total: 56.7s	remaining: 8.92s
1728:	test: 0.9819027	best: 0.9830458 (1018)	total: 56.7s	remaining: 8.89s
1729:	test: 0.9818690	best: 0.9830458 (1018)	total: 56.8s	remaining: 8.86s
1730:	test: 0.9818484	best: 0.9830458 (1018)	total: 56.8s	remaining: 8.82s
1731:	test: 0.9818756	best: 0.9830458 (1018)	total: 56.8s	remaining: 8.79s
1732:	test: 0.9818525	best: 0.9830458 (1018)	total: 56.9s	remaining: 8.76s
1733:	test: 0.9818739	best: 0.9830458 (1018)	total: 56.9s	remaining: 8.73s
1734:	test: 0.9819027	best: 0.9830458 (1018)	total: 56.9s	remaining: 8.7s
1735:	test: 0.9818912	best: 0.9830458 (1018)	total: 57s	remaining: 8.67s
1736:	test: 0.9818904	best: 0.9830458 (1018)	total: 57s	remaining: 8.63s
1737:	test: 0.9819076	best: 0.9830458 (1018)	total: 57.1s	remaining: 8.6s
1738:	test: 0.9818723	best: 0.9830458 (1018)	total: 57.1s	remaining: 8.57s
1739:	test: 0.9819109	best: 0.9830458 (1018)	total: 57.1s	remaining: 8.54s
1740:	test: 0.9819183	best: 0.9

1839:	test: 0.9819299	best: 0.9830458 (1018)	total: 1m	remaining: 5.23s
1840:	test: 0.9819126	best: 0.9830458 (1018)	total: 1m	remaining: 5.2s
1841:	test: 0.9819554	best: 0.9830458 (1018)	total: 1m	remaining: 5.17s
1842:	test: 0.9819529	best: 0.9830458 (1018)	total: 1m	remaining: 5.13s
1843:	test: 0.9819529	best: 0.9830458 (1018)	total: 1m	remaining: 5.1s
1844:	test: 0.9819488	best: 0.9830458 (1018)	total: 1m	remaining: 5.07s
1845:	test: 0.9819562	best: 0.9830458 (1018)	total: 1m	remaining: 5.04s
1846:	test: 0.9819677	best: 0.9830458 (1018)	total: 1m	remaining: 5.01s
1847:	test: 0.9819727	best: 0.9830458 (1018)	total: 1m	remaining: 4.97s
1848:	test: 0.9819685	best: 0.9830458 (1018)	total: 1m	remaining: 4.94s
1849:	test: 0.9819603	best: 0.9830458 (1018)	total: 1m	remaining: 4.91s
1850:	test: 0.9819595	best: 0.9830458 (1018)	total: 1m	remaining: 4.88s
1851:	test: 0.9819718	best: 0.9830458 (1018)	total: 1m	remaining: 4.85s
1852:	test: 0.9819751	best: 0.9830458 (1018)	total: 1m	remaining: 

1957:	test: 0.9817562	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.36s
1958:	test: 0.9816887	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.33s
1959:	test: 0.9816896	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.3s
1960:	test: 0.9816945	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.27s
1961:	test: 0.9817101	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.24s
1962:	test: 0.9817019	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.2s
1963:	test: 0.9816896	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.17s
1964:	test: 0.9816846	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.14s
1965:	test: 0.9816945	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.1s
1966:	test: 0.9816920	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.07s
1967:	test: 0.9816854	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.04s
1968:	test: 0.9816698	best: 0.9830458 (1018)	total: 1m 3s	remaining: 1.01s
1969:	test: 0.9816665	best: 0.9830458 (1018)	total: 1m 3s	remaining: 974ms
1970:	test: 0.9816665	best: 

<catboost.core.CatBoostClassifier at 0x7f6465a0df10>