# [Boosters] Raiffeisen Data Cup. Baseline
Общий подход:
- Добавляем к каждой транзакции столбец: is_work (если транзакция находится в пределах 0.02 от дома клиента)
- Добавляем к каждой транзакции столбец: is_home (если транзакция находится в пределах 0.02 от работы клиента)
- Обучаем классификатор предсказывающий вероятность (is_home == 1) для транзакции
- Обучаем классификатор предсказывающий вероятность (is_work == 1) для транзакции

Точность определения местоположения:
- для классификатора is_home: ~3x%
- для классификатора is_work: ~2x%
- общая оценка на Public Leaderboard: ???

Примечание
* Требуется Python версии 3.5
* Требуется библиотека xgboost (для обучения использовалась xgboost версии 0.7.post3)
* Требуются файлы: test_set.csv, train_set.csv в одном каталоге с данным скриптом
* Требования к памяти: должно работать с 2Гб свободного RAM
* Время работы: ~3 минуты (тестировалось на процессоре Intel Core i7-4770)

In [29]:
import pandas as pd
import numpy as np
import datetime
import pickle

from catboost import CatBoostClassifier
import sklearn

from sklearn.model_selection import train_test_split

In [88]:
# Определим типы колонок для экономии памяти
dtypes = {
    'transaction_date': str,
    'atm_address': str,
    'country': str,
    'city': str,
    'amount': np.float32,
    'currency': np.float32,
    'mcc': str,
    'customer_id': str,
    'pos_address': str,
    'atm_address': str,
    'pos_adress_lat': np.float64,
    'pos_adress_lon': np.float64,
    'pos_address_lat': np.float64,
    'pos_address_lon': np.float64,
    'atm_address_lat': np.float64,
    'atm_address_lon': np.float64,
    'home_add_lat': np.float32,
    'home_add_lon': np.float32,
    'work_add_lat': np.float32,
    'work_add_lon': np.float32,
}

# для экономии памяти будем загружать только часть атрибутов транзакций
usecols_train = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_adress_lat', 'pos_adress_lon', 'atm_address_lat', 'atm_address_lon','home_add_lat','home_add_lon','work_add_lat','work_add_lon']
usecols_test = ['customer_id','transaction_date','amount','country', 'city', 'currency', 'mcc', 'pos_address_lat', 'pos_address_lon', 'atm_address_lat', 'atm_address_lon']

## Читаем train_set, test_set, соединяем в один датасет

In [185]:
train = pd.read_csv('train_set.csv', dtype = dtypes, usecols = usecols_train)
train.rename(columns = {'pos_adress_lat': 'pos_address_lat', 'pos_adress_lon': 'pos_address_lon'}, inplace = True)

test = pd.read_csv('test_set.csv', dtype = dtypes, usecols = usecols_test)
submission = pd.DataFrame(test['customer_id'].unique(), columns = ['_ID_'])


# соединяем test/train в одном DataFrame
train['is_train'] = np.int32(1)
test['is_train'] = np.int32(0)
dt = pd.concat([train, test])

del train, test

In [4]:
#train = date_upgrade(train)
#train[train['transaction_date'].apply(lambda x: True if x.year==2016 else False)]

### Обрабатываем дату транзакции и категориальные признаки

In [5]:
def date_upgrade(dt):    
    dt['currency'] = dt['currency'].fillna(-1).astype(np.int32)
    dt['mcc'] = dt['mcc'].apply(lambda x: int(x.replace(',', ''))).astype(np.int32)
    dt['city'] = dt['city'].factorize()[0].astype(np.int32)
    dt['country'] = dt['country'].factorize()[0].astype(np.int32)

    # удаляем транзакции без даты
    dt.drop(dt[dt['transaction_date'].isnull()].index, axis = 0, inplace = True)
    dt['transaction_date'] = dt['transaction_date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
    
    return dt

### Бизнесс календарь для 2017 года

In [6]:
bdays = pd.read_csv('bdaycaltruetrue.csv', header = None)
bdays = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in bdays[0].tolist()]

### Фичи для даты

In [7]:
def data_features(dt):
    dt['weekday'] = dt['transaction_date'].dt.weekday.astype(np.int32)
    dt['busnessday'] = dt['transaction_date'].apply(lambda x: 0 if x in bdays else 1).factorize()[0].astype(np.int32)
    
    return dt

### Наличие машины

In [8]:
car = [5541,5511,5531,5533,5532]

def car_feature(dt):
    dt['car'] = dt['mcc'].apply(lambda x: 1 if x in car else 0).factorize()[0].astype(np.int32)
    #dt['car'] = dt.apply['car'](lambda x: 1 if x in dt[dt['car']==1]['customer_id'].tolist() else 0).astype(np.int32)
    
    return dt

### Приводим адрес транзакции для pos и atm-транзакций к единообразному виду

In [9]:
def pos_atm(dt):
    dt['is_atm'] = (~dt['atm_address_lat'].isnull()).astype(np.int32)
    dt['is_pos'] = (~dt['pos_address_lat'].isnull()).astype(np.int32)

    dt['address_lat'] = dt['atm_address_lat'].fillna(0) + dt['pos_address_lat'].fillna(0)
    dt['address_lon'] = dt['atm_address_lon'].fillna(0) + dt['pos_address_lon'].fillna(0)

    dt.drop(['atm_address_lat','atm_address_lon','pos_address_lat','pos_address_lon'], axis = 1, inplace = True)

    # удалим транзакции без адреса
    dt.drop(dt[((dt['address_lon'] == 0) & (dt['address_lon'] == 0))].index, axis = 0, inplace = True)
    
    return dt

### Генерируем целевые переменные

In [10]:
def generating_target(dt):
    lat = dt['home_add_lat'] - dt['address_lat']
    lon = dt['home_add_lon'] - dt['address_lon']
    dt['is_home'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
    dt['has_home'] = (~dt['home_add_lon'].isnull()).astype(np.int32)

    lat = dt['work_add_lat'] - dt['address_lat']
    lon = dt['work_add_lon'] - dt['address_lon']
    dt['is_work'] = (np.sqrt((lat ** 2) + (lon ** 2)) <= 0.02).astype(np.int32)
    dt['has_work'] = (~dt['work_add_lon'].isnull()).astype(np.int32)

    dt.drop(['work_add_lat','work_add_lon','home_add_lat','home_add_lon'], axis = 1, inplace = True)
    
    return dt

### Генерируем категориальный признак для адреса

In [11]:
def address(dt):
    dt['address'] = dt['address_lat'].apply(lambda x: "%.02f" % x) + ';' + dt['address_lon'].apply(lambda x: "%.02f" % x)
    dt['address'] = dt['address'].factorize()[0].astype(np.int32)
    
    return dt

### Генерируем несколько абонентских фич

In [12]:
def add_features(dt):    
    # количество транзакций каждого клиента
    dt = dt.merge(dt.groupby('customer_id')['amount'].count().reset_index(name = 'tx'), how = 'left')
    dt['tx'] = dt['tx'].astype(np.int32)

    dt = dt.merge(dt.groupby(['customer_id','address'])['amount'].count().reset_index(name = 'tx_cust_addr'), how = 'left')
    dt['tx_cust_addr'] = dt['tx_cust_addr'].astype(np.int32)

    # какая часть транзакций клиента приходится на данный адрес
    dt['ratio1'] = dt['tx_cust_addr'] / dt['tx']
    dt['city'] = 
    
    return dt

In [151]:
coords.head()

Unnamed: 0,address_lat,address_lon,0,1,2,3,4,5,6,7,8,9,10
0,59.844072,30.179153,Nan,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
1,59.858198,30.229024,Nan,О’Кей,31 к1,проспект Маршала Жукова,Юго-Запад,округ Юго-Запад,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
2,54.982358,82.892559,Nan,Гранит,5,площадь Карла Маркса,Ленинский район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ
3,55.026997,82.920634,Nan,Своя компания,17,Красный проспект,Центральный район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ
4,54.964984,82.928976,Nan,Nan,48,улица Ватутина,Кировский район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ


In [150]:
coords = coords.drop('string', 1)

In [140]:
coords['string'] = coords['string'].apply(lambda x: 'Nan,'+x if len(x.split(','))<11 else ''+x)
coords['string'] = coords['string'].apply(lambda x: 'Nan,'+x if len(x.split(','))<11 else ''+x)
coords['string'] = coords['string'].apply(lambda x: 'Nan,'+x if len(x.split(','))<11 else ''+x)
coords['string'] = coords['string'].apply(lambda x: 'Nan,'+x if len(x.split(','))<11 else ''+x)

In [145]:
e=coords['string']
e = e.apply(lambda x: pd.Series(x.split(',')))

In [146]:
e = e.drop(e.columns[[11,12,13,14,15,16,17,18]], axis=1)

In [148]:
coords = pd.concat([coords, e], axis=1)

In [152]:
e.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Nan,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
1,Nan,О’Кей,31 к1,проспект Маршала Жукова,Юго-Запад,округ Юго-Запад,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
2,Nan,Гранит,5,площадь Карла Маркса,Ленинский район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ
3,Nan,Своя компания,17,Красный проспект,Центральный район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ
4,Nan,Nan,48,улица Ватутина,Кировский район,Новосибирск,городской округ Новосибирск,Новосибирская область,СФО,630000,РФ


In [110]:
coords['string'] = coords['string'].apply(lambda x: pd.DataFrame(x.split(',')))

In [101]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4156417 entries, 0 to 4156416
Data columns (total 21 columns):
amount              float32
city                int32
country             int32
currency            int32
customer_id         object
is_train            int32
mcc                 int32
transaction_date    datetime64[ns]
weekday             int32
busnessday          int32
car                 int32
is_atm              int32
is_pos              int32
address_lat         float64
address_lon         float64
is_home             int32
has_home            int32
is_work             int32
has_work            int32
address             int32
string              object
dtypes: datetime64[ns](1), float32(1), float64(2), int32(15), object(2)
memory usage: 444.0+ MB


In [136]:
with open('train_coords.pcl', 'rb') as f:
   coords_train = pickle.load(f, encoding='latin1')

with open('test_coords.pcl', 'rb') as f:
   coords_test = pickle.load(f, encoding='latin1')

coords = pd.concat([coords_train, coords_test])
coords.columns = ['address_lat', 'address_lon','string']

In [187]:
dt = pd.merge(dt, coords, how='left', on=['address_lat', 'address_lon'])

In [73]:
dt[dt.string.apply(lambda x: type(x)==str)]

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,busnessday,...,address_lon,is_home,has_home,is_work,has_work,address,tx,tx_cust_addr,ratio1,string
973007,4.494472,4384,0,643,24485c68fc0dc95e576e4663e79c25ac,1,6011,2017-08-09,2,1,...,61.375,0,1,1,1,2214,234,15,0.064103,"16А, улица Рылеева, посёлок Уфимские Каменные ..."
973018,3.848732,4384,0,643,24485c68fc0dc95e576e4663e79c25ac,1,6011,2017-10-13,4,1,...,61.375,0,1,1,1,2214,234,15,0.064103,"16А, улица Рылеева, посёлок Уфимские Каменные ..."
2058454,3.978321,4171,0,643,8e7c42ec8d07250973267b3bf0f68f30,0,6011,2017-04-04,1,1,...,45.125,0,0,0,0,20398,107,5,0.046729,"5, улица Октябрьская, Ялга, городской округ Са..."
2058459,3.683675,4171,0,643,8e7c42ec8d07250973267b3bf0f68f30,0,6011,2017-04-03,0,1,...,45.125,0,0,0,0,20398,107,5,0.046729,"5, улица Октябрьская, Ялга, городской округ Са..."
2058500,4.174926,4171,0,643,7b63dd0323bd213224ad5eb32d096cf0,0,6011,2017-06-05,0,1,...,45.125,0,0,0,0,20398,60,2,0.033333,"5, улица Октябрьская, Ялга, городской округ Са..."
2153373,3.894478,4269,0,643,a84e118144186736fd63fc7284041aa5,0,6011,2017-07-15,5,0,...,56.25,0,0,0,0,5821,51,5,0.098039,"56, Комсомольский проспект, Громова, Свердловс..."


In [161]:
dt.tail()

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,busnessday,...,1,2,3,4,5,6,7,8,9,10
4156412,3.84062,4314,0,643,813ba66e2908a79c3b7ce3caa5ea1f0d,0,6011,2017-03-26,6,0,...,4,18684,426,4,211,102,41,5,5459,0
4156413,3.84062,4314,0,643,813ba66e2908a79c3b7ce3caa5ea1f0d,0,6011,2017-03-26,6,0,...,4,18684,426,4,211,102,41,5,5459,0
4156414,4.082534,4314,0,643,01dd11d12be651d9a5129ea15ad45318,0,6011,2017-06-12,0,0,...,4,18685,14985,4,211,102,41,5,273,0
4156415,2.606452,4314,0,643,881dc77f6fc5a4bf50755243e0043f5c,0,6011,2017-10-31,1,1,...,4,18684,426,4,211,102,41,5,5459,0
4156416,2.606452,4314,0,643,881dc77f6fc5a4bf50755243e0043f5c,0,6011,2017-10-31,1,1,...,4,18684,426,4,211,102,41,5,5459,0


In [188]:
dt.head()

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,busnessday,...,1,2,3,4,5,6,7,8,9,10
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
1,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
2,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,1,...,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
3,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,1,...,О’КЕЙ,2,улица Партизана Германа,Лигово,округ Урицк,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ
4,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,1,1,...,О’Кей,31 к1,проспект Маршала Жукова,Юго-Запад,округ Юго-Запад,Красносельский район,Санкт-Петербург,Северо-Западный федеральный округ,190000,РФ


In [189]:
dt[0] = dt[0].factorize()[0].astype(np.int32)
dt[1] = dt[1].factorize()[0].astype(np.int32)
dt[2] = dt[2].factorize()[0].astype(np.int32)
dt[3] = dt[3].factorize()[0].astype(np.int32)
dt[4] = dt[4].factorize()[0].astype(np.int32)
dt[5] = dt[5].factorize()[0].astype(np.int32)
dt[6] = dt[6].factorize()[0].astype(np.int32)
dt[7] = dt[7].factorize()[0].astype(np.int32)
dt[8] = dt[8].factorize()[0].astype(np.int32)
dt[9] = dt[9].factorize()[0].astype(np.int32)
dt[10] = dt[10].factorize()[0].astype(np.int32)

In [13]:
###Соединяем фичи

In [186]:
date_upgrade(dt)
data_features(dt)
car_feature(dt)
pos_atm(dt)
generating_target(dt)
address(dt)
add_features(dt)

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,busnessday,...,address_lat,address_lon,is_home,has_home,is_work,has_work,address,tx,tx_cust_addr,ratio1
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
1,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,1,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,1,1,...,59.858198,30.229024,1,1,0,1,1,39,5,0.128205
3,2.787498,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-09-09,5,0,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
4,2.892510,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-06,3,1,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
5,2.909018,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-08-23,2,1,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
6,2.801228,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-29,5,0,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
7,2.838200,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
8,3.264740,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-08-18,4,1,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615
9,3.118792,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-05-13,5,0,...,59.844072,30.179153,0,1,1,1,0,39,15,0.384615


## Вспомогательные функции для оценки точности классификатора

In [15]:
def _best(x):
    ret = None
    for col in ys:
        pred = ('pred:%s' % col)
        if pred in x:
            i = (x[pred].idxmax())
            cols = [pred,'address_lat','address_lon']
            if col in x:
                cols.append(col)
            tmp = x.loc[i,cols]
            tmp.rename({
                'address_lat':'%s:add_lat' % col,
                'address_lon':'%s:add_lon' % col,
            }, inplace = True)
            if ret is None:
                ret = tmp
            else:
                ret = pd.concat([ret, tmp])
    return ret

In [16]:
def predict_proba(dt, ys = ['is_home', 'is_work']):
    for col in ys:
        pred = ('pred:%s' % col)
        dt[pred] = model[col].predict_proba(dt[xs])[:,1]
    return dt.groupby('customer_id').apply(_best).reset_index()

In [17]:
def score(dt, ys = ['is_home', 'is_work']):
    dt_ret = predict_proba(dt, ys)
    mean = 0.0
    for col in ys:
        col_mean = dt_ret[col].mean()
        mean += col_mean
    if len(ys) == 2:
        mean = mean / len(ys)
    return mean

### Признаки, на которых будем обучать модель

In [190]:
#xs = ['amount','currency','city','country','mcc','is_atm','is_pos','ratio1','weekday','busnessday', 'car', '1']
xs = [i for i in dt.columns.tolist() if i not in ['is_home', 'is_work','customer_id','transaction_date','is_train','address','transaction_date']]
ys = ['is_home', 'is_work']

In [183]:
dt.head()

Unnamed: 0,amount,city,country,currency,customer_id,is_train,mcc,transaction_date,weekday,busnessday,...,4,5,6,7,8,9,10,tx,tx_cust_addr,ratio1
0,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,0,0,0,0,0,0,0,74,30,0.405405
1,2.884034,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-07-15,5,0,...,0,0,0,0,0,0,0,74,30,0.405405
2,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,1,...,0,0,0,0,0,0,0,74,30,0.405405
3,2.775633,0,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5261,2017-10-27,4,1,...,0,0,0,0,0,0,0,74,30,0.405405
4,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,1,5992,2017-10-03,1,1,...,1,1,0,0,0,0,0,74,9,0.121622


In [19]:
#good_col = ['amount', 'atm_address_lat', 'atm_address_lon', 'currency', 'mcc', 'pos_address_lat','pos_address_lon']

In [20]:
#good_col = [i for i in dt.columns.tolist() if i not in ['target_home', 'target_work','customer_id','transaction_date']] 

In [21]:
#good_col_test = [i for i in dt.columns.tolist() if i not in ['customer_id','transaction_date']]

# Создаем классификаторы
**Hint**: можно поигратьcя с гиперпараметрами для лучшего результата :)

In [165]:
model0 = {
    'is_home': CatBoostClassifier(n_estimators = 100),
    'is_work': CatBoostClassifier(n_estimators = 100),
}

In [176]:
dt = add_features(dt)

In [177]:
dt.columns

Index([          'amount',             'city',          'country',
               'currency',      'customer_id',         'is_train',
                    'mcc', 'transaction_date',          'weekday',
             'busnessday',              'car',           'is_atm',
                 'is_pos',      'address_lat',      'address_lon',
                'is_home',         'has_home',          'is_work',
               'has_work',          'address',                  0,
                        1,                  2,                  3,
                        4,                  5,                  6,
                        7,                  8,                  9,
                       10,               'tx',     'tx_cust_addr',
                 'ratio1'],
      dtype='object')

# Обучаем классификаторы

In [191]:
model = {}

# последовательно обучаем два классификатора
for col in ['is_home', 'is_work']:
    
    #выберем для обучение транзакции только тех клиентов из train, у которых хоть в одной транзакции указано место работы/жительства
    cust_train = dt[dt['is_train'] == 1].groupby('customer_id')[col.replace('is_','has_')].max()
    cust_train = cust_train[cust_train > 0].index
    
    #разобъем train на train/valid для валидации
    cust_train, cust_valid = train_test_split(cust_train, test_size = 0.1, shuffle = True, random_state = 2)
    
    train = pd.DataFrame(cust_train, columns = ['customer_id']).merge(dt, how = 'left')
    valid = pd.DataFrame(cust_valid, columns = ['customer_id']).merge(dt, how = 'left')

    print ("Training:", col)
    clf = sklearn.base.clone(model0[col])
    clf.fit(train[xs], train[col], verbose=True)
    model[col] = clf
    print ("Train accuracy:", score(train, ys = [col]))
    print ("Test accuracy:", score(valid, ys = [col]))
    print ()


Training: is_home
0:	learn: 0.6801137	total: 905ms	remaining: 1m 29s
1:	learn: 0.6679625	total: 1.82s	remaining: 1m 29s
2:	learn: 0.6566551	total: 2.69s	remaining: 1m 27s
3:	learn: 0.6461486	total: 3.6s	remaining: 1m 26s
4:	learn: 0.6364405	total: 4.46s	remaining: 1m 24s
5:	learn: 0.6273382	total: 5.34s	remaining: 1m 23s
6:	learn: 0.6189396	total: 6.13s	remaining: 1m 21s
7:	learn: 0.6110869	total: 6.9s	remaining: 1m 19s
8:	learn: 0.6038468	total: 7.68s	remaining: 1m 17s
9:	learn: 0.5969779	total: 8.52s	remaining: 1m 16s
10:	learn: 0.5906354	total: 9.32s	remaining: 1m 15s
11:	learn: 0.5847763	total: 10.1s	remaining: 1m 14s
12:	learn: 0.5793304	total: 10.9s	remaining: 1m 12s
13:	learn: 0.5741485	total: 11.7s	remaining: 1m 11s
14:	learn: 0.5693968	total: 12.5s	remaining: 1m 10s
15:	learn: 0.5644624	total: 13.3s	remaining: 1m 9s
16:	learn: 0.5605186	total: 14.2s	remaining: 1m 9s
17:	learn: 0.5567231	total: 15.1s	remaining: 1m 8s
18:	learn: 0.5530721	total: 15.9s	remaining: 1m 7s
19:	learn:

60:	learn: 0.4059578	total: 26.1s	remaining: 16.7s
61:	learn: 0.4053362	total: 26.5s	remaining: 16.2s
62:	learn: 0.4045944	total: 26.9s	remaining: 15.8s
63:	learn: 0.4041032	total: 27.3s	remaining: 15.4s
64:	learn: 0.4035788	total: 27.7s	remaining: 14.9s
65:	learn: 0.4029613	total: 28.2s	remaining: 14.5s
66:	learn: 0.4024336	total: 28.6s	remaining: 14.1s
67:	learn: 0.4019699	total: 29s	remaining: 13.6s
68:	learn: 0.4013854	total: 29.4s	remaining: 13.2s
69:	learn: 0.4008786	total: 29.9s	remaining: 12.8s
70:	learn: 0.4003640	total: 30.3s	remaining: 12.4s
71:	learn: 0.3999225	total: 30.8s	remaining: 12s
72:	learn: 0.3993800	total: 31.3s	remaining: 11.6s
73:	learn: 0.3989524	total: 31.8s	remaining: 11.2s
74:	learn: 0.3983471	total: 32.2s	remaining: 10.7s
75:	learn: 0.3977580	total: 32.6s	remaining: 10.3s
76:	learn: 0.3972660	total: 33s	remaining: 9.85s
77:	learn: 0.3968712	total: 33.4s	remaining: 9.42s
78:	learn: 0.3965215	total: 33.8s	remaining: 8.99s
79:	learn: 0.3961064	total: 34.2s	rem

# Predict

In [192]:
cust_test = dt[dt['is_train'] == 0]['customer_id'].unique()
test = pd.DataFrame(cust_test, columns = ['customer_id']).merge(dt, how = 'left')
test = predict_proba(test)
test.rename(columns = {
        'customer_id':'_ID_',
        'is_home:add_lat': '_HOME_LAT_',
        'is_home:add_lon': '_HOME_LON_',
        'is_work:add_lat': '_WORK_LAT_',
        'is_work:add_lon': '_WORK_LON_'}, inplace = True)
test = test[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]


# Формируем submission-файл

In [193]:
# Заполняем пропуски
submission = submission.merge(test, how = 'left').fillna(0)

# Пишем файл submission
submission.to_csv('baseline-very-simple111.csv', index = False)