# Submission generation

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve

from src.features import generate_features
from src.models.model import ModelSick

# Widen the notebook's DataFrame rendering limits: up to 1000 columns and
# 100 rows are shown before pandas truncates the display.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100

In [2]:
# Load the raw input tables.
# The first three files are ';'-separated; the weather file uses '|'.
sot, rod, ogrv = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('data/sotrudniki.csv', 'data/rodstvenniki.csv', 'data/OGRV.csv')
)
weather = pd.read_csv('data/Weather.csv', sep='|')

In [3]:
# Preview `sot` without the columns listed below. A listed column that is not
# present in the frame is simply skipped (errors='ignore').
sot.drop(
    columns=[
        'hash_tab_num', 'date', 'category', 'age', 'is_local', 'gender',
        'razryad_fact', 'razryad_post', 'work_experience_company',
        'work_experience_all', 'name_fact_lvl5', 'education',
        'home_to_work_distance',
    ],
    errors='ignore',
)

Unnamed: 0,date_of_birth,name_post_lvl4,name_post_lvl5,prof_post_augment,name_fact_lvl4,prof_fact_augment,married,child_num,work_experience_factory,sick,home_to_work_duration
0,1985,,,Должность_0,,Должность_0,хол/нз,1,9.0,0,
1,1985,,,Должность_0,,Должность_0,хол/нз,1,9.0,0,
2,1985,,,Должность_0,,Должность_0,хол/нз,1,9.0,0,
3,1985,,,Должность_0,,Должность_0,хол/нз,1,9.0,0,
4,1985,,,Должность_0,,Должность_0,хол/нз,1,9.0,0,
...,...,...,...,...,...,...,...,...,...,...,...
99209,1980,,,,,Должность_19,жен/зм,2,,0,6.878333
99210,1982,,,,,Должность_19,разв.,1,,0,
99211,1994,,,,,Должность_19,хол/нз,0,,0,
99212,1998,Отделение_8,передел 7,Должность_56,Отделение_8,Должность_32,хол/нз,0,,1,3.113333


In [4]:
# Autoreload mode 2: re-import edited project modules before the cell runs,
# so changes to src.features are picked up without restarting the kernel.
%autoreload 2

# Build the model inputs from the four raw tables. X is later filtered by its
# `date` column and y is later checked row-wise for NaNs (axis=1).
# NOTE(review): the feature/target schema is defined inside generate_features
# and is not visible in this notebook — confirm there.
X, y = generate_features(sot, rod, ogrv, weather)

1 (99214, 710)
(99214, 730)
2 (99214, 746)
3 (99214, 746)


In [14]:
# Raw hyper-parameter values for LightGBM.
# NOTE(review): presumably the output of a hyper-parameter search (floats kept
# exactly as found) — confirm the provenance.
best = {'feature_fraction': 0.8500000000000001,
 'lambda_l1': 2.8000000000000003,
 'lambda_l2': 5.5,
 'learning_rate': 0.05,
 'max_bin': 985.0,
 'max_depth': 5.0,
 'min_data_in_leaf': 500.0,
 'num_leaves': 61.0,
 'path_smooth': 2.7750000000000004}

# Normalise the raw values: integer-valued parameters are cast to int, the
# float-valued ones are rounded to 3 decimals, and the fixed training settings
# (objective, metric, thread count) are appended.
best_params = {
    'num_leaves': int(best['num_leaves']),
    'max_bin': int(best['max_bin']),
    'max_depth': int(best['max_depth']),
    'learning_rate': round(best['learning_rate'], 3),
    'path_smooth': round(best['path_smooth'], 3),
    'lambda_l1': round(best['lambda_l1'], 3),
    'lambda_l2': round(best['lambda_l2'], 3),
    'min_data_in_leaf': int(best['min_data_in_leaf']),
    'feature_fraction': round(best['feature_fraction'], 3),
    'objective': 'binary',
    'metric': 'auc',
    'nthread': 7,
}

# Number of boosting rounds used for every key below.
nround = 100

# One entry per key 1..12 (the notebook later reads predictions 'y_1'..'y_12').
# Each key gets its OWN copy of the parameter dict: sharing a single dict
# object would make any per-key tweak (or any in-place change by the consumer)
# silently affect all twelve entries and best_params itself.
params = {i: dict(best_params) for i in range(1, 13)}
nrounds = {i: nround for i in range(1, 13)}

In [15]:
# Train/test split for the final forecast: rows dated strictly before the
# split date are used for training, rows dated exactly on it are scored.
split_date = pd.to_datetime('2019-08-01')
is_history = X.date < split_date

X_train = X[is_history]
y_train = y[is_history]

# Keep only training rows whose targets are all present. The mask is derived
# once from the still-unfiltered y_train and applied to both frames.
has_all_targets = ~y_train.isna().any(axis=1)
X_train = X_train[has_all_targets]
y_train = y_train[has_all_targets]

X_test = X[X.date == split_date]

In [16]:
# Autoreload mode 2: pick up edits to the project modules before running.
%autoreload 2

# Build the model from the per-key parameter and round-count dicts, then fit
# it on the training split.
# NOTE(review): the positional arguments 1 and 3 are not explained anywhere in
# this notebook — confirm their meaning against ModelSick.__init__.
model = ModelSick(params, nrounds, 1, 3)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 7519, number of negative: 55302
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 287502
[LightGBM] [Info] Number of data points in the train set: 62821, number of used features: 705
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119689 -> initscore=-1.995376
[LightGBM] [Info] Start training from score -1.995376
[LightGBM] [Info] Number of positive: 7535, number of negative: 55286
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 287502
[LightGBM] [Info] Number of data points in the train set: 62821, number of used features: 705
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119944 -> initscore=-1.992961
[LightGBM] [Info] Start training from score -1.992961
f1_score_max = 0.4475138121546961
f1_score_max = 0.3614884819846426
f1_score_max = 0.35939196525515743
f1_score_max = 0.33775933609958503
f1_score_max = 0.33416708354177094
f1_score_max = 0.3474025974025973
f1_score_

In [17]:
# Autoreload mode 2: pick up edits to the project modules before running.
%autoreload 2
# Score the X_test rows. The result is later indexed by 'hash_tab_num' and by
# 'y_1' .. 'y_12' when the submission is assembled.
predictions = model.predict(X_test)

In [18]:
# Reshape the model output into long ("flat") form: one row per
# (hash_tab_num, date) pair with the corresponding prediction in `target`.
#
# Column 'y_<i>' is mapped to the month 2019-09-01 + (i - 1) months, for
# i = 1..12. The per-month frames are collected in a list and concatenated
# once: growing a frame with pd.concat inside the loop is quadratic, and
# seeding it with an empty frame relies on deprecated concat behaviour that can
# degrade the column dtypes to object.
first_month = pd.to_datetime('2019-09-01')
monthly_frames = [
    pd.DataFrame({
        'hash_tab_num': predictions['hash_tab_num'],
        'date': first_month + pd.DateOffset(months=i - 1),
        'target': predictions['y_' + str(i)],
    })
    for i in range(1, 13)
]
one_line_sub = pd.concat(monthly_frames, axis=0)

In [19]:
# Prepare the submission template: parse `date` as a datetime and discard the
# placeholder `target` column, leaving the rows to be filled with predictions.
submission = (
    pd.read_csv('data/submission_check.csv', sep=';')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .drop(columns='target')
)
submission.head()

Unnamed: 0,hash_tab_num,date
0,0,2019-09-01
1,0,2019-10-01
2,0,2019-11-01
3,0,2019-12-01
4,0,2020-01-01


In [20]:
# Make the join key numeric, then attach the predictions to the template.
# The left join keeps every template row; rows with no matching prediction
# end up with a missing `target`.
one_line_sub['hash_tab_num'] = pd.to_numeric(one_line_sub['hash_tab_num'])
submission_final = submission.merge(
    one_line_sub,
    how='left',
    on=['hash_tab_num', 'date'],
)

In [22]:
# Write the final submission as a ';'-separated CSV without the index column.
submission_final.to_csv(
    'my_submission_25.csv',
    index=False,
    sep=';',
)