# Submission generation

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve

from src.features import generate_features
from src.models.model import ModelSick

# Raise pandas' display limits so that up to 1000 columns / rows are rendered
# when a frame is inspected in the notebook.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [2]:
# Read the raw input tables from the data/ directory.
# Three of the files are semicolon-separated; the weather file uses '|' instead.
semicolon_sources = (
    'data/sotrudniki.csv',
    'data/rodstvenniki.csv',
    'data/OGRV.csv',
)
sot, rod, ogrv = (pd.read_csv(path, sep=';') for path in semicolon_sources)
weather = pd.read_csv('data/Weather.csv', sep='|')

In [3]:
# Build the model inputs from the four raw tables. Later cells rely on X having a
# `date` column and on y being a frame of targets (presumably row-aligned with X --
# TODO confirm in src/features). The numbers printed while this runs look like the
# shapes of intermediate frames.
X, y = generate_features(sot, rod, ogrv, weather)

1 (99214, 597)
(99214, 597)
2 (99214, 608)
3 (99214, 608)


In [4]:
# Tuned hyper-parameter values as raw floats (presumably the output of a search
# run -- TODO confirm the source). Integer-valued settings still need casting.
best = {
    'feature_fraction': 0.4,
    'lambda_l1': 12.9,
    'lambda_l2': 14.600000000000001,
    'learning_rate': 0.1,
    'max_bin': 310.0,
    'max_depth': 5.0,
    'min_data_in_leaf': 1200.0,
    'num_leaves': 84.0,
    'path_smooth': 2.325,
}

# Parameters in the form handed to the model: count-like settings are cast to int,
# continuous ones are rounded to 3 decimals (which also strips float noise such as
# 14.600000000000001 -> 14.6). The last three entries are fixed, not tuned.
best_params = {
    'num_leaves': int(best['num_leaves']),
    'max_bin': int(best['max_bin']),
    'max_depth': int(best['max_depth']),
    'learning_rate': round(best['learning_rate'], 3),
    'path_smooth': round(best['path_smooth'], 3),
    'lambda_l1': round(best['lambda_l1'], 3),
    'lambda_l2': round(best['lambda_l2'], 3),
    'min_data_in_leaf': int(best['min_data_in_leaf']),
    'feature_fraction': round(best['feature_fraction'], 3),
    'objective': 'binary',
    'metric': 'auc',
    'nthread': 7,
}
nround = 15

# Both dicts are keyed 1..12 (presumably one entry per forecast month / y_<i>
# target -- TODO confirm against ModelSick).
# Every key gets its OWN copy of the parameter dict: sharing a single dict object
# would let an in-place change made for one key silently affect all twelve.
params = {i: dict(best_params) for i in range(1, 13)}
nrounds = {i: nround for i in range(1, 13)}

In [5]:
# Split into the training set and the rows used for the final forecast.
cutoff = pd.to_datetime('2019-08-01')

# Training candidates: every row dated strictly before the cutoff ...
is_before_cutoff = X.date < cutoff
X_train, y_train = X[is_before_cutoff], y[is_before_cutoff]

# ... of which only rows with no missing value in any target column are kept.
has_all_targets = y_train.notna().all(axis=1)
X_train, y_train = X_train[has_all_targets], y_train[has_all_targets]

# Rows dated exactly at the cutoff are the ones the final prediction is made for.
X_test = X[X.date == cutoff]

In [6]:
# Reload edited project modules before running, so changes to ModelSick are picked up.
%autoreload 2

# NOTE(review): the positional arguments 4 and 3 are not explained here -- confirm
# their meaning against ModelSick's signature and consider passing them by keyword.
model = ModelSick(params, nrounds, 4, 3)
# Fit on the rows before 2019-08-01 that have complete targets; the cell output
# below is LightGBM's training log.
model.fit(X_train, y_train)

 positive: 6805, number of negative: 56016
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100380
[LightGBM] [Info] Number of data points in the train set: 62821, number of used features: 561
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.108324 -> initscore=-2.107980
[LightGBM] [Info] Start training from score -2.107980
[LightGBM] [Info] Number of positive: 6802, number of negative: 56019
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100380
[LightGBM] [Info] Number of data points in the train set: 62821, number of used features: 561
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.108276 -> initscore=-2.108474
[LightGBM] [Info] Start training from score -2.108474
[LightGBM] [Info] Number of positive: 6869, number of negative: 55952
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100380
[LightGBM] [Info] Number of data points in the train set: 62821, number of used

In [7]:
# Reload edited project modules before running (same magic as in the training cell).
%autoreload 2
# Score the rows dated 2019-08-01. Later cells read the `hash_tab_num` column and
# the `y_1` .. `y_12` columns from the result.
predictions = model.predict(X_test)

In [8]:
# Reshape the wide model output (one y_<i> column per month) into long format:
# one row per (hash_tab_num, date) with the predicted value in `target`.
# y_1 maps to 2019-09-01, y_2 to 2019-10-01, ... y_12 to 2020-08-01.
first_forecast_month = pd.to_datetime('2019-09-01')

# Build one frame per month and concatenate ONCE. Growing the result with
# pd.concat inside the loop re-copies all accumulated rows on every iteration,
# and seeding it with an empty column-only frame can degrade column dtypes to object.
monthly_frames = [
    pd.DataFrame({
        'hash_tab_num': predictions['hash_tab_num'],
        # scalar timestamp is broadcast to every row of this month's frame
        'date': first_forecast_month + pd.DateOffset(months=i - 1),
        'target': predictions['y_' + str(i)],
    })
    for i in range(1, 13)
]
# ignore_index gives the long frame a clean 0..n-1 index instead of twelve
# repeated copies of the predictions index.
one_line_sub = pd.concat(monthly_frames, axis=0, ignore_index=True)

In [9]:
# Prepare the submission template: parse its dates and drop its `target` column,
# which is replaced by the model's predictions in the merge that follows.
submission = (
    pd.read_csv('data/submission_check.csv', sep=';')
    .assign(date=lambda frame: pd.to_datetime(frame.date, format='%Y-%m-%d'))
    .drop(columns='target')
)
submission.head()

Unnamed: 0,hash_tab_num,date
0,0,2019-09-01
1,0,2019-10-01
2,0,2019-11-01
3,0,2019-12-01
4,0,2020-01-01


In [19]:
# Force the join key to a numeric dtype (presumably to match the template's
# hash_tab_num column), then attach the predictions to the template rows.
# how='left' keeps every template row, so unmatched rows end up with a NaN target.
one_line_sub['hash_tab_num'] = pd.to_numeric(one_line_sub['hash_tab_num'])
submission_final = submission.merge(
    one_line_sub,
    how='left',
    on=['hash_tab_num', 'date'],
)

In [20]:
# Write the final submission as a semicolon-separated CSV, without the index column.
submission_final.to_csv('my_submission_14.csv', sep=';', index=False)

In [10]:
!pip freeze


aiohttp==3.7.4.post0
appdirs==1.4.4
appnope==0.1.2
async-timeout==3.0.1
attrs==20.3.0
backcall==0.2.0
chardet==4.0.0
clikit==0.6.2
cloudpickle==1.6.0
cmdstanpy==0.9.68
convertdate==2.3.2
crashtest==0.3.1
cycler==0.10.0
Cython==0.29.17
decorator==4.4.2
ephem==3.7.7.1
et-xmlfile==1.0.1
future==0.18.2
hijri-converter==2.1.1
holidays==0.11.1
httpstan==4.4.2
hyperopt==0.2.5
idna==3.1
ipykernel==5.5.3
ipython==7.22.0
ipython-genutils==0.2.0
jedi==0.18.0
joblib==1.0.1
jupyter-client==6.1.12
jupyter-core==4.7.1
kiwisolver==1.3.1
korean-lunar-calendar==0.2.1
lightgbm==3.2.1
LunarCalendar==0.0.9
lz4==3.1.3
marshmallow==3.11.1
matplotlib==3.4.1
multidict==5.1.0
networkx==2.5.1
numpy==1.20.2
openpyxl==3.0.7
pandas==1.2.4
parso==0.8.2
pastel==0.2.1
patsy==0.5.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.2.0
plotly==4.14.3
pmdarima==1.8.0
prompt-toolkit==3.0.18
prophet==1.0.1
ptyprocess==0.7.0
Pygments==2.8.1
pylev==1.3.0
PyMeeus==0.5.11
pyparsing==2.4.7
pysimdjson==3.2.0
pystan==2.19.1.1
python-da