# Submission generation

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve

from src.features import generate_features
from src.models.model import ModelSick

# Widen the notebook display limits so wide/long frames are shown in full.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [2]:
# Read the raw input tables.
# The first three files are ';'-separated; the weather file uses '|'.
sot, rod, ogrv = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'data/sotrudniki.csv',
        'data/rodstvenniki.csv',
        'data/OGRV.csv',
    )
)
weather = pd.read_csv('data/Weather.csv', sep='|')

In [3]:
# Build the feature frame X and the target frame y from the four raw tables.
# Later cells rely on X having a `date` column and on y being a DataFrame
# (it is filtered with `.isna().any(axis=1)`).
# NOTE(review): the numbered shapes printed below come from inside
# generate_features; their meaning is not visible from this notebook.
X, y = generate_features(sot, rod, ogrv, weather)

1 (99214, 485)
(99214, 505)
2 (99214, 516)
3 (99214, 516)


In [23]:
# Raw hyper-parameter values (presumably the output of a tuning run: the
# integer-valued parameters are still stored as floats here).
best = {
    'feature_fraction': 0.8,
    'lambda_l1': 12.9,
    'lambda_l2': 14.600000000000001,
    'learning_rate': 0.1,
    'max_bin': 310.0,
    'max_depth': 5.0,
    'min_data_in_leaf': 1200.0,
    'num_leaves': 84.0,
    'path_smooth': 2.325,
}

# Cleaned parameter set handed to the models: counts are cast to int,
# continuous values are rounded to 3 decimals, and the fixed training
# settings (objective / metric / thread count) are appended.
# NOTE(review): the parameter names look like LightGBM's - confirm in ModelSick.
best_params = {
    'num_leaves': int(best['num_leaves']),
    'max_bin': int(best['max_bin']),
    'max_depth': int(best['max_depth']),
    'learning_rate': round(best['learning_rate'], 3),
    'path_smooth': round(best['path_smooth'], 3),
    'lambda_l1': round(best['lambda_l1'], 3),
    'lambda_l2': round(best['lambda_l2'], 3),
    'min_data_in_leaf': int(best['min_data_in_leaf']),
    'feature_fraction': round(best['feature_fraction'], 3),
    'objective': 'binary',
    'metric': 'auc',
    'nthread': 7,
}
# Number of boosting rounds used for every key below.
nround = 15

# One entry per key 1..12 (the same range is used for the `y_1`..`y_12`
# prediction columns further down). Each key gets its OWN copy of the
# parameters: sharing a single dict object would make any later per-key
# tweak silently apply to all twelve entries and to `best_params` itself.
params = {i: dict(best_params) for i in range(1, 13)}
nrounds = {i: nround for i in range(1, 13)}

In [26]:
# Build one ModelSick per random seed (10 seeds: 0..9).
#
# The seed must live INSIDE each per-key parameter dict. Writing
# `params["random_state"] = i` instead adds a stray top-level key next to the
# keys 1..12 and mutates the single `params` object that every model shares,
# so all ten models would end up looking at the same (last) seed.
# NOTE(review): assumes ModelSick forwards each per-key dict to the underlying
# booster, where `random_state` is an accepted seed alias - confirm in
# src/models/model.py.
models = []

for i in range(10):
    # Fresh dicts for every model: nothing is shared between seeds, and the
    # original `params` is left untouched.
    seeded_params = {
        horizon: {**horizon_params, "random_state": i}
        for horizon, horizon_params in params.items()
    }
    models.append(ModelSick(seeded_params, nrounds))

# Later cells call `model.fit`, `model.predict` and `model.models[...]`, but no
# visible cell binds `model`; use the first seed so the notebook survives
# Restart & Run All.
# NOTE(review): if seed-averaging was intended, those cells should iterate
# over `models` instead.
model = models[0]

In [25]:
# Split into train and test for the subsequent final forecast.
cutoff_date = pd.to_datetime('2019-08-01')

# Train: every row strictly before the cut-off month.
before_cutoff = X.date < cutoff_date
X_train, y_train = X[before_cutoff], y[before_cutoff]

# Keep only rows whose targets are all known; the mask is computed once from
# the unfiltered y_train and applied to both frames so they stay row-aligned.
targets_complete = ~y_train.isna().any(axis=1)
X_train = X_train[targets_complete]
y_train = y_train[targets_complete]

# Test: the cut-off month itself.
X_test = X[X.date == cutoff_date]

In [19]:
# Sanity check: (rows, columns) of the training features after the date
# cut-off and the missing-target filter.
X_train.shape

(69354, 504)

In [20]:
# Sanity check: the row count must match X_train, since both frames were
# filtered with the same masks.
y_train.shape

(69354, 14)

In [22]:
# Train on the pre-cut-off rows.
# NOTE(review): `model` is expected to be a ModelSick instance bound before
# this cell runs. The execution counts in this notebook are out of order, so
# the recorded output below may come from an earlier version of the code -
# re-run from a fresh kernel to confirm.
model.fit(X_train, y_train)

TypeError: fit() got an unexpected keyword argument 'random_state'

In [10]:
# Score the cut-off-month rows. The next cell reads `hash_tab_num` and
# `y_1`..`y_12` from the result, so `predictions` is expected to be a
# DataFrame with those columns.
predictions = model.predict(X_test)

In [11]:
# Reshape the model output into a flat (long) form: one row per
# (`hash_tab_num`, forecast month), with the prediction in `target`.
#
# Column `y_<k>` maps to month 2019-09-01 + (k - 1) months, for k = 1..12.
# The per-month frames are collected in a list and concatenated ONCE:
# growing a frame with `pd.concat` inside the loop copies all accumulated rows
# on every iteration, and seeding it with an empty object-dtype frame degrades
# the column dtypes (and raises a FutureWarning on recent pandas).
first_forecast_month = pd.to_datetime('2019-09-01')

horizon_frames = [
    pd.DataFrame({
        'hash_tab_num': predictions['hash_tab_num'],
        'date': first_forecast_month + pd.DateOffset(months=horizon - 1),
        'target': predictions['y_' + str(horizon)],
    })
    for horizon in range(1, 13)
]

# ignore_index gives a clean 0..n-1 index instead of twelve repeats of the
# prediction index; the downstream merge joins on columns, not on the index.
one_line_sub = pd.concat(horizon_frames, axis=0, ignore_index=True)

In [12]:
# Prepare the submission file: load the template, parse its dates and drop the
# placeholder `target` column (the real one is merged in from the predictions).
submission = (
    pd.read_csv('data/submission_check.csv', sep=';')
    .assign(date=lambda template: pd.to_datetime(template.date, format='%Y-%m-%d'))
    .drop(columns='target')
)
submission.head()

Unnamed: 0,hash_tab_num,date
0,0,2019-09-01
1,0,2019-10-01
2,0,2019-11-01
3,0,2019-12-01
4,0,2020-01-01


In [13]:
# Make the join key numeric so it matches the submission template, then attach
# the predicted target to every (hash_tab_num, date) row of the template.
# A left join keeps the template's rows even where no prediction exists.
one_line_sub['hash_tab_num'] = pd.to_numeric(one_line_sub['hash_tab_num'])
submission_final = submission.merge(
    one_line_sub,
    on=['hash_tab_num', 'date'],
    how='left',
)

In [14]:
# Write the merged submission as a ';'-separated CSV, without the index column.
submission_final.to_csv('my_submission_7.csv', sep=';', index=False)

# Feature analysis

In [None]:
# Gain-based feature importance of one fitted sub-model, normalised so the
# values sum to 1, shown for the 20 strongest features.
# NOTE(review): `model.models[2]` - whether 2 is a position or a key is not
# visible from this notebook.
gain = model.models[2].feature_importance(importance_type='gain')
summ = sum(gain)

# The identifier columns are excluded here; presumably they are not used as
# model inputs, so the remaining names line up with `gain` - TODO confirm.
feature_names = list(X_train.drop(['hash_tab_num', 'date'], axis=1).columns)

df_gain = (
    pd.DataFrame({'Name': feature_names, 'Value': gain / summ})
    .sort_values(by='Value', ascending=False)
    .reset_index()
)
df_gain.head(20)

In [18]:
# Show every column name of the training frame (identifier columns included).
list(X_train.columns)

['hash_tab_num',
 'date',
 'age',
 'is_local',
 'gender',
 'work_experience_company',
 'work_experience_all',
 'home_to_work_distance',
 'work_experience_all_stage_0',
 'work_experience_all_stage_1',
 'work_experience_all_stage_2',
 'work_experience_all_stage_3',
 'personel_num',
 'category_Rabochie',
 'category_Rukovoditeli',
 'category_Sluzhaschie',
 'category_Spetsialisty',
 'education_Vysshee',
 'education_Nachalnoe_srednee',
 'education_Srednee_professinalnoe',
 'razryad_fact_0',
 'razryad_fact_1',
 'razryad_fact_2',
 'razryad_fact_3',
 'razryad_fact_4',
 'razryad_fact_5',
 'razryad_fact_6',
 'razryad_post_0',
 'razryad_post_1',
 'razryad_post_2',
 'razryad_post_3',
 'razryad_post_4',
 'razryad_post_5',
 'razryad_post_6',
 'rale_is_old',
 'young_children_6_cnt',
 'young_children_11_cnt',
 'young_children_6_female_cnt',
 'young_children_11_female_cnt',
 'work_shift_type_count',
 'work_shift_type_count_cummax',
 'work_shift_type_count_cummean',
 'work_shift_type_count_rolling_mean_2

In [22]:
# Dump the full feature frame to disk with to_csv defaults
# (comma-separated, index written as the first column).
X.to_csv("X.csv")

In [15]:
# NOTE(review): looks like a leftover scratch cell; consider deleting it.
print("1")

1


In [None]:
# NOTE(review): looks like a leftover scratch cell (never executed); consider deleting it.
print("1")