In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb


from tqdm import tqdm


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
train_path = '/content/drive/MyDrive/Data/AlphaHac/train.parquet'
test_path = '/content/drive/MyDrive/Data/AlphaHac/test.parquet'
sample_submission_path = '/content/drive/MyDrive/Data/AlphaHac/sample_submission.csv'

train = pd.read_parquet(train_path).drop('id', axis=1).drop_duplicates()
test = pd.read_parquet(test_path).drop('id', axis=1)
sample_submission = pd.read_csv(sample_submission_path)

In [33]:
drop_list = ['city', 'index_city_code', 'branch_code']
hueta_list = ['max_end_plan_non_fin_deals', 'max_start_fin_deals', 'max_start_non_fin_deals', 'min_end_fact_fin_deals', 'min_start_fin_deals', 'min_start_non_fin_deals', 'max_founderpres', 'min_founderpres', 'max_end_fact_fin_deals', 'min_end_plan_non_fin_deals']

In [34]:
train = train.drop(drop_list, axis=1)
test = test.drop(drop_list, axis=1)

for ft in hueta_list:
  train.loc[train[ft].isna(), ft] = 0
  test.loc[train[ft].isna(), ft] = 0

In [35]:
cities_types = ['3597', '1252', '727', '5418', '3844']

train.loc[~train['city_type'].isin(cities_types), 'city_type'] = 0
test.loc[~test['city_type'].isin(cities_types), 'city_type'] = 0

In [36]:
channel_code_types = ['7', '4', '30', '26', '32', '40', '34', '33', '10', '37', '31', '48', '29', '2', '27', '11', '46', '22', '18', '20']

train.loc[~train['channel_code'].isin(channel_code_types), 'channel_code'] = 0
test.loc[~test['channel_code'].isin(channel_code_types), 'channel_code'] = 0

In [37]:
train = train[train['rko_start_months'] > -800]

In [38]:
vbr_fts =  ['rko_start_months', 'balance_amt_avg', 'balance_amt_max',
       'balance_amt_min','balance_amt_day_avg', 'ogrn_days_end_month',
    'ft_registration_date', 'sum_of_paym_2m', 'sum_of_paym_6m', 'sum_of_paym_1y',
       'sum_a_oper_1m', 'cnt_a_oper_1m', 'sum_b_oper_1m', 'cnt_b_oper_1m',
       'sum_c_oper_1m', 'cnt_c_oper_1m', 'sum_deb_d_oper_1m',
       'cnt_deb_d_oper_1m', 'sum_cred_d_oper_1m', 'cnt_cred_d_oper_1m',
       'sum_deb_e_oper_1m', 'cnt_deb_e_oper_1m', 'cnt_days_deb_e_oper_1m',
       'sum_cred_e_oper_1m', 'cnt_cred_e_oper_1m', 'cnt_days_cred_e_oper_1m',
       'sum_deb_f_oper_1m', 'cnt_deb_f_oper_1m', 'cnt_days_deb_f_oper_1m',
       'sum_cred_f_oper_1m', 'cnt_cred_f_oper_1m', 'cnt_days_cred_f_oper_1m',
       'sum_deb_g_oper_1m', 'cnt_deb_g_oper_1m', 'cnt_days_deb_g_oper_1m',
       'sum_cred_g_oper_1m', 'cnt_cred_g_oper_1m', 'cnt_days_cred_g_oper_1m',
       'sum_deb_h_oper_1m', 'cnt_deb_h_oper_1m', 'cnt_days_deb_h_oper_1m',
       'sum_cred_h_oper_1m', 'cnt_cred_h_oper_1m', 'cnt_days_cred_h_oper_1m',
       'sum_a_oper_3m', 'cnt_a_oper_3m', 'sum_b_oper_3m', 'cnt_b_oper_3m',
       'sum_c_oper_3m', 'cnt_c_oper_3m', 'sum_deb_d_oper_3m',
       'cnt_deb_d_oper_3m', 'sum_cred_d_oper_3m', 'cnt_cred_d_oper_3m',
       'sum_deb_e_oper_3m', 'cnt_deb_e_oper_3m', 'cnt_days_deb_e_oper_3m',
       'sum_cred_e_oper_3m', 'cnt_cred_e_oper_3m', 'cnt_days_cred_e_oper_3m',
       'sum_deb_f_oper_3m', 'cnt_deb_f_oper_3m', 'cnt_days_deb_f_oper_3m',
       'sum_cred_f_oper_3m', 'cnt_cred_f_oper_3m', 'cnt_days_cred_f_oper_3m',
       'sum_deb_g_oper_3m', 'cnt_deb_g_oper_3m', 'cnt_days_deb_g_oper_3m',
       'sum_cred_g_oper_3m', 'cnt_cred_g_oper_3m', 'cnt_days_cred_g_oper_3m',
       'sum_deb_h_oper_3m', 'cnt_deb_h_oper_3m', 'cnt_days_deb_h_oper_3m',
       'sum_cred_h_oper_3m', 'cnt_cred_h_oper_3m', 'cnt_days_cred_h_oper_3m']

In [39]:
for ft in tqdm(vbr_fts):
  #train.loc[(train[ft] < 0) & (train[ft] < train[ft].quantile(0.05)), ft] = 0
  train.loc[train[ft] < 0, ft] = 0
  train.loc[train[ft].isna(), ft] = 0

  #test.loc[(test[ft] < 0) & (test[ft] < test[ft].quantile(0.05)), ft] = 0
  test.loc[test[ft] < 0, ft] = 0
  test.loc[test[ft].isna(), ft] = 0

100%|██████████| 78/78 [00:00<00:00, 280.34it/s]


In [40]:
cat_cols = ['channel_code', 'city_type', 'ogrn_month', 'ogrn_year', 'okved', 'segment']

train[cat_cols] = train[cat_cols].astype("category")
test[cat_cols] = test[cat_cols].astype("category")


In [None]:
x_train, x_val, y_train, y_val = train_test_split(train.drop(['target_1', 'target_2', 'total_target'], axis=1), train[['target_1', 'target_2', 'total_target']], random_state=42)

In [None]:
y_train_1 = y_train['target_1']
y_train_2 = y_train['target_2']

y_val_1 = y_val['target_1']
y_val_2 = y_val['target_2']
y_val_t = y_val['total_target']

In [None]:
model1 = lgb.LGBMClassifier(boosting_type = 'gbdt', verbose=-1, random_state=42, max_depth=12, n_estimators=700,
                             learning_rate=0.0645, reg_alpha=0.0328, reg_lambda=0.0984)
model1.fit(x_train, y_train_1)

y_pred_1 = model1.predict_proba(x_val)[:, 1]

roc_auc_score(y_val_1, y_pred_1)

In [None]:
model2 = lgb.LGBMClassifier(boosting_type = 'gbdt', verbose=-1, random_state=42, n_estimators = 900,  max_depth = 12 , learning_rate = 0.0401,  reg_alpha = 0.0817,  reg_lambda = 0.086)
model2.fit(x_train, y_train_2)

y_pred_2 = model2.predict_proba(x_val)[:, 1]

roc_auc_score(y_val_2, y_pred_2)

In [None]:
roc_auc_score(y_val_t, y_pred_1 + y_pred_2 - y_pred_1 * y_pred_2)#valid

In [41]:
model1 = lgb.LGBMClassifier(boosting_type = 'gbdt', verbose=-1, random_state=42, max_depth=12, n_estimators=700, learning_rate=0.0645, reg_alpha=0.0328, reg_lambda=0.0984)
model1.fit(train.drop(['target_1', 'target_2', 'total_target'], axis=1), train['target_1'])

In [42]:
model2 = lgb.LGBMClassifier(boosting_type = 'gbdt', verbose=-1, random_state=42, n_estimators = 900,  max_depth = 12 , learning_rate = 0.0401,  reg_alpha = 0.0817,  reg_lambda = 0.086)

model2.fit(train.drop(['target_1', 'target_2', 'total_target'], axis=1), train['target_2'])

In [43]:
y_test_1 = model1.predict_proba(test)[:, 1]
y_test_2 = model2.predict_proba(test)[:, 1]

y_res = y_test_1 + y_test_2 - y_test_1 * y_test_2

In [44]:
sample_submission["score"] = y_res
sample_submission.head()
sample_submission.to_csv("my_submission.csv", index=False)

In [45]:
sample_submission

Unnamed: 0,id,score
0,360000,0.028384
1,360001,0.035093
2,360002,0.131302
3,360003,0.079149
4,360004,0.058963
...,...,...
99995,459995,0.038642
99996,459996,0.039128
99997,459997,0.005996
99998,459998,0.028089
