# Module

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")
gc.enable()

In [2]:
pd.set_option('max_rows', 500)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

# Load Data

In [3]:
train_raw = pd.read_csv('./data/train.csv')
test_raw = pd.read_csv('./data/test.csv')
train_raw.shape, test_raw.shape

((200000, 202), (200000, 201))

In [164]:
train = train_raw.copy()
test = test_raw.copy()

In [5]:
train0 = train[ train['target']==0 ].copy()
train1 = train[ train['target']==1 ].copy()
train.sample(5)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
177237,train_177237,0,9.3811,1.3678,7.9887,5.9687,12.5024,-5.6772,5.2422,20.4304,2.1443,6.837,-6.1864,-1.1473,13.8688,0.9797,9.4169,14.7743,7.8962,-3.2608,12.3021,14.3695,20.3372,16.6512,1.6364,3.8534,11.1426,13.2243,-1.768,-2.8332,4.8337,6.955,-10.5375,11.2839,-0.7732,18.4724,11.2701,1.3675,1.6925,5.2468,2.5665,1.1348,-14.1028,3.442,11.5132,11.3591,-1.2088,11.229,12.088,-16.8586,29.4764,21.5891,13.8888,8.3871,0.1732,4.796,9.7293,12.382,15.2284,6.5115,2.8647,9.0329,15.8246,-33.4016,5.0253,2.6751,4.2565,-0.06,3.3099,8.0869,5.0223,-3.7772,22.79,0.4738,6.1375,9.6934,28.2765,24.2438,2.7925,16.3437,6.5779,16.9712,7.357,17.4094,-8.3816,-9.4203,-3.3386,15.1071,-0.2209,3.3967,9.1284,1.5394,5.3659,6.7171,13.5403,10.6378,13.9057,-0.1611,30.6042,2.6539,1.4874,0.9309,0.4074,13.6118,17.3738,1.473,11.6969,5.3911,11.6355,27.0164,14.1988,22.4322,8.4323,6.7468,0.1583,5.3917,3.9421,4.5204,2.2118,33.6608,-1.3677,-5.6704,18.2121,10.0408,-1.4836,3.5277,5.1419,12.4466,12.3212,3.7952,-5.8084,14.1046,11.5217,0.0441,9.3776,6.9517,-7.4998,-2.2724,19.5037,27.1137,6.223,8.2332,-3.5178,-5.1233,11.1215,10.5684,9.2397,3.7465,12.4788,-0.2339,3.9098,16.4596,14.7072,9.0664,11.1435,16.0975,12.7579,8.2395,12.5452,-12.0623,11.4891,12.1978,15.8651,5.6163,6.7808,4.8678,3.3076,17.8241,3.6387,-21.8565,4.5786,6.0398,-5.3861,-5.7064,25.214,-5.0795,30.2052,9.253,-2.6822,15.4964,7.0361,2.6377,-3.9438,13.2748,-13.5337,8.2103,13.2771,-0.2337,10.5017,-9.5809,13.2629,1.1496,2.0603,6.7863,1.6866,7.0461,13.7652,0.0143,-4.8263,8.7813,17.0136,5.246
60100,train_60100,0,12.3952,-2.8002,14.4891,5.2719,13.6998,-5.6947,4.337,20.6238,1.3433,7.4459,-7.5891,-10.9592,14.2336,5.7552,4.0649,14.5716,12.4727,3.4669,14.6293,6.042,23.7248,20.6798,7.9933,3.4505,4.5901,13.5542,-12.7179,-1.6273,6.7098,9.1056,-3.5407,9.3072,2.1486,10.7695,10.9536,9.5526,4.6284,4.5931,3.671,-3.4935,-12.7281,17.4861,11.7403,11.4425,6.0693,2.813,13.4499,-12.1517,11.3128,20.2424,12.1422,16.2732,-4.5102,6.9968,13.0833,21.1266,13.9522,5.9838,-2.0367,8.8203,13.1203,-39.974,-2.5199,-0.1928,6.4197,2.6674,3.8707,18.3227,5.0138,-7.7431,13.2331,0.4136,-2.9785,17.4276,36.9191,21.2703,-7.8307,16.7292,7.0701,12.8997,13.86,15.0957,-19.9131,7.4368,-7.1183,24.1936,-2.2878,17.431,11.3709,6.5586,-10.8496,6.9136,11.5864,10.7142,11.3217,-0.0151,3.0395,11.4233,2.7061,-4.3734,-12.7607,8.6737,3.9356,1.8243,14.5606,4.9934,9.1949,17.6527,14.4828,19.9369,8.8818,7.5522,3.5038,4.1261,1.5318,1.8615,-0.5492,6.2992,1.8508,5.9397,42.9297,11.2104,6.7792,-5.7144,6.2168,11.9833,13.7505,-5.7912,-2.421,13.197,11.1701,0.5601,7.1501,6.8472,-9.295,-7.0644,25.3016,33.1706,0.8888,2.1123,3.8173,-4.4909,1.7359,12.5837,7.9876,5.9664,8.6334,-6.3044,4.0611,2.6777,14.2281,13.6531,5.9746,17.6156,0.4119,-17.7279,12.8745,-5.304,1.0435,7.818,18.4717,5.9082,6.1277,5.9469,-0.0435,19.4908,2.6998,-5.2924,6.3969,5.7371,2.9567,-4.5165,24.7107,5.2935,10.6363,11.0373,-5.1547,11.1604,-18.453,-0.9904,0.6233,11.9498,-7.626,7.4492,19.2274,1.8937,7.1195,-7.2721,15.6093,0.4289,3.7103,9.2301,1.0425,6.8414,20.7672,0.2377,6.8671,7.1934,15.0918,1.9894
10343,train_10343,1,6.314,-8.5634,9.6085,11.3396,12.9154,-1.9232,5.2211,16.3403,-5.0617,8.8435,-2.0555,-6.5454,13.979,13.9577,4.5518,15.1843,9.5862,3.593,14.1078,11.2497,18.1818,26.5681,-2.9368,3.8959,5.5538,13.4675,-6.4892,-3.8006,6.1027,4.5469,-19.1576,7.9191,-0.8542,15.4412,11.8134,-5.0101,6.129,6.7166,10.4297,1.7857,-18.9352,15.6498,12.3676,11.2843,11.8958,5.9467,8.5521,-11.9119,31.3369,12.9085,12.7641,23.7739,2.0627,6.0313,-0.6042,17.3276,19.25,5.833,-3.7269,9.0286,6.3561,-4.9143,-3.2131,-4.6012,7.8008,4.0142,3.7608,9.6023,5.0302,0.7518,37.6594,0.1716,0.301,24.5192,26.2932,15.4787,-6.4192,25.8851,4.3591,11.742,-8.8026,9.2143,10.4234,18.9704,-11.0132,17.6659,1.7697,3.3556,8.8649,5.2898,-22.2099,6.9794,6.0131,9.8419,11.0327,-0.1449,27.8224,23.0741,3.2826,-1.0936,4.7081,10.7713,8.3677,1.464,10.6289,5.2974,10.2414,16.0079,14.3697,16.1371,4.268,8.5113,4.2995,6.3365,3.713,6.4894,2.3991,31.1829,-6.7859,9.441,18.7955,12.339,-3.4047,5.8514,0.2227,11.9083,13.2718,4.3251,0.4136,10.92,12.4686,0.7957,8.0519,6.0468,-1.7854,-9.4117,35.7738,28.6997,1.8509,14.8878,5.6094,-3.276,7.1542,11.9722,8.0513,10.6481,7.6166,-19.0912,3.9455,-6.2348,19.8995,10.9001,8.8567,16.5963,4.623,3.6797,11.6345,-5.4866,13.3895,6.9753,21.6936,5.8024,5.563,21.6847,4.1127,17.3072,2.6084,4.2128,8.2844,4.9074,0.5054,0.0559,16.8594,-1.2751,7.1187,11.7563,4.1121,10.42,-9.624,7.8866,-10.6318,11.9302,10.8261,9.5196,35.5002,-0.7699,9.756,-16.8216,20.0458,-0.5473,9.1461,4.0181,1.0001,2.2082,16.8206,-0.722,-0.2926,7.733,11.8807,-6.6614
43816,train_43816,0,8.8281,-0.7747,11.0467,4.6426,9.2192,-1.5627,5.9678,20.0457,-5.8581,7.7887,-3.3674,1.8319,14.2285,19.0141,10.2263,15.279,6.7378,-17.6937,30.4109,4.7195,21.817,11.7641,2.9819,3.3704,14.4744,14.1219,-1.1318,-0.7836,4.2702,4.8103,-1.3734,7.6935,-7.3548,17.9702,11.5961,1.123,4.866,5.3349,2.8283,-2.8251,-3.5487,2.6525,10.9708,11.1621,13.9307,-20.4479,14.2722,-25.8693,31.6283,6.573,13.0088,4.3759,-1.2525,4.9431,4.6478,25.4234,21.3078,6.1863,0.3188,8.1523,19.026,2.314,-1.2155,-1.1546,5.2874,3.4867,6.3395,23.1689,5.0221,-3.961,29.4122,0.7434,5.0888,22.2755,-1.0838,19.8909,-14.3835,12.6967,4.9231,14.6569,-1.772,17.4606,8.2002,18.2545,-7.8917,22.4068,-11.0895,11.5717,12.2427,8.7827,-19.2604,6.9875,5.286,9.9558,10.4062,-0.22,-4.3814,32.0397,2.2522,-0.823,-6.26,18.9625,13.3699,1.4586,13.421,4.5342,8.9459,18.4333,14.2752,14.3962,9.9689,7.5015,3.6897,5.4486,2.4006,-1.4458,1.8603,18.7327,-9.4307,-5.4038,41.683,15.3709,10.1438,10.1673,4.9908,13.0609,13.0712,2.778,-1.717,22.3565,14.096,0.0531,6.882,7.6244,-3.0493,0.8734,42.0034,10.3409,0.6531,2.033,-4.4241,-8.1846,13.9825,9.4603,7.9614,4.2397,8.9959,-8.6071,4.1426,11.9089,14.2767,13.8419,13.0754,15.5879,5.3864,0.5692,12.7278,-6.8266,13.8611,8.0789,34.5913,5.3271,3.9877,15.6563,-11.3221,30.2308,2.9696,-10.214,6.0257,5.2697,5.6099,-8.0048,8.5369,-9.4076,25.5643,11.6989,0.6765,10.7474,-3.5622,3.3349,-9.3234,11.8524,-4.2284,8.7364,18.6082,-8.9092,5.7894,-18.2385,14.1815,1.7691,8.6579,6.2335,4.5419,2.9977,21.4753,0.1262,8.4051,9.4259,16.2165,7.2266
1722,train_1722,0,10.1239,-1.3317,9.7215,7.1939,11.698,-11.3303,4.7702,16.8994,2.5855,6.5715,-1.0415,-7.4379,14.0404,11.9148,5.6423,14.635,9.5473,-2.7747,24.7363,10.1358,17.0709,19.9171,0.3523,3.1183,9.638,13.4238,-17.9752,-2.0764,5.5773,6.7367,-13.5644,14.6829,-1.3904,9.793,11.5718,-2.7092,1.3146,8.387,19.7091,8.6801,-16.4643,3.6845,10.4846,11.3591,-6.8102,-22.9209,12.1108,-2.2425,27.4414,22.878,13.5257,13.2384,-2.5031,6.891,8.5387,12.7164,10.5823,7.5836,1.0853,8.5841,9.8467,-2.8977,1.9212,4.1544,7.1817,4.1147,6.696,6.2269,5.0119,2.0481,9.6862,1.2857,4.4161,8.3893,27.4635,12.5358,5.4815,19.6013,5.9627,13.4463,-4.2764,17.5598,-6.4073,-7.1781,-12.0808,21.5329,0.3468,24.2314,4.9141,3.1357,-14.9354,7.1295,16.7336,9.9325,11.01,0.0489,17.0916,8.2821,2.125,-1.0631,-4.7511,5.5163,24.0408,1.5599,9.8793,4.8523,10.9384,24.1758,14.1088,10.8819,8.0464,6.9157,3.3383,7.7886,0.9085,5.0631,2.3984,27.8761,-17.3971,-5.0124,40.3981,13.5141,2.0673,-1.0802,4.491,12.2148,13.1854,5.9476,1.1063,18.9738,11.6427,1.0471,6.3885,6.2175,-15.154,-0.3194,17.0253,27.0742,-4.0489,17.924,8.4432,1.8606,26.6935,8.1799,10.2984,1.772,11.248,-12.7773,3.868,2.6426,16.4878,3.0446,6.4519,16.6077,-3.9582,-1.5192,13.5056,0.4765,16.1446,3.7324,19.9127,5.8444,8.1077,14.9542,-10.049,21.3192,2.7254,5.8334,-2.3985,6.1445,0.9971,-5.1628,14.2962,-0.5447,9.3616,17.6425,-1.1391,9.55,-6.271,1.121,-13.2904,11.75,-8.4715,13.251,14.436,-0.3437,11.7082,-15.6968,13.1597,0.9788,0.0454,6.6937,1.8491,2.9108,22.831,0.3223,5.2488,9.0571,18.5584,-15.0114


In [165]:
col_list = train.columns[2:]

In [6]:
# CALCULATE MEANS AND STANDARD DEVIATIONS
s = [0]*200
m = [0]*200
for i in range(200):
    s[i] = np.std(train['var_'+str(i)])
    m[i] = np.mean(train['var_'+str(i)])
    
# CALCULATE PROB(TARGET=1 | X)
def getp(i,x):
    c = 3 #smoothing factor
    a = len( train1[ (train1['var_'+str(i)]>x-s[i]/c)&(train1['var_'+str(i)]<x+s[i]/c) ] ) 
    b = len( train0[ (train0['var_'+str(i)]>x-s[i]/c)&(train0['var_'+str(i)]<x+s[i]/c) ] )
    if a+b<500: return 0.1 #smoothing factor
    # RETURN PROBABILITY
    return a / (a+b)
    # ALTERNATIVELY RETURN ODDS
    # return a / b
    
# SMOOTH A DISCRETE FUNCTION
def smooth(x,st=1):
    for j in range(st):
        x2 = np.ones(len(x)) * 0.1
        for i in range(len(x)-2):
            x2[i+1] = 0.25*x[i]+0.5*x[i+1]+0.25*x[i+2]
        x = x2.copy()
    return x

In [17]:
# DRAW PLOTS, YES OR NO
Picture = False
# DATA HAS Z-SCORE RANGE OF -4.5 TO 4.5
rmin=-5; rmax=5; 
# CALCULATE PROBABILITIES FOR 501 BINS
res=501
# STORE PROBABILITIES IN PR
pr = 0.1 * np.ones((200,res))
pr2 = pr.copy()
xr = np.zeros((200,res))
xr2 = xr.copy()
ct2 = 0
for j in tqdm(range(50)):
    if Picture: plt.figure(figsize=(15,8))
    for v in range(4):
        ct = 0
        # CALCULATE PROBABILITY FUNCTION FOR VAR
        for i in np.linspace(rmin,rmax,res):
            pr[v+4*j,ct] = getp(v+4*j,m[v+4*j]+i*s[v+4*j])
            xr[v+4*j,ct] = m[v+4*j]+i*s[v+4*j]
            xr2[v+4*j,ct] = i
            ct += 1
        if Picture:
            # SMOOTH FUNCTION FOR PRETTIER DISPLAY
            # BUT USE UNSMOOTHED FUNCTION FOR PREDICTION
            pr2[v+4*j,:] = smooth(pr[v+4*j,:],res//10)
            # DISPLAY PROBABILITY FUNCTION
            plt.subplot(2, 4, ct2%4+5)
            plt.plot(xr[v+4*j,:],pr2[v+4*j,:],'-')
            plt.title('P( t=1 | var_'+str(v+4*j)+' )')
            xx = plt.xlim()
            # DISPLAY TARGET DENSITIES
            plt.subplot(2, 4, ct2%4+1)            
            sns.distplot(train0['var_'+str(v+4*j)], label = 't=0')
            sns.distplot(train1['var_'+str(v+4*j)], label = 't=1')
            plt.title('var_'+str(v+4*j))
            plt.legend()
            plt.xlim(xx)
            plt.xlabel('')
        ct2 += 1
    if Picture: plt.show()

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [127]:
def get_pred(data):

    on = np.nonzero(xr[int(col.split('_')[-1])] > data)[0][0]
    pred = (pr[int(col[-1])][on] + pr[int(col[-1])][on + 1])/2

    return pred

In [131]:
prob_dic = {}

for col in tqdm(col_list):
    


    temp = train[col].value_counts().to_frame()

    temp['pred'] = temp.index.map(get_pred)
    temp = np.around(temp, 5)
    del temp[col]

    temp = temp.to_dict()['pred']
    
    prob_dic[col] = temp

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [166]:
pb_idx = np.load('./data_temp/public_LB.npy')
pv_idx = np.load('./data_temp/private_LB.npy')

In [167]:
test_pb = test.iloc[pb_idx].sort_index().copy()
test_pv = test.iloc[pv_idx].sort_index().copy()

test_real = test_pb.append(test_pv)

In [168]:
data = train.append(test_real)

In [138]:
unique_df = data[['ID_code']]

In [139]:
for col in tqdm(col_list):
    unique_df[col] = data[col].map(((data[col].value_counts() == 1) * 1).to_dict())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [174]:
# unique_data = data[['ID_code', 'target']]
for col in tqdm(col_list):
    data[col + '_unique'] = np.around(data[col] * unique_df[col], 4)
#     unique_data[col] = np.around(data[col] * unique_df[col], 4)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [176]:
for col in tqdm(col_list):
    data[col + '_prob'] = data[col + '_unique'].map(prob_dic[col])

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [179]:
train = data[~data.target.isna()]
test = data[data.target.isna()]

In [180]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [183]:
target = train['target']

In [184]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'num_threads': 8
}

In [185]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [c for c in train.columns if c not in ['ID_code', 'target']]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    num_round = 30000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=400, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))
    
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[400]	training's auc: 0.890433	valid_1's auc: 0.872906
[800]	training's auc: 0.901113	valid_1's auc: 0.882236
[1200]	training's auc: 0.907121	valid_1's auc: 0.886424
[1600]	training's auc: 0.912618	valid_1's auc: 0.890582
[2000]	training's auc: 0.916292	valid_1's auc: 0.892593
[2400]	training's auc: 0.919965	valid_1's auc: 0.894464
[2800]	training's auc: 0.922948	valid_1's auc: 0.895916
[3200]	training's auc: 0.925863	valid_1's auc: 0.897298
[3600]	training's auc: 0.9286	valid_1's auc: 0.898492
[4000]	training's auc: 0.931252	valid_1's auc: 0.89941
[4400]	training's auc: 0.933689	valid_1's auc: 0.900249
[4800]	training's auc: 0.936123	valid_1's auc: 0.901017
[5200]	training's auc: 0.93854	valid_1's auc: 0.902014
[5600]	training's auc: 0.940774	valid_1's auc: 0.902838
[6000]	training's auc: 0.942989	valid_1's auc: 0.903607
[6400]	training's auc: 0.945074	valid_1's auc: 0.904235
[6800]	training's auc: 0.947184	valid_