In [1]:
from numba import jit

import lightgbm as lgb
import pandas as pd
import numpy as np

from datetime import datetime
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [2]:
random_state = 42
np.random.seed(random_state)
df_train = pd.read_csv('data/train.csv').drop("ID_code",axis=1)
df_test = pd.read_csv('data/test.csv')

In [3]:
df_train.shape, df_test.shape

((200000, 201), (200000, 201))

In [4]:
df_train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [5]:
df_train.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
param = {
   "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 13,
    "learning_rate" : 0.01,
    "bagging_freq": 5,
    "bagging_fraction" : 0.4,
    "feature_fraction" : 0.05,
    "min_data_in_leaf": 80,
    "min_sum_heassian_in_leaf": 10,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : 10,
    "verbosity" : 1,
}

In [7]:
# X_train = df_train.copy()
# X_train.drop('target', axis=1, inplace=True)

In [8]:
# X_train['new_f'] = X_train.abs().sum(axis=1) - X_train.sum(axis=1)

In [9]:
cat_features = ['var_12','var_13','var_108','var_126','var_68']
features = [col for col in df_train.columns if col not in ['target', 'ID_code']]

X, y = df_train[features], df_train.target
X_kaggle_test = df_test[features]

In [10]:
def decode_cat_features(data, cat_features):
    for f in cat_features:
        hist, bin_edges = np.histogram(data[f].values, bins=1000, density=True)
        data['test_' + f] = [hist[np.searchsorted(bin_edges, elem) - 1] for elem in data[f].values]
        
# hardcoded these features. took them from feature importance below
# https://www.kaggle.com/allunia/santander-customer-transaction-eda
most_important = ['var_34', 'var_80', 'var_6', 'var_166', 'var_122']
def add_bins(data):
    encoder = LabelEncoder()
    for feature in most_important:
        data[feature + "_qbinned"] = pd.qcut(
            X.loc[:, feature].values,
            q=10,
            labels=False
        )
        data[feature + "_qbinned"] = encoder.fit_transform(
            data[feature + "_qbinned"].values.reshape(-1, 1)
        )

In [11]:
decode_cat_features(X, cat_features)
decode_cat_features(X_kaggle_test, cat_features)

In [12]:
X.shape, X_kaggle_test.shape

((200000, 205), (200000, 205))

threw out shit with kfold and augmentation

https://www.kaggle.com/c/santander-customer-transaction-prediction/discussion/87815

says that it's not really needed

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.50, random_state=random_state)

In [14]:
train_dataset = lgb.Dataset(X_train, label=y_train)
val_dataset = lgb.Dataset(X_val, label=y_val)

model = lgb.train(param, train_dataset, 10000, valid_sets=[val_dataset], verbose_eval=200)

[200]	valid_0's auc: 0.864223
[400]	valid_0's auc: 0.873716
[600]	valid_0's auc: 0.880898
[800]	valid_0's auc: 0.885154
[1000]	valid_0's auc: 0.88831
[1200]	valid_0's auc: 0.890892
[1400]	valid_0's auc: 0.892435
[1600]	valid_0's auc: 0.894069
[1800]	valid_0's auc: 0.895379
[2000]	valid_0's auc: 0.896788
[2200]	valid_0's auc: 0.897635
[2400]	valid_0's auc: 0.898444
[2600]	valid_0's auc: 0.899232
[2800]	valid_0's auc: 0.899945
[3000]	valid_0's auc: 0.900582
[3200]	valid_0's auc: 0.900959
[3400]	valid_0's auc: 0.901437
[3600]	valid_0's auc: 0.901891
[3800]	valid_0's auc: 0.902209
[4000]	valid_0's auc: 0.902365
[4200]	valid_0's auc: 0.902594
[4400]	valid_0's auc: 0.902969
[4600]	valid_0's auc: 0.903325
[4800]	valid_0's auc: 0.903437
[5000]	valid_0's auc: 0.903694
[5200]	valid_0's auc: 0.903851
[5400]	valid_0's auc: 0.903874
[5600]	valid_0's auc: 0.90394
[5800]	valid_0's auc: 0.904021
[6000]	valid_0's auc: 0.904221
[6200]	valid_0's auc: 0.904317
[6400]	valid_0's auc: 0.904378
[6600]	valid_0

In [16]:
df_importance = pd.DataFrame()
df_importance['feature'] = X.columns
df_importance['importance'] = model.feature_importance()

In [17]:
df_importance.sort_values(ascending=False, by='importance')

Unnamed: 0,feature,importance
34,var_34,860
80,var_80,822
6,var_6,813
166,var_166,805
122,var_122,801
91,var_91,797
184,var_184,796
198,var_198,794
170,var_170,793
118,var_118,787


In [15]:
# last submitted 
y_test_pred = model.predict(X_test)
score = roc_auc_score(y_test, y_test_pred)
print('AUC: ', score)

AUC:  0.8957612797762086


In [24]:
sub_df = pd.DataFrame({'ID_code': df_test['ID_code'].values})
sub_df['target'] = model.predict(X_kaggle_test)
sub_df.to_csv('submission.csv', index=False)