In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import math
warnings.filterwarnings('ignore')

In [2]:
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed the stdlib and NumPy global random generators and export the seed.

    :param seed: value passed to ``random.seed`` and ``np.random.seed``; its
        string form is also written to the ``PYTHONHASHSEED`` environment
        variable.                                   # type: int
    :return: None
    """
    # Both global generators take the same seed value.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    # NOTE(review): hash randomisation is presumably fixed when the interpreter
    # starts, so this likely only influences processes spawned later - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; every other column (including int8) is left untouched.
    A candidate dtype is accepted when the column min and max lie strictly
    inside that dtype's range, so a value equal to a bound moves the column
    to the next wider dtype.

    :param df: frame to shrink; its columns are reassigned in place
    :param verbose: when true, print the resulting size and percent reduction
    :return: the same ``df`` object
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            target = next(
                (kind for kind in int_ladder
                 if c_min > np.iinfo(kind).min and c_max < np.iinfo(kind).max),
                None,
            )
            if target is None:
                # Nothing fits strictly (value on the int64 bound): keep as is.
                continue
        else:
            # Falls back to float64 when neither narrower float fits; NaN
            # bounds fail every comparison and therefore also land here.
            target = next(
                (kind for kind in float_ladder
                 if c_min > np.finfo(kind).min and c_max < np.finfo(kind).max),
                np.float64,
            )
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [13]:
########################### Model
import lightgbm as lgb

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per GroupKFold fold and average the test scores.

    Folds are built with ``GroupKFold`` using ``tr_df['DT_M']`` as the group
    key, so rows sharing a ``DT_M`` value never straddle train and validation.
    Each fold's model scores ``tt_df`` and the scores are averaged over the
    folds. Out-of-fold predictions are min-max scaled per fold before the
    overall OOF AUC is printed.

    Reads the module-level ``LOCAL_TEST`` flag: when true, per-fold feature
    importances and (when computable) the holdout AUC are printed.

    :param tr_df: training frame; must contain ``features_columns``, ``target``
        and ``'DT_M'``
    :param tt_df: frame to score; must contain ``features_columns``,
        ``target`` and ``'TransactionID'``
    :param features_columns: list of column names used as model inputs
    :param target: name of the label column
    :param lgb_params: parameter dict handed to ``lgb.train``
    :param NFOLDS: number of GroupKFold splits
    :return: copy of ``tt_df[['TransactionID', target]]`` with an added
        ``'prediction'`` column
    """
    folds = GroupKFold(n_splits=NFOLDS)

    X, y = tr_df[features_columns], tr_df[target]
    P = tt_df[features_columns]
    split_groups = tr_df['DT_M']

    # Work on a copy so adding 'prediction' below never writes into a slice of
    # the caller's frame.
    tt_df = tt_df[['TransactionID', target]].copy()
    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        print('Fold:', fold_)
        # The splitter yields positional indices, so labels must be taken with
        # .iloc as well; plain y[idx] is label-based and breaks on any frame
        # whose index is not 0..n-1.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y)

        # NOTE(review): verbose_eval matches the LightGBM version that produced
        # this notebook's output; newer releases reportedly replace it with a
        # logging callback - confirm before upgrading.
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets=[tr_data, vl_data],
            verbose_eval=200,
        )

        predictions += estimator.predict(P) / NFOLDS

        # Scale each fold's scores to [0, 1] before pooling them. A fold with
        # constant scores has zero range, so keep its raw values instead of
        # dividing by zero.
        oof_preds = estimator.predict(vl_x)
        score_floor = oof_preds.min()
        score_range = oof_preds.max() - score_floor
        if score_range > 0:
            oof[val_idx] = (oof_preds - score_floor) / score_range
        else:
            oof[val_idx] = oof_preds

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)), columns=['Value', 'Feature'])
            print(feature_imp)

        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions
    print('OOF AUC:', metrics.roc_auc_score(y, oof))

    if LOCAL_TEST:
        # ROC AUC is undefined for missing or single-class labels, so only
        # score the holdout when its label column is usable.
        holdout_y = tt_df[target]
        if holdout_y.notna().all() and holdout_y.nunique() > 1:
            print('Holdout AUC:', metrics.roc_auc_score(holdout_y, tt_df['prediction']))
        else:
            print('Holdout AUC: skipped (labels are missing or contain a single class)')

    return tt_df
## -------------------

In [18]:
# Summarise the test labels that are not zero (missing values included).
# The comparison is by value: the earlier identity test (`is not 0`) treats a
# float 0.0 or a NumPy zero as "not 0" because they are different objects.
# value_counts keeps the output bounded instead of printing one line per row.
nonzero_labels = test_df.loc[test_df['isFraud'] != 0, 'isFraud']
print(nonzero_labels.value_counts(dropna=False))

In [5]:
########################### Vars
#################################################################################
# Run-wide constants.
SEED = 42                 # seed for seed_everything and lgb_params['seed']
LOCAL_TEST = True         # gates data loading, minification, training setup and export
TARGET = 'isFraud'        # label column name
# NOTE(review): START_DATE is not referenced in the visible cells - confirm it is still needed.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

# Seed the global random generators before any later cell runs.
seed_everything(SEED)

In [6]:
########################### Model params
# Base LightGBM parameters handed to make_predictions.
# 'learning_rate', 'n_estimators' and 'early_stopping_rounds' are reassigned
# in the "Model Train" cell before training starts.
lgb_params = {
    # Task
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    # Execution
    'n_jobs': -1,
    # Learning schedule and tree shape
    'learning_rate': 0.01,
    'num_leaves': 2**8,
    'max_depth': -1,
    'tree_learner': 'serial',
    # Column / row sampling
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.7,
    # Boosting length and binning
    'n_estimators': 800,
    'max_bin': 255,
    # Logging and reproducibility
    'verbose': -1,
    'seed': SEED,
    'early_stopping_rounds': 100,
}

In [7]:
########################### DATA LOAD
#################################################################################
print('Load Data')

# Both run modes read the same pickles. Loading used to happen only when
# LOCAL_TEST was true, which left train_df / test_df undefined (NameError on
# the shape print below) for a submission run.
# Earlier revisions pointed at '../Data/ieee-fe-for-local-test/' (local test)
# and '../Data/ieee-fe-with-some-eda/' (submission) instead.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
train_df = pd.read_pickle('../Data/train_df.pkl')
test_df = pd.read_pickle('../Data/test_df.pkl')

# Column names to exclude from the model inputs, stored in the
# 'features_to_remove' column of a pickled frame.
remove_features_df = pd.read_pickle('../Data/remove_features.pkl')
remove_features = list(remove_features_df['features_to_remove'].values)
print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 791) (506691, 791)


In [8]:
########################### Final features list
# Model inputs: every train_df column that is not listed in remove_features.
features_columns = [name for name in train_df.columns if name not in remove_features]

########################### Final Minification
## Downcasting is not ideal: it alters float values (slightly, but it does).
## Training LightGBM without minification needs some changes to the model,
## which are left for later.
if not LOCAL_TEST:
    train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

In [9]:
########################### Model Train
# Both run modes share the boosting budget and early-stopping patience; only
# the learning rate and the fold count depend on LOCAL_TEST.
lgb_params['learning_rate'] = 0.01 if LOCAL_TEST else 0.007
lgb_params['n_estimators'] = 10000
lgb_params['early_stopping_rounds'] = 100
test_predictions = make_predictions(
    train_df,
    test_df,
    features_columns,
    TARGET,
    lgb_params,
    NFOLDS=4 if LOCAL_TEST else 6,
)

Fold: 0
453219 137321
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.976001	valid_1's auc: 0.891099
[400]	training's auc: 0.995	valid_1's auc: 0.905092
[600]	training's auc: 0.998932	valid_1's auc: 0.909981
[800]	training's auc: 0.999767	valid_1's auc: 0.913811
[1000]	training's auc: 0.999951	valid_1's auc: 0.915939
[1200]	training's auc: 0.999992	valid_1's auc: 0.917657
[1400]	training's auc: 0.999999	valid_1's auc: 0.918488
Early stopping, best iteration is:
[1381]	training's auc: 0.999999	valid_1's auc: 0.91859
     Value                        Feature
0        0                      D9_not_na
1        0                           V107
2        0                           V117
3        0                           V119
4        0                           V120
5        0                           V240
6        0                            V27
7        0                            V28
8        0                           V305
9        0          

Fold: 3
415193 175347
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.979669	valid_1's auc: 0.929887
[400]	training's auc: 0.996587	valid_1's auc: 0.941481
[600]	training's auc: 0.999423	valid_1's auc: 0.945125
[800]	training's auc: 0.999895	valid_1's auc: 0.946275
[1000]	training's auc: 0.999984	valid_1's auc: 0.94661
[1200]	training's auc: 0.999998	valid_1's auc: 0.946658
Early stopping, best iteration is:
[1133]	training's auc: 0.999996	valid_1's auc: 0.946752
     Value                        Feature
0        0                           V107
1        0                           V117
2        0                           V118
3        0                           V119
4        0                           V122
5        0                           V240
6        0                           V241
7        0                            V27
8        0                            V28
9        0                           V305
10       0                     

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
########################### Export
if not LOCAL_TEST:
    # Replace the label column with the averaged model score, then write only
    # the id and that score to the submission file.
    test_predictions['isFraud'] = test_predictions['prediction']
    test_predictions.to_csv(
        'submission.csv',
        columns=['TransactionID', 'isFraud'],
        index=False,
    )

In [12]:
from sklearn.metrics import roc_auc_score

# roc_auc_score needs one label vector and one score vector, not whole frames.
# Labels come from test_df because the export cell may overwrite the label
# column inside test_predictions; test_predictions is built from a column
# slice of test_df, so rows line up.
holdout_labels = test_df[TARGET]
holdout_scores = test_predictions['prediction']

# ROC AUC is undefined for missing or single-class labels.
if holdout_labels.notna().all() and holdout_labels.nunique() > 1:
    holdout_auc = roc_auc_score(holdout_labels, holdout_scores)
    print("AUC: %.2f%%" % (holdout_auc * 100.0))
else:
    holdout_auc = None
    print('AUC skipped: test labels are missing or contain a single class')

# Old name for this value (it is an AUC, not an accuracy); kept in case a later
# cell still reads it.
accuracy = holdout_auc

NameError: name 'test_predictions' is not defined