In [1]:
import numpy as np
import pandas as pd

In [2]:
import pandas_profiling

In [3]:
import gc

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
from sklearn.model_selection import train_test_split

## Read data

In [6]:
print('Loading data...')

train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_identity!')

train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_transaction!')

test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_identity!')

test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_transaction!')

print('Data was successfully loaded!\n')

Loading data...
	Successfully loaded train_identity!
	Successfully loaded train_transaction!
	Successfully loaded test_identity!
	Successfully loaded test_transaction!
Data was successfully loaded!



In [7]:
sub = pd.read_csv('sample_submission.csv')
print('\tSuccessfully loaded sample_submission!')

	Successfully loaded sample_submission!


## Usefull methods

In [8]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')
    
    return df

In [9]:
def id_split(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe['OS_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[0]
    dataframe['version_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[1]

    dataframe['browser_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[0]
    dataframe['version_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[1]

    dataframe['screen_width'] = dataframe['id_33'].str.split('x', expand=True)[0]
    dataframe['screen_height'] = dataframe['id_33'].str.split('x', expand=True)[1]

    # dataframe['id_34'] = dataframe['id_34'].str.split(':', expand=True)[1]
    dataframe['id_23'] = dataframe['id_23'].str.split(':', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(
        dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 
                  'device_name'] = "Others"
    dataframe['had_id'] = 1
    gc.collect()
    
    return dataframe

## Prepare and merge data

In [10]:
train_identity = id_split(train_identity)
test_identity = id_split(test_identity)

In [11]:
print('Merging data...')
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print('Data was successfully merged!\n')

del train_identity, train_transaction, test_identity, test_transaction

print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

gc.collect()

Merging data...
Data was successfully merged!

Train dataset has 590540 rows and 442 columns.
Test dataset has 506691 rows and 441 columns.


0

## Feature selection

In [12]:
useful_features = ['TransactionAmt', 'ProductCD', 
                   'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 
                   'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 
                   'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 
                   'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 
                   'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 
                   'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 
                   'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 
                   'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 
                   'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 
                   'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 
                   'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 
                   'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 
                   'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 
                   'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 
                   'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 
                   'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 
                   'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 
                   'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 
                   'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 
                   'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'device_name', 
                   'device_version', 'OS_id_30', 'version_id_30',
                   'browser_id_31', 'version_id_31', 'screen_width', 'screen_height', 'had_id']

In [13]:
cols_to_drop = [col for col in train.columns if col not in useful_features]
cols_to_drop.remove('isFraud')
cols_to_drop.remove('TransactionDT')

In [14]:
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

## Feature engineering

In [15]:
columns_a = ['TransactionAmt', 'id_02', 'D15']
columns_b = ['card1', 'card4', 'addr1']

for col_a in columns_a:
    for col_b in columns_b:
        for df in [train, test]:
            df[f'{col_a}_to_mean_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('mean')
            df[f'{col_a}_to_std_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('std')

In [16]:
# New feature - log of transaction amount.
train['TransactionAmt_Log'] = np.log(train['TransactionAmt'])
test['TransactionAmt_Log'] = np.log(test['TransactionAmt'])

# New feature - decimal part of the transaction amount.
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - 
                                    train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - 
                                   test['TransactionAmt'].astype(int)) * 1000).astype(int)

# New feature - day of week in which a transaction happened.
train['Transaction_day_of_week'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)
test['Transaction_day_of_week'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)

# New feature - hour of the day in which a transaction happened.
train['Transaction_hour'] = np.floor(train['TransactionDT'] / 3600) % 24
test['Transaction_hour'] = np.floor(test['TransactionDT'] / 3600) % 24

# Some arbitrary features interaction
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le = LabelEncoder()
    le.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
    train[feature] = le.transform(list(train[feature].astype(str).values))
    test[feature] = le.transform(list(test[feature].astype(str).values))

# Encoding - count encoding for both train and test
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'id_36']:
    train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], 
                                                                  ignore_index=True).value_counts(dropna=False))
    test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], 
                                                                ignore_index=True).value_counts(dropna=False))

# Encoding - count encoding separately for train and test
for feature in ['id_01', 'id_31', 'id_33', 'id_36']:
    train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
    test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

### Enriching with email characteristics

In [17]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 
          'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 
          'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 
          'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 
          'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 
          'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 
          'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 
          'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 
          'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 
          'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 
          'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 
          'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 
          'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 
          'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']

### Email domain Alexa ranking

In [32]:
from lxml import html
import urllib.request
x_path_rank = '/html/body/div/div/table/tbody/tr/td[1]/div[1]/a'
x_path_traffic_rank = '/html/body/div/div/table/tbody/tr/td[2]/div[1]/a'
x_path_sites = '/html/body/div/div/table/tbody/tr/td[3]/div[1]/a'

alexa_cache = {}

In [38]:
def rank_domain(domain):
    if domain in ['', None, np.nan]:
        return (None, None, None)
    
    if domain == 'gmail':
        domain = 'gmail.com'
    
    if domain in alexa_cache:
        return alexa_cache[domain]
    
    with urllib.request.urlopen(f"https://www.alexa.com/minisiteinfo/{domain}") as url:
        print(f'Querying Alexa for ranking of {domain}.')
        
        s = url.read()
        tree = html.fromstring(s)
        try:
            rank = int(tree.xpath(x_path_rank)[0].text_content().replace(',', ''))
        except IndexError:
            print(f'No rank for domain {domain}...')
            rank = None
        except ValueError:
            rank = 0
            
        try:
            traffic = int(tree.xpath(x_path_traffic_rank)[0].text_content().replace(',', ''))
        except IndexError:
            print(f'No rank for domain {domain}...')
            traffic = None
        except ValueError:
            traffic = 0
            
        try:
            sites = int(tree.xpath(x_path_sites)[0].text_content().replace(',', ''))
        except IndexError:
            print(f'No rank for domain {domain}...')
            sites = None       
        except ValueError:
            sites = 0
        
        alexa_cache[domain] = (rank, traffic, sites)
    
        return rank, traffic, sites

In [39]:
for c in ['P_emaildomain', 'R_emaildomain']:
    train[c + '_rank'] = train[c].apply(lambda x: rank_domain(x)[0])
    test[c + '_rank'] = test[c].apply(lambda x: rank_domain(x)[0])
    
    train[c + '_traffic_rank'] = train[c].apply(lambda x: rank_domain(x)[1])
    test[c + '_traffic_rank'] = test[c].apply(lambda x: rank_domain(x)[1])
    
    train[c + '_sites'] = train[c].apply(lambda x: rank_domain(x)[2])
    test[c + '_sites'] = test[c].apply(lambda x: rank_domain(x)[2])

Querying Alexa for ranking of servicios-ta.com.
No rank for domain servicios-ta.com...
No rank for domain servicios-ta.com...
Querying Alexa for ranking of earthlink.net.
Querying Alexa for ranking of hotmail.es.
No rank for domain hotmail.es...
Querying Alexa for ranking of cfl.rr.com.
Querying Alexa for ranking of roadrunner.com.
Querying Alexa for ranking of netzero.net.
Querying Alexa for ranking of gmx.de.
No rank for domain gmx.de...
Querying Alexa for ranking of suddenlink.net.
Querying Alexa for ranking of frontiernet.net.
Querying Alexa for ranking of windstream.net.
Querying Alexa for ranking of frontier.com.
Querying Alexa for ranking of outlook.es.
No rank for domain outlook.es...
Querying Alexa for ranking of mac.com.
Querying Alexa for ranking of netzero.com.
No rank for domain netzero.com...
Querying Alexa for ranking of aim.com.
Querying Alexa for ranking of web.de.
Querying Alexa for ranking of twc.com.
No rank for domain twc.com...
Querying Alexa for ranking of cableo

### Other enriching

In [42]:
# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499

for c in ['P_emaildomain', 'R_emaildomain']:
    train[c + '_bin'] = train[c].map(emails)
    test[c + '_bin'] = test[c].map(emails)
    
    train[c + '_suffix'] = train[c].map(lambda x: str(x).split('.')[-1])
    test[c + '_suffix'] = test[c].map(lambda x: str(x).split('.')[-1])
    
    train[c + '_suffix'] = train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test[c + '_suffix'] = test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [43]:
for col in train.columns:
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))

In [45]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1569.88 MB
Memory usage after optimization is: 464.91 MB
Decreased by 70.4%
Memory usage of dataframe is 1345.95 MB
Memory usage after optimization is: 411.89 MB
Decreased by 69.4%


In [46]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

X_test = test.drop(['TransactionDT'], axis=1)

del train, test
gc.collect()

6459

## Sample input

In [47]:
X.loc[2987002].to_dict()

{'TransactionAmt': 59.0,
 'ProductCD': 4.0,
 'card1': 4663.0,
 'card2': 490.0,
 'card3': 150.0,
 'card4': 4.0,
 'card5': 166.0,
 'card6': 2.0,
 'addr1': 330.0,
 'addr2': 87.0,
 'dist1': 287.0,
 'P_emaildomain': 36.0,
 'R_emaildomain': 32.0,
 'C1': 1.0,
 'C2': 1.0,
 'C4': 0.0,
 'C5': 0.0,
 'C6': 1.0,
 'C7': 0.0,
 'C8': 0.0,
 'C9': 1.0,
 'C10': 0.0,
 'C11': 1.0,
 'C12': 0.0,
 'C13': 1.0,
 'C14': 1.0,
 'D1': 0.0,
 'D2': nan,
 'D3': nan,
 'D4': 0.0,
 'D5': nan,
 'D6': nan,
 'D8': nan,
 'D9': nan,
 'D10': 0.0,
 'D11': 315.0,
 'D12': nan,
 'D13': nan,
 'D14': nan,
 'D15': 315.0,
 'M2': 1.0,
 'M3': 1.0,
 'M4': 0.0,
 'M5': 0.0,
 'M6': 0.0,
 'M7': 0.0,
 'M8': 0.0,
 'M9': 0.0,
 'V3': 1.0,
 'V4': 1.0,
 'V5': 1.0,
 'V6': 1.0,
 'V7': 1.0,
 'V8': 1.0,
 'V9': 1.0,
 'V10': 0.0,
 'V11': 0.0,
 'V12': 1.0,
 'V13': 1.0,
 'V17': 0.0,
 'V19': 1.0,
 'V20': 1.0,
 'V29': 0.0,
 'V30': 0.0,
 'V33': 0.0,
 'V34': 0.0,
 'V35': 1.0,
 'V36': 1.0,
 'V37': 1.0,
 'V38': 1.0,
 'V40': 0.0,
 'V44': 1.0,
 'V45': 1.0,
 'V46'

## AutoML solution

In [None]:
# tX_train, tX_test, ty_train, ty_test = train_test_split(X.reset_index(drop=True), 
#                                                         y,
#                                                         train_size=0.8, test_size=0.20,
#                                                         stratify=y)

# tpot = TPOTClassifier(generations=6, population_size=25, 
#                       verbosity=2, 
#                       scoring='roc',
#                       n_jobs=11)

In [None]:
# tX_train = tX_train.reset_index(drop=True).replace([np.inf, -np.inf], np.nan).fillna(0)
# tX_test = tX_test.reset_index(drop=True).replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# np.any(np.isnan(tX_train)), np.all(np.isfinite(tX_train))

In [None]:
# tpot.fit(tX_train, ty_train)

In [None]:
# print(tpot.score(X_test, y_test))

## LightGBM

In [48]:
from sklearn.metrics import roc_auc_score

In [49]:
from sklearn.model_selection import KFold
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [50]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }

In [51]:
NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(X_test.shape[0])
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], 
                    verbose_eval=250, 
                    early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(X_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[250]	training's auc: 0.962531	valid_1's auc: 0.895617
[500]	training's auc: 0.984754	valid_1's auc: 0.908416
[750]	training's auc: 0.993985	valid_1's auc: 0.915462
[1000]	training's auc: 0.99775	valid_1's auc: 0.918845
[1250]	training's auc: 0.999173	valid_1's auc: 0.920399
[1500]	training's auc: 0.999675	valid_1's auc: 0.920805
[1750]	training's auc: 0.999871	valid_1's auc: 0.921073
[2000]	training's auc: 0.999956	valid_1's auc: 0.921459
[2250]	training's auc: 0.999985	valid_1's auc: 0.921878
[2500]	training's auc: 0.999995	valid_1's auc: 0.921819
Early stopping, best iteration is:
[2240]	training's auc: 0.999984	valid_1's auc: 0.921911
Fold 1 | AUC: 0.9219112539480251
Training until validation scores don't improve for 500 rounds.
[250]	training's auc: 0.961452	valid_1's auc: 0.912725
[500]	training's auc: 0.98508	valid_1's auc: 0.925987
[750]	training's auc: 0.994693	valid_1's auc: 0.933356
[1000]	training's auc: 0.99822

In [52]:
sub['isFraud'] = y_preds
sub.to_csv("submission-20190915-B.csv", index=False)

Score according to Kaggle: **0.9452**

Next step: to reach **0.9460** (0.008 better)