In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import pickle

import math
warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed the global Python and NumPy random generators.

    The same value is also written, as text, to the ``PYTHONHASHSEED``
    environment variable.

    :param seed: value handed to every seeding call      # type: int
    """
    seed_text = str(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    random.seed(seed)
    
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose limits contain the column's min and max.
    Float columns are cast to float16, then float32, on the same range test
    and fall back to float64 otherwise.  Columns of any other dtype are
    left untouched.

    The range tests are inclusive, so a column whose extreme value sits
    exactly on a dtype limit (e.g. a max of 127) still gets that dtype.

    NOTE(review): the float test only looks at the value range, so a
    float16/float32 downcast may round values (float16 keeps roughly three
    significant decimal digits).  Confirm this is acceptable for every
    float feature, in particular for columns holding identifiers.

    :param df: frame to shrink; it is modified in place   # type: pd.DataFrame
    :param verbose: print the final size and the saving   # type: bool
    :return: the same ``df`` object that was passed in
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; stop at the first one that fits.
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [3]:
########################### Model
import lightgbm as lgb

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    """Train one LightGBM model per K-fold split and average their test predictions.

    Relies on two module-level names that must exist when the function is
    called: ``SEED`` (random_state of the K-fold shuffle) and ``LOCAL_TEST``
    (when true, every fold is validated against ``tt_df`` instead of the
    held-out fold, and feature importances are printed).

    :param tr_df: training frame holding ``features_columns`` and ``target``
    :param tt_df: test frame holding ``features_columns``, ``target`` and
                  'TransactionID'
    :param features_columns: names of the columns fed to the model
    :param target: name of the label column
    :param lgb_params: parameter dict passed straight to ``lgb.train``
    :param NFOLDS: number of K-fold splits (and of models averaged)
    :return: new frame with 'TransactionID', ``target`` and 'prediction'
    """
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    X, y = tr_df[features_columns], tr_df[target]
    P, P_y = tt_df[features_columns], tt_df[target]

    # Copy so that adding 'prediction' below writes to an independent frame
    # rather than to a slice of the caller's data (SettingWithCopy).
    tt_df = tt_df[['TransactionID', target]].copy()
    predictions = np.zeros(len(tt_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:', fold_)
        # KFold yields positional indices, so the labels must be selected by
        # position too; plain y[...] would look them up by index label.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)

        if LOCAL_TEST:
            vl_data = lgb.Dataset(P, label=P_y)
        else:
            vl_data = lgb.Dataset(vl_x, label=vl_y)

        # NOTE(review): `verbose_eval` is accepted by the lightgbm release this
        # notebook was run with; newer releases may expect a logging callback
        # instead - confirm against the installed version.
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )

        # Each fold contributes an equal share of the final prediction.
        pp_p = estimator.predict(P)
        predictions += pp_p/NFOLDS

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(), X.columns)),
                                       columns=['Value', 'Feature'])
            print(feature_imp)

        # Free the per-fold datasets before the next fold is built.
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions

    return tt_df
## -------------------

In [4]:
########################### Vars
#################################################################################
# Label column used for training and for the exported submission.
TARGET = 'isFraud'
# True: hold out the last DT_M month of the training data as a local test set.
# False: load the real test set, train with more folds and write a submission.
LOCAL_TEST = False
# Not referenced by any other cell shown in this notebook.
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

# One seed for every random source; applied right away.
SEED = 42
seed_everything(SEED)

In [5]:
########################### Model params
# Base LightGBM settings. 'learning_rate', 'n_estimators' and
# 'early_stopping_rounds' are overwritten in the Model Train cell.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [6]:
########################### DATA LOAD
#################################################################################
print('Load Data')
# NOTE: read_pickle unpickles the file; only point it at trusted data.
train_df = pd.read_pickle('../../data/input/ieee-fe-with-some-eda/train_df.pkl')

if not LOCAL_TEST:
    test_df = pd.read_pickle('../../data/input/ieee-fe-with-some-eda/test_df.pkl')
else:
    # Local validation: the last DT_M value becomes the test set and training
    # keeps only rows more than one DT_M step before it.
    last_month = train_df['DT_M'].max()
    test_df = train_df[train_df['DT_M'] == last_month].reset_index(drop=True)
    train_df = train_df[train_df['DT_M'] < (last_month - 1)].reset_index(drop=True)

# Column names to leave out of the model's feature list.
remove_features = list(
    pd.read_pickle('../../data/input/ieee-fe-with-some-eda/remove_features.pkl')['features_to_remove'].values
)
print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 791) (506691, 791)


In [7]:
from lxml import html
import urllib.request
# XPath locations read by rank_domain: the first link in table cells 1, 2 and 3
# of the fetched page, stored as rank, traffic rank and sites respectively.
x_path_rank = '/html/body/div/div/table/tbody/tr/td[1]/div[1]/a'
x_path_traffic_rank = '/html/body/div/div/table/tbody/tr/td[2]/div[1]/a'
x_path_sites = '/html/body/div/div/table/tbody/tr/td[3]/div[1]/a'

# domain -> (rank, traffic, sites); filled by rank_domain so each domain is fetched once.
alexa_cache = dict()

In [8]:
def _first_int_at(tree, xpath, domain):
    """Return the text of the first node matching ``xpath`` in ``tree`` as an int.

    Commas are stripped before conversion.  Returns None (after printing a
    notice naming ``domain``) when nothing matches, and 0 when the text is
    not an integer.
    """
    try:
        return int(tree.xpath(xpath)[0].text_content().replace(',', ''))
    except IndexError:
        print(f'No rank for domain {domain}...')
        return None
    except ValueError:
        return 0


def rank_domain(domain):
    """Look up the Alexa mini-site figures for an e-mail domain.

    Missing values ('', None, NaN, 'nan') yield ``(None, None, None)`` without
    any request.  The bare name 'gmail' is looked up as 'gmail.com'.  Every
    fetched result is stored in the module-level ``alexa_cache`` and served
    from there on later calls.

    Errors raised by ``urllib.request.urlopen`` (network or HTTP failures)
    are not caught here and propagate to the caller.

    :param domain: domain name, or a missing-value marker
    :return: tuple ``(rank, traffic, sites)``; each entry is an int, or None
             when the page has no node at the corresponding XPath
    """
    if domain in ['', None, np.nan, 'nan']:
        return (None, None, None)

    # Catches NaN objects that are not the np.nan singleton tested above.
    if pd.isna(domain):
        return (None, None, None)

    if domain == 'gmail':
        domain = 'gmail.com'

    if domain in alexa_cache:
        return alexa_cache[domain]

    # Only the download needs the connection; parsing happens after it is closed.
    with urllib.request.urlopen(f"https://www.alexa.com/minisiteinfo/{domain}") as url:
        print(f'Querying Alexa for ranking of {domain}.')
        s = url.read()

    tree = html.fromstring(s)
    rank = _first_int_at(tree, x_path_rank, domain)
    traffic = _first_int_at(tree, x_path_traffic_rank, domain)
    sites = _first_int_at(tree, x_path_sites, domain)

    alexa_cache[domain] = (rank, traffic, sites)

    return rank, traffic, sites

In [10]:
def _read_pickle_file(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can run arbitrary code; only use it on trusted files.
    with open(path, mode='rb') as fh:
        return pickle.load(fh)


# Raw (un-encoded) e-mail domains, keyed by the column they came from.
# The feature loop calls .to_list() on each value - presumably pandas Series; confirm.
train_raw_emails = {
    'P_emaildomain': _read_pickle_file('train_raw_Pemails.pickle'),
    'R_emaildomain': _read_pickle_file('train_raw_Remails.pickle'),
}
test_raw_emails = {
    'P_emaildomain': _read_pickle_file('test_raw_Pemails.pickle'),
    'R_emaildomain': _read_pickle_file('test_raw_Remails.pickle'),
}

In [11]:
# Add <column>_rank, <column>_traffic_rank and <column>_sites to both frames.
# Each raw domain is looked up once per row and the (rank, traffic, sites)
# triple is fanned out into the three columns, rather than calling
# rank_domain three separate times for every row.
for c in ['P_emaildomain', 'R_emaildomain']:
    for frame, raw_emails in ((train_df, train_raw_emails), (test_df, test_raw_emails)):
        lookups = [rank_domain(x) for x in raw_emails[c].to_list()]

        frame[c + '_rank'] = [triple[0] for triple in lookups]
        frame[c + '_traffic_rank'] = [triple[1] for triple in lookups]
        frame[c + '_sites'] = [triple[2] for triple in lookups]

Querying Alexa for ranking of gmail.com.
Querying Alexa for ranking of outlook.com.
Querying Alexa for ranking of yahoo.com.
Querying Alexa for ranking of mail.com.
Querying Alexa for ranking of anonymous.com.
No rank for domain anonymous.com...
Querying Alexa for ranking of hotmail.com.
Querying Alexa for ranking of verizon.net.
Querying Alexa for ranking of aol.com.
Querying Alexa for ranking of me.com.
No rank for domain me.com...
Querying Alexa for ranking of comcast.net.
Querying Alexa for ranking of optonline.net.
Querying Alexa for ranking of cox.net.
Querying Alexa for ranking of charter.net.
Querying Alexa for ranking of rocketmail.com.
No rank for domain rocketmail.com...
Querying Alexa for ranking of prodigy.net.mx.
Querying Alexa for ranking of embarqmail.com.
No rank for domain embarqmail.com...
Querying Alexa for ranking of icloud.com.
Querying Alexa for ranking of live.com.mx.
No rank for domain live.com.mx...
Querying Alexa for ranking of live.com.
Querying Alexa for ra

In [12]:
########################### Final features list
# Every training column that is not on the removal list becomes a model feature.
features_columns = [name for name in train_df.columns if name not in remove_features]

########################### Final Minification
# Train first, then test, so the two memory reports print in that order.
train_df, test_df = [reduce_mem_usage(frame) for frame in (train_df, test_df)]

Mem. usage decreased to 1291.93 Mb (41.4% reduction)
Mem. usage decreased to 1116.31 Mb (41.2% reduction)


In [13]:
# Show the row at position 2 as a {column: value} mapping to eyeball the features.
third_row = train_df.iloc[2]
third_row.to_dict()

{'TransactionID': 2987002,
 'isFraud': 0,
 'TransactionDT': 86469,
 'TransactionAmt': 4.094,
 'ProductCD': 0.0204,
 'card1': 4664.0,
 'card2': 490.0,
 'card3': 150.0,
 'card4': 719649.0,
 'card5': 166.0,
 'card6': 824959.0,
 'addr1': 330.0,
 'addr2': 87.0,
 'dist1': 287.0,
 'dist2': nan,
 'P_emaildomain': 35,
 'R_emaildomain': 49,
 'C1': 1.0,
 'C2': 1.0,
 'C3': 0.0,
 'C4': 0.0,
 'C5': 0.0,
 'C6': 1.0,
 'C7': 0.0,
 'C8': 0.0,
 'C9': 1.0,
 'C10': 0.0,
 'C11': 1.0,
 'C12': 0.0,
 'C13': 1.0,
 'C14': 1.0,
 'D1': 507854,
 'D2': 515566,
 'D3': 466020,
 'D4': 344864,
 'D5': 534216,
 'D6': 899261,
 'D7': 998181,
 'D8': 947967,
 'D9': 947967,
 'D10': 449929,
 'D11': 487,
 'D12': 963260,
 'D13': 911895,
 'D14': 919850,
 'D15': 683,
 'M1': 1.0,
 'M2': 1.0,
 'M3': 1.0,
 'M4': 0.03665,
 'M5': 0.0,
 'M6': 0.0,
 'M7': 0.0,
 'M8': 0.0,
 'M9': 0.0,
 'V1': 1.0,
 'V2': 1.0,
 'V3': 1.0,
 'V4': 1.0,
 'V5': 1.0,
 'V6': 1.0,
 'V7': 1.0,
 'V8': 1.0,
 'V9': 1.0,
 'V10': 0.0,
 'V11': 0.0,
 'V12': 1.0,
 'V13': 1.

In [14]:
########################### Model Train
if LOCAL_TEST:
    # Local validation: default fold count, large round budget, score on the hold-out.
    lgb_params.update({
        'learning_rate': 0.01,
        'n_estimators': 20000,
        'early_stopping_rounds': 100,
    })
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params)
    local_auc = metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction'])
    print(local_auc)
else:
    # Submission run: smaller learning rate, fixed round budget, 8 folds.
    lgb_params.update({
        'learning_rate': 0.005,
        'n_estimators': 1800,
        'early_stopping_rounds': 100,
    })
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, NFOLDS=8)

Fold: 0
516722 73818
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.949151	valid_1's auc: 0.932875
[400]	training's auc: 0.974908	valid_1's auc: 0.950543
[600]	training's auc: 0.988108	valid_1's auc: 0.961124
[800]	training's auc: 0.994375	valid_1's auc: 0.967363
[1000]	training's auc: 0.997391	valid_1's auc: 0.97067
[1200]	training's auc: 0.998753	valid_1's auc: 0.972998
[1400]	training's auc: 0.999387	valid_1's auc: 0.97437
[1600]	training's auc: 0.999691	valid_1's auc: 0.97546
[1800]	training's auc: 0.999844	valid_1's auc: 0.976297
Did not meet early stopping. Best iteration is:
[1800]	training's auc: 0.999844	valid_1's auc: 0.976297
Fold: 1
516722 73818
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.949678	valid_1's auc: 0.932012
[400]	training's auc: 0.974743	valid_1's auc: 0.95126
[600]	training's auc: 0.98822	valid_1's auc: 0.962914
[800]	training's auc: 0.994405	valid_1's auc: 0.969128
[1000]	traini

In [15]:
########################### Export
if not LOCAL_TEST:
    # The submission carries the averaged prediction in the 'isFraud' column.
    test_predictions['isFraud'] = test_predictions['prediction']
    submission = test_predictions[['TransactionID','isFraud']]
    submission.to_csv('submission-20190916-A.csv', index=False)