In [1]:
# General imports and notebook-wide configuration.
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random
import statistics
import pandas as pd
# Widen pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import auc
from sklearn import metrics
import tqdm

import math
# NOTE(review): this silences every warning for the whole session, including
# deprecation and dtype warnings from pandas/sklearn.
warnings.filterwarnings('ignore')

# Seed handed to CatBoost through the 'random_seed' entry of the parameter dicts.
SEED = 4041

In [2]:
# Load the selected feature names (one per line) and the prepared train/test data.
# The context manager guarantees the file handle is closed even if reading fails.
with open("features_optimized.txt", "r") as f:
    features_list = f.read().splitlines()
print (features_list)

train = pd.read_csv("train_prepared.csv")
test = pd.read_csv("test_prepared.csv")

['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V10', 'V11', 'V12', 'V13', 'V19', 'V20', 'V29', 'V30', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V47', 'V48', 'V49', 'V52', 'V53', 'V54', 'V56', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V74', 'V75', 'V76', 'V78', 'V81', 'V82', 'V83', 'V85', 'V87', 'V90', 'V91', 'V94', 'V95', 'V96', 'V99', 'V126', 'V127', 'V128', 'V130', 'V131', 'V139', 'V140', 'V143', 'V149', 'V150', 'V152', 'V156', 'V159', 'V160', 'V164', 'V165', 'V166', 'V170', 'V187', 'V189', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V218', 'V220', 'V221', 'V222', '

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast
    to the first of int8/int16/int32/int64 whose range contains the column's
    min and max (bounds inclusive). Float columns are cast to float16 or
    float32 when the min and max fit, otherwise to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percent reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float check only looks at the value range, not at precision: a column
    that fits the float16 range is stored as float16 even if that rounds its
    values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # If nothing matches (e.g. empty column -> NaN min/max) the column
            # is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits neither narrower type
            # (this also covers inf and all-NaN columns).
            target_dtype = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target_dtype = candidate
                    break
            df[col] = df[col].astype(target_dtype)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint to avoid a division by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Replace every missing value in both frames with the sentinel -999.
test = test.fillna(-999)
train = train.fillna(-999)

In [5]:
# Label-encode every object-typed column. Each encoder is fitted on the string
# form of the train and test values together so both frames share one mapping.
for col in tqdm.tqdm(train.columns):
    if train[col].dtype != 'object':
        continue
    train_as_str = list(train[col].astype(str).values)
    test_as_str = list(test[col].astype(str).values)
    le = LabelEncoder()
    le.fit(train_as_str + test_as_str)
    train[col] = le.transform(train_as_str)
    test[col] = le.transform(test_as_str)

100%|████████████████████████████████████████████████████████████████████████████████| 355/355 [00:42<00:00,  8.35it/s]


In [6]:
# Downcast numeric columns of both frames; each call prints its own memory report.
train = reduce_mem_usage(train, verbose=True)
test = reduce_mem_usage(test, verbose=True)

Mem. usage decreased to 425.77 Mb (73.4% reduction)
Mem. usage decreased to 372.56 Mb (72.8% reduction)


In [7]:
# CatBoost parameter sets to compare. All four share the settings below and
# differ only in tree depth and L2 leaf regularisation.
# ('early_stopping_rounds' was left disabled in every configuration.)
_cat_shared_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'iterations': 10000,
    'learning_rate': 0.05,
    'random_seed': SEED,
    'metric_period': 500,
    'od_wait': 500,
    'use_best_model': True,
    'min_data_in_leaf': 256,
    'task_type': 'GPU',
    'verbose': True,
}

# dict(base, **overrides) builds an independent copy for each configuration.
cat_params = dict(_cat_shared_params, depth=12, l2_leaf_reg=20)
cat_params1 = dict(_cat_shared_params, depth=10, l2_leaf_reg=20)
cat_params2 = dict(_cat_shared_params, depth=10, l2_leaf_reg=60)
cat_params3 = dict(_cat_shared_params, depth=10, l2_leaf_reg=100)

paramsList = [cat_params, cat_params1, cat_params2, cat_params3]

In [8]:
# Keep only the selected feature columns in the modelling frames.
train_data = train.loc[:, features_list]
test_data = test.loc[:, features_list]

In [9]:
# One-column frame holding the test TransactionID values.
result_list = test.loc[:, ["TransactionID"]]
# The training label; `mydf` and `target` are two names for the same Series.
target = mydf = train['isFraud']

In [10]:
# The full frames are no longer needed once the feature/label subsets exist;
# drop the names and run a collection pass to release the memory.
del test, train
gc.collect()

80

In [11]:
# Prepare folds for validation: 5 stratified splits.
# shuffle=False means rows are not shuffled before splitting, so SEED is not used here.
folds = StratifiedKFold(n_splits=5,shuffle=False)

In [12]:
# X,y = train_data, target    
# P,P_y = test_data, target 

In [13]:
# oof = np.zeros(len(train_data))
# predictions = np.zeros(len(test_data))

In [14]:
# Load the submission template; its 'isFraud' column is overwritten with the
# model's predictions in a later cell before the file is written out.
# NOTE(review): assumes the template rows are in the same order as test_data — confirm.
pred_temp = pd.read_csv('sample_submission.csv')

In [15]:
def createModel(folds,train, target,estimator):
    """Cross-validate ``estimator`` and report its per-fold validation AUC.

    For every split produced by ``folds.split(train, target)`` the estimator is
    fitted on the training rows with the held-out rows passed as ``eval_set``,
    and the ``["validation"]["AUC"]`` entry of ``estimator.get_best_score()``
    is recorded. The same estimator object is refitted for each fold.

    Parameters
    ----------
    folds : splitter exposing ``split(X, y)`` that yields positional
        ``(train_idx, val_idx)`` index arrays.
    train : pandas.DataFrame
        Feature matrix; rows are selected with ``.iloc``.
    target : pandas.Series or array-like
        Labels aligned row-for-row with ``train``.
    estimator : object exposing ``fit(X, y, eval_set=...)`` and
        ``get_best_score()``; the reported score must contain an AUC entry
        for the validation set.

    Returns
    -------
    list
        The validation AUC of each fold, in fold order.
    """
    def _rows(values, idx):
        # Fold indices are positional. Plain `series[idx]` would look them up
        # as index *labels*, which fails or misaligns on any non-default index.
        return values.iloc[idx] if hasattr(values, 'iloc') else values[idx]

    aucList = []
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print('Fold:',fold_)

        estimator.fit(
            train.iloc[trn_idx, :],
            _rows(target, trn_idx),
            eval_set=(train.iloc[val_idx, :], _rows(target, val_idx)),
        )
        score = estimator.get_best_score()
        aucList.append(score["validation"]["AUC"])
    print('Model Completed: \n')
    print('Average AUC: {}\n'.format(statistics.mean(aucList)))
    # This figure is the spread between best and worst fold, not a variance.
    print('Range of AUC (max - min): {}\n'.format((max(aucList)-min(aucList))))
    return aucList

In [18]:
# Parameters for the final model: a fixed 2300 iterations, progress reported
# every 100. 'eval_metric', 'od_wait', 'use_best_model' and
# 'early_stopping_rounds' are deliberately left out of this configuration.
cat_params_best = dict(
    loss_function='Logloss',
    iterations=2300,
    learning_rate=0.05,
    random_seed=SEED,
    metric_period=100,
    depth=12,
    l2_leaf_reg=20,
    min_data_in_leaf=256,
    task_type='GPU',
    verbose=True,
)

In [22]:
# Train the final model on all training rows (no eval set), then score the test rows.
final_estimator = CatBoostClassifier(**cat_params_best)
final_estimator.fit(train_data, target)

# predict_proba returns one column per class; column 1 is taken as the fraud score.
class_proba = final_estimator.predict_proba(test_data)
results = class_proba[:, 1]

# Write the scores into the submission template and save it.
pred_temp['isFraud'] = results
pred_temp.to_csv("cat_best.csv", index=False)

0:	learn: 0.6016658	total: 129ms	remaining: 4m 55s
100:	learn: 0.0827476	total: 12.1s	remaining: 4m 24s
200:	learn: 0.0732190	total: 23.2s	remaining: 4m 2s
300:	learn: 0.0676976	total: 34.2s	remaining: 3m 47s
400:	learn: 0.0631380	total: 45.1s	remaining: 3m 33s
500:	learn: 0.0594666	total: 56.3s	remaining: 3m 22s
600:	learn: 0.0566594	total: 1m 7s	remaining: 3m 10s
700:	learn: 0.0538338	total: 1m 18s	remaining: 2m 59s
800:	learn: 0.0515561	total: 1m 29s	remaining: 2m 47s
900:	learn: 0.0493255	total: 1m 40s	remaining: 2m 36s
1000:	learn: 0.0474912	total: 1m 51s	remaining: 2m 25s
1100:	learn: 0.0459235	total: 2m 2s	remaining: 2m 13s
1200:	learn: 0.0441884	total: 2m 13s	remaining: 2m 2s
1300:	learn: 0.0429191	total: 2m 24s	remaining: 1m 50s
1400:	learn: 0.0417473	total: 2m 35s	remaining: 1m 39s
1500:	learn: 0.0405040	total: 2m 46s	remaining: 1m 28s
1600:	learn: 0.0394036	total: 2m 56s	remaining: 1m 17s
1700:	learn: 0.0382123	total: 3m 7s	remaining: 1m 6s
1800:	learn: 0.0371032	total: 3m 1