In [None]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random
import statistics
import pandas as pd
# Raise pandas' display limits so wide/long frames are not truncated in output.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import auc
from sklearn import metrics
import tqdm

import math
# Install a blanket "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings(action='ignore')

# Value passed as 'random_seed' in the CatBoost parameter dicts of this notebook.
SEED = 4041

In [None]:
#import in dataset
# Read the feature names, one per line. The context manager closes the file
# even if reading raises, which the manual open()/close() pair did not.
with open("features_optimized.txt","r") as f:
    features_list = f.read().splitlines()
print (features_list)

# Load the prepared train and test tables.
train = pd.read_csv("train_prepared.csv")
test = pd.read_csv("test_prepared.csv")

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when their min and max fit, otherwise to float64.

    NOTE: the float check looks at range only. float16 keeps roughly three
    significant decimal digits, so values stored as float16 may be rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: when true, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object, after the in-place casts.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme equals a type limit
            # (e.g. 127 for int8) still fits in that type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this also covers an all-NaN column,
                # where both comparisons are False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size cannot produce nan/inf.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
#fill nan with -999
# Replace missing values in place, test first and then train.
for frame in (test, train):
    frame.fillna(value=-999, inplace=True)

In [None]:
#encode the dataframe
# Label-encode every object-typed train column. The encoder is fitted on the
# combined train and test values so both frames share one integer mapping.
for col in tqdm.tqdm(train.columns):
    if train[col].dtype != 'object':
        continue

    # Convert to strings once and reuse the result for both fit and transform.
    train_values = list(train[col].astype(str).values)
    # A column absent from test is encoded from train values alone instead of
    # raising KeyError on test[col].
    in_test = col in test.columns
    test_values = list(test[col].astype(str).values) if in_test else []

    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[col] = le.transform(train_values)
    if in_test:
        test[col] = le.transform(test_values)

In [None]:
#reduce mem usage
# Downcast train first, then test, rebinding each name to the returned frame.
train, test = [reduce_mem_usage(frame) for frame in (train, test)]

In [None]:
#define catboost parameters
# Settings common to all four candidate configurations. The variants below
# differ only in 'depth' and 'l2_leaf_reg'.
_cat_shared = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'iterations': 10000,
    'learning_rate': 0.05,
    'random_seed': SEED,
    'metric_period': 500,
    'od_wait': 500,
    'use_best_model': True,
    'min_data_in_leaf': 256,
#     'early_stopping_rounds' : 200,
    'task_type': 'GPU',
    'verbose': True,
}

# Each call builds an independent dict: the shared settings plus overrides.
cat_params = dict(_cat_shared, depth=12, l2_leaf_reg=20)
cat_params1 = dict(_cat_shared, depth=10, l2_leaf_reg=20)
cat_params2 = dict(_cat_shared, depth=10, l2_leaf_reg=60)
cat_params3 = dict(_cat_shared, depth=10, l2_leaf_reg=100)

paramsList = [cat_params, cat_params1, cat_params2, cat_params3]

In [None]:
#isolate test and traindata
# Keep only the columns named in features_list, for test and then train.
test_data, train_data = (frame[features_list] for frame in (test, train))

In [None]:
# One-column frame holding the TransactionID values of the test rows.
result_list = test["TransactionID"].to_frame(name="TransactionID")
# Both names are bound to the same isFraud Series taken from train.
target = mydf = train['isFraud']

In [None]:
# Unbind the full frames (the selected copies stay in train_data/test_data),
# then run a garbage-collection pass.
del test, train
gc.collect()

In [None]:
#prepare folds for validation
folds = StratifiedKFold(
    n_splits=5,
    shuffle=False,  # rows are not shuffled before being split into folds
)

In [None]:
# X,y = train_data, target    
# P,P_y = test_data, target 

In [None]:
# oof = np.zeros(len(train_data))
# predictions = np.zeros(len(test_data))

In [None]:
# Load the sample submission; an 'isFraud' column is assigned to it once the
# final model has produced its predictions.
pred_temp = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [None]:
def createModel(folds,train, target,estimator):
    """Cross-validate ``estimator`` and report its per-fold validation AUC.

    For every split yielded by ``folds.split(train, target)`` the estimator
    is fitted on the training rows, with the held-out rows passed as
    ``eval_set``. The AUC reported by ``estimator.get_best_score()`` is then
    recorded. The same estimator instance is refitted on each fold.

    Args:
        folds: splitter exposing ``split(X, y)`` that yields pairs of
            positional index arrays (train positions, validation positions).
        train: feature frame supporting ``.iloc`` row selection.
        target: labels aligned row-for-row with ``train``. Either a pandas
            object (selected with ``.iloc``) or any positionally indexable
            array.
        estimator: model exposing ``fit(X, y, eval_set=...)`` and
            ``get_best_score()`` returning ``{"validation": {"AUC": ...}}``.

    Returns:
        list: the validation AUC of each fold, in fold order.
    """
    def _select_labels(idx):
        # The splitter yields positions, not index labels. Plain ``target[idx]``
        # on a Series is label-based, so it raises or misaligns whenever the
        # index is not the default 0..n-1 range.
        return target.iloc[idx] if hasattr(target, 'iloc') else target[idx]

    aucList = []
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print('Fold:',fold_)

        estimator.fit(
            train.iloc[trn_idx, :],
            _select_labels(trn_idx),
            eval_set=(train.iloc[val_idx, :], _select_labels(val_idx)),
        )
        score = estimator.get_best_score()
        aucList.append(score["validation"]["AUC"])
    print('Model Completed: \n')
    print('Average AUC: {}\n'.format(statistics.mean(aucList)))
    # This figure is the spread between the best and worst fold, not a
    # statistical variance, so it is labelled as a range.
    print('Range in AUC (max - min): {}\n'.format((max(aucList)-min(aucList))))
    return aucList

In [None]:
# Settings for the final model. The commented-out entries are options that are
# deliberately not passed here.
cat_params_best = dict(
    loss_function='Logloss',
#     eval_metric='AUC',
    iterations=2300,
    learning_rate=0.05,
    random_seed=SEED,
    metric_period=100,
#     od_wait=500,
#     use_best_model=True,
    depth=12,
    l2_leaf_reg=20,
    min_data_in_leaf=256,
#     early_stopping_rounds=200,
    task_type='GPU',
    verbose=True,
)

In [None]:
# Fit the final model on all of train_data using cat_params_best.
final_estimator = CatBoostClassifier(**cat_params_best)
final_estimator.fit(train_data, target)

# Take column 1 of the predict_proba output as the submitted isFraud score.
class_probabilities = final_estimator.predict_proba(test_data)
results = class_probabilities[:, 1]

# Store the scores in the submission frame and write it out without the index.
pred_temp['isFraud'] = results
pred_temp.to_csv(path_or_buf="cat_best.csv", index=False)