In [1]:
#  Libraries
import tqdm
import numpy as np 
import pandas as pd
import gc
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the feature names to train on: one name per line of the text file.
# The context manager closes the file even if reading fails.
with open("features_optimized.txt", "r") as f:
    features_list = f.read().splitlines()
print (features_list)

# Load the prepared train and test tables.
train = pd.read_csv("train_prepared.csv")
test = pd.read_csv("test_prepared.csv")

['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V10', 'V11', 'V12', 'V13', 'V19', 'V20', 'V29', 'V30', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V47', 'V48', 'V49', 'V52', 'V53', 'V54', 'V56', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V74', 'V75', 'V76', 'V78', 'V81', 'V82', 'V83', 'V85', 'V87', 'V90', 'V91', 'V94', 'V95', 'V96', 'V99', 'V126', 'V127', 'V128', 'V130', 'V131', 'V139', 'V140', 'V143', 'V149', 'V150', 'V152', 'V156', 'V159', 'V160', 'V164', 'V165', 'V166', 'V170', 'V187', 'V189', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V218', 'V220', 'V221', 'V222', '

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are inspected; every other column (object, bool, int8, unsigned, ...) is
    left untouched. Integer columns are narrowed to the first of
    int8/int16/int32/int64 whose limits contain the column's min and max
    (limits inclusive). Float columns are narrowed to float16 or float32 on
    the same range test, otherwise cast to float64.

    Only the value *range* is checked. float16 and float32 carry fewer
    significant digits than float64, so narrowed float values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (e.g. an all-NaN column).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Label-encode every object-dtype column. Each encoder is fitted on the train
# and test values together so both frames share one string -> integer mapping.
for col in tqdm.tqdm(train.columns):
    if train[col].dtype != 'object':
        continue
    le = LabelEncoder()
    train_labels = list(train[col].astype(str).values)
    test_labels = list(test[col].astype(str).values)
    le.fit(train_labels + test_labels)
    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 355/355 [00:28<00:00, 12.33it/s]


In [5]:
# Downcast numeric columns of both frames (train first, then test).
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Keep only the columns named in features_list for modelling.
train_data, test_data = train[features_list], test[features_list]

Mem. usage decreased to 425.77 Mb (73.4% reduction)
Mem. usage decreased to 372.56 Mb (72.8% reduction)


In [6]:
# Training labels: the isFraud column (mydf and target name the same Series).
target = mydf = train['isFraud']
# One-column frame of test TransactionID values.
result_list = test["TransactionID"].to_frame(name="TransactionID")

In [7]:
# Drop the names of the full frames, then run a garbage-collection pass.
del test, train
gc.collect()

40

In [8]:
%%time
# Training
# Split by row position: the first four fifths of the index are fitted on and
# the remaining fifth is held out as the evaluation set for early stopping.
holdout_start = 4 * len(train_data) // 5
train_input = train_data.index[:holdout_start]
# NOTE(review): despite its name, train_output holds the held-out rows.
train_output = train_data.index[holdout_start:]

# NOTE(review): max_depth=10000 is far deeper than any tree is likely to grow,
#   so depth is effectively unbounded — confirm this is intended.
# NOTE(review): missing=-1 presumably matches how the prepared CSVs encode
#   missing values — verify against the preparation step.
xgb_params = dict(
    n_estimators=20000,
    max_depth=10000,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    reg_lambda=2,
    reg_alpha=1,
)
clf = xgb.XGBClassifier(**xgb_params)

# Fit on the first slice, reporting the eval metric on the held-out slice
# every 50 rounds and stopping after 500 rounds without improvement.
h = clf.fit(
    train_data.loc[train_input, features_list],
    target[train_input],
    eval_set=[(train_data.loc[train_output, features_list], target[train_output])],
    verbose=50,
    early_stopping_rounds=500,
)


[0]	validation_0-auc:0.83040
Will train until validation_0-auc hasn't improved in 500 rounds.
[50]	validation_0-auc:0.87071
[100]	validation_0-auc:0.87885
[150]	validation_0-auc:0.88354
[200]	validation_0-auc:0.88905
[250]	validation_0-auc:0.89428
[300]	validation_0-auc:0.89924
[350]	validation_0-auc:0.90302
[400]	validation_0-auc:0.90660
[450]	validation_0-auc:0.90947
[500]	validation_0-auc:0.91230
[550]	validation_0-auc:0.91495
[600]	validation_0-auc:0.91667
[650]	validation_0-auc:0.91811
[700]	validation_0-auc:0.91943
[750]	validation_0-auc:0.92031
[800]	validation_0-auc:0.92083
[850]	validation_0-auc:0.92122
[900]	validation_0-auc:0.92158
[950]	validation_0-auc:0.92167
[1000]	validation_0-auc:0.92181
[1050]	validation_0-auc:0.92197
[1100]	validation_0-auc:0.92203
[1150]	validation_0-auc:0.92204
[1200]	validation_0-auc:0.92209
[1250]	validation_0-auc:0.92194
[1300]	validation_0-auc:0.92185
[1350]	validation_0-auc:0.92172
[1400]	validation_0-auc:0.92167
[1450]	validation_0-auc:0.9218

In [9]:
%%time
# Take column 1 of predict_proba as the per-row score. The values are written
# out as probabilities; no thresholding to 0/1 happens here.
y_pred = clf.predict_proba(test_data)[:, 1]
results = result_list  # same frame object under a second name
results["isFraud"] = y_pred
results.to_csv("XGBoost_predictions.csv", index=False)

Wall time: 23.5 s
