In [None]:
#  Libraries
from tqdm import tqdm_notebook
from time import sleep
import numpy as np 
import pandas as pd
import gc
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the selected feature names (one per line of the text file) and the
# prepared train/test tables.
# The context manager closes the file even if reading raises.
with open("features_optimized.txt", "r") as f:
    # Skip blank / whitespace-only lines (e.g. a trailing empty line) so that an
    # empty string never ends up in the list used to select columns later on.
    features_list = [line for line in f.read().splitlines() if line.strip()]
print (features_list)

train = pd.read_csv("train_prepared.csv")
test = pd.read_csv("test_prepared.csv")

In [None]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. The frame is modified
    in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.
    allow_float16 : bool, default True
        When true (the historical behaviour), float columns whose range fits in
        float16 are stored as float16. Half precision keeps only about three
        significant decimal digits, so pass ``False`` to make float32 the
        narrowest float dtype used when that loss is not acceptable.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column's [min, max] is used.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to the dtype's own min/max is
                # still representable, so it must not force the next wider type.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Range not covered by any narrower float (this includes columns
                # whose min/max are NaN or infinite): keep full double precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Label-encode every object-dtype column of `train`, using one encoder per
# column fitted on the train and test values together so both frames share
# the same integer codes.
def _as_label_strings(series):
    """Return the column's values as a plain list of strings."""
    return list(series.astype(str).values)


for col in tqdm_notebook(train.columns):
    if train[col].dtype != 'object':
        continue  # non-object columns are left as they are
    le = LabelEncoder()
    le.fit(_as_label_strings(train[col]) + _as_label_strings(test[col]))
    train[col] = le.transform(_as_label_strings(train[col]))
    test[col] = le.transform(_as_label_strings(test[col]))

In [None]:
# Downcast numeric columns of both frames (train first, then test) to save memory.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Keep only the columns named in features_list for modelling.
train_data, test_data = train[features_list], test[features_list]

In [None]:
# Training labels: the 'isFraud' column of the training frame.
target = mydf = train['isFraud']
# One-column frame of test TransactionIDs; predictions are attached to it later.
result_list = test["TransactionID"].to_frame(name="TransactionID")

In [None]:
# The full frames are no longer referenced by name after this point
# (train_data, test_data, target and result_list were extracted above),
# so drop them and trigger a garbage-collection pass.
del test, train
gc.collect()

In [None]:
# Cross-validate the booster settings: 5 folds, AUC metric, at most 1000
# boosting rounds with early_stopping_rounds=50, logging every 50 rounds.
# NOTE(review): learning_rate is 0.1 here, while the classifier fitted in the
# final cell uses learning_rate=0.01 (and additionally missing=-1 and
# tree_method='exact'), so this CV does not score exactly that configuration —
# confirm which values are intended.
params = dict(
    objective='binary:logistic',
    colsample_bytree=0.4,
    subsample=0.8,
    learning_rate=0.1,
    max_depth=10,
    reg_alpha=0,
    reg_lambda=2,
)
d_train = xgb.DMatrix(train_data, label=target)

xgb_cv = xgb.cv(
    params,
    d_train,
    num_boost_round=1000,
    nfold=5,
    metrics='auc',
    early_stopping_rounds=50,
    verbose_eval=50,
)

In [None]:
# Final classifier, configured via keyword settings collected in one place.
# NOTE(review): missing=-1 presumably matches how the prepared data encodes
# missing values — confirm against the data-preparation step.
clf_settings = dict(
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,
    eval_metric='auc',
    tree_method='exact',
    predictor='cpu_predictor',
    reg_lambda=2,
    reg_alpha=0,
)
xgb_clf = xgb.XGBClassifier(**clf_settings)

# Fit on the entire training set (no hold-out split here).
model = xgb_clf.fit(train_data, target)

# Take column 1 of predict_proba for every test row. The values are kept as
# probabilities; they are not thresholded into 0/1 labels.
y_pred = xgb_clf.predict_proba(test_data)[:, 1]

# Attach the predictions to the TransactionID frame and write it out as CSV.
results = result_list
results["isFraud"] = y_pred
results.to_csv("XGBoost_predictions.csv", index=False)