In [1]:
#  Libraries
import tqdm
import numpy as np 
import pandas as pd 
# Data processing, metrics and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold,KFold
from bayes_opt import BayesianOptimization
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
from sklearn import preprocessing
# Lgbm
import lightgbm as lgb
# Suppr warning
import warnings
warnings.filterwarnings("ignore")

import itertools
from scipy import interp

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

In [2]:
f = open("features_optimized.txt","r")
features_list = []
features_list = f.read().splitlines()
f.close()
print (features_list)

train = pd.read_csv("train_prepared.csv")
test = pd.read_csv("test_prepared.csv")

['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140', 'V

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
for col in tqdm.tqdm(train.columns): 
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))  

100%|████████████████████████████████████████████████████████████████████████████████| 355/355 [00:36<00:00,  9.83it/s]


In [5]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 425.77 Mb (73.4% reduction)
Mem. usage decreased to 372.56 Mb (72.8% reduction)


In [6]:
test_data = test[features_list]
train_data = train[features_list]
result_list = test["TransactionID"]

In [7]:
mydf = train['isFraud']
target = mydf

In [8]:
del(test)
del(train)

In [9]:
d_train = lgb.Dataset(train_data, label=target)

In [10]:
%%time
params= {'num_leaves': 693,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          #"bagging_seed": 11,
          "metric": 'auc',
          #"verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          #'random_state': 47
         }

result = lgb.train(params,d_train,1000,early_stopping_rounds = 20,valid_sets = d_train)
result.save_model('lgb_model.txt', num_iteration=result.best_iteration) 


[1]	training's auc: 0.891805
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.916676
[3]	training's auc: 0.926042
[4]	training's auc: 0.931578
[5]	training's auc: 0.936281
[6]	training's auc: 0.940978
[7]	training's auc: 0.945137
[8]	training's auc: 0.948853
[9]	training's auc: 0.952509
[10]	training's auc: 0.956439
[11]	training's auc: 0.959907
[12]	training's auc: 0.961975
[13]	training's auc: 0.966442
[14]	training's auc: 0.968828
[15]	training's auc: 0.971148
[16]	training's auc: 0.973239
[17]	training's auc: 0.975129
[18]	training's auc: 0.977767
[19]	training's auc: 0.979201
[20]	training's auc: 0.981022
[21]	training's auc: 0.982798
[22]	training's auc: 0.984173
[23]	training's auc: 0.985469
[24]	training's auc: 0.986337
[25]	training's auc: 0.987226
[26]	training's auc: 0.988011
[27]	training's auc: 0.988861
[28]	training's auc: 0.989508
[29]	training's auc: 0.990014
[30]	training's auc: 0.99073
[31]	training's auc: 0.991248
[32]	training's auc

[281]	training's auc: 1
[282]	training's auc: 1
[283]	training's auc: 1
[284]	training's auc: 1
[285]	training's auc: 1
[286]	training's auc: 1
[287]	training's auc: 1
[288]	training's auc: 1
[289]	training's auc: 1
[290]	training's auc: 1
[291]	training's auc: 1
[292]	training's auc: 1
[293]	training's auc: 1
[294]	training's auc: 1
[295]	training's auc: 1
[296]	training's auc: 1
[297]	training's auc: 1
[298]	training's auc: 1
[299]	training's auc: 1
[300]	training's auc: 1
[301]	training's auc: 1
[302]	training's auc: 1
[303]	training's auc: 1
[304]	training's auc: 1
[305]	training's auc: 1
[306]	training's auc: 1
[307]	training's auc: 1
[308]	training's auc: 1
[309]	training's auc: 1
[310]	training's auc: 1
[311]	training's auc: 1
[312]	training's auc: 1
[313]	training's auc: 1
[314]	training's auc: 1
[315]	training's auc: 1
[316]	training's auc: 1
[317]	training's auc: 1
[318]	training's auc: 1
[319]	training's auc: 1
[320]	training's auc: 1
[321]	training's auc: 1
[322]	training's

[622]	training's auc: 1
[623]	training's auc: 1
[624]	training's auc: 1
[625]	training's auc: 1
[626]	training's auc: 1
[627]	training's auc: 1
[628]	training's auc: 1
[629]	training's auc: 1
[630]	training's auc: 1
[631]	training's auc: 1
[632]	training's auc: 1
[633]	training's auc: 1
[634]	training's auc: 1
[635]	training's auc: 1
[636]	training's auc: 1
[637]	training's auc: 1
[638]	training's auc: 1
[639]	training's auc: 1
[640]	training's auc: 1
[641]	training's auc: 1
[642]	training's auc: 1
[643]	training's auc: 1
[644]	training's auc: 1
[645]	training's auc: 1
[646]	training's auc: 1
[647]	training's auc: 1
[648]	training's auc: 1
[649]	training's auc: 1
[650]	training's auc: 1
[651]	training's auc: 1
[652]	training's auc: 1
[653]	training's auc: 1
[654]	training's auc: 1
[655]	training's auc: 1
[656]	training's auc: 1
[657]	training's auc: 1
[658]	training's auc: 1
[659]	training's auc: 1
[660]	training's auc: 1
[661]	training's auc: 1
[662]	training's auc: 1
[663]	training's

[965]	training's auc: 1
[966]	training's auc: 1
[967]	training's auc: 1
[968]	training's auc: 1
[969]	training's auc: 1
[970]	training's auc: 1
[971]	training's auc: 1
[972]	training's auc: 1
[973]	training's auc: 1
[974]	training's auc: 1
[975]	training's auc: 1
[976]	training's auc: 1
[977]	training's auc: 1
[978]	training's auc: 1
[979]	training's auc: 1
[980]	training's auc: 1
[981]	training's auc: 1
[982]	training's auc: 1
[983]	training's auc: 1
[984]	training's auc: 1
[985]	training's auc: 1
[986]	training's auc: 1
[987]	training's auc: 1
[988]	training's auc: 1
[989]	training's auc: 1
[990]	training's auc: 1
[991]	training's auc: 1
[992]	training's auc: 1
[993]	training's auc: 1
[994]	training's auc: 1
[995]	training's auc: 1
[996]	training's auc: 1
[997]	training's auc: 1
[998]	training's auc: 1
[999]	training's auc: 1
[1000]	training's auc: 1
Did not meet early stopping. Best iteration is:
[805]	training's auc: 1
Wall time: 4min 32s


<lightgbm.basic.Booster at 0x1ac5b826488>

In [11]:
#Prediction
y_pred = result.predict(test_data)#convert into binary values
for i in range(len(test_data)):
    if y_pred[i]>=.5:       # setting threshold to .5
           y_pred[i]=1
    else:  
           y_pred[i]=0

In [12]:
# test["isFraud"] = y_pred
result_list["isFraud"] = y_pred
results = result_list
results.to_csv("predictions.csv",index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'