In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import sklearn.tree as tree

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_transaction = pd.read_csv('/content/drive/My Drive/Fraud Credit Dataset/train_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('/content/drive/My Drive/Fraud Credit Dataset/train_identity.csv', index_col='TransactionID')
test_transaction = pd.read_csv('/content/drive/My Drive/Fraud Credit Dataset/test_transaction.csv', index_col='TransactionID')
test_identity = pd.read_csv('/content/drive/My Drive/Fraud Credit Dataset/test_identity.csv', index_col='TransactionID')

In [4]:
train = train_transaction.merge(train_identity, how = 'left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how = 'left', left_index=True, right_index=True)

In [5]:
tempData = train_transaction[["TransactionAmt","ProductCD","card4","isFraud"]] #Decision Tree taking only 4 Columns

In [6]:
#Stratified sampling - Splitting a data set to ensure that the train and test sets have approximately the same percentage of samples of each target class as the complete set.
decData = tempData.groupby('isFraud').apply(lambda x: x.sample(n=15000))
decData.reset_index(drop=True, inplace=True)
decData.replace({"card4":{"american express":0,"discover":1,"mastercard":2,"visa":3}, "ProductCD":{'C':0,'H':1,'R':2,'S':3,'W':4}}, inplace=True)
decData.dropna(axis=0, inplace=True)

In [7]:
from sklearn.model_selection import train_test_split

transFraudLabel = decData.loc[ : , "isFraud" ]
transCardData = decData.loc[ : , "TransactionAmt" : "card4" ]
decTrain, decTest, decTrainLabel, decTestLabel = train_test_split(transCardData, transFraudLabel)

In [8]:
from sklearn.metrics import accuracy_score

decTree =  tree.DecisionTreeClassifier()
decTree.fit(decTrain, decTrainLabel)
decPred = decTree.predict(decTest)
print(accuracy_score(decPred, decTestLabel))

0.6944147514698022


In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
 
randFor = RandomForestRegressor()
randFor.fit(decTrain, decTrainLabel)
randPred = randFor.predict(decTest)
print(roc_auc_score(decTestLabel, randPred))

0.7664210512032361


In [9]:
Y_train = train['isFraud']
X_train = train.drop('isFraud', axis=1)

In [10]:
X_test = test

In [11]:
del train, test, train_transaction, train_identity, test_transaction, test_identity

In [12]:
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [13]:
X_train

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,...,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2987000,86400,68.50,W,13926,-999.0,150.0,discover,142.0,credit,315.0,87.0,19.0,-999.0,-999,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,-999.0,13.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,13.0,13.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
2987001,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,-999.0,-999.0,gmail.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
2987002,86469,59.00,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,-999.0,outlook.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,315.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
2987003,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,-999.0,-999.0,yahoo.com,-999,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,-999.0,-999.0,-999.0,-999.0,84.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
2987004,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,-999.0,-999.0,gmail.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,0.0,70787.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,100.0,NotFound,-999.0,-480.0,New,NotFound,166.0,-999.0,542.0,144.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577535,15811047,49.00,W,6550,-999.0,150.0,visa,226.0,debit,272.0,87.0,48.0,-999.0,-999,-999,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,3.0,2.0,29.0,29.0,30.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,56.0,56.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3577536,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,debit,204.0,87.0,-999.0,-999.0,gmail.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3577537,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,debit,231.0,87.0,-999.0,-999.0,gmail.com,-999,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3577538,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,debit,387.0,87.0,3.0,-999.0,aol.com,-999,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,1.0,5.0,1.0,22.0,22.0,0.0,22.0,0.0,-999.0,-999.0,-999.0,-999.0,22.0,22.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999


In [14]:
X_test

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,...,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,id-11,id-12,id-13,id-14,id-15,id-16,id-17,id-18,id-19,id-20,id-21,id-22,id-23,id-24,id-25,id-26,id-27,id-28,id-29,id-30,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
3663549,18403224,31.950,W,10409,111.0,150.0,visa,226.0,debit,170.0,87.0,1.0,-999.0,gmail.com,-999,6.0,6.0,0.0,0.0,3.0,4.0,0.0,0.0,6.0,0.0,5.0,1.0,115.0,6.0,419.0,419.0,27.0,398.0,27.0,-999.0,-999.0,-999.000000,-999.000000,418.0,203.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3663550,18403263,49.000,W,4272,111.0,150.0,visa,226.0,debit,299.0,87.0,4.0,-999.0,aol.com,-999,3.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,12.0,2.0,149.0,149.0,7.0,634.0,7.0,-999.0,-999.0,-999.000000,-999.000000,231.0,634.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3663551,18403310,171.000,W,4476,574.0,150.0,visa,226.0,debit,472.0,87.0,2635.0,-999.0,hotmail.com,-999,2.0,2.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,2.0,0.0,22.0,2.0,137.0,137.0,10.0,97.0,10.0,-999.0,-999.0,-999.000000,-999.000000,136.0,136.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3663552,18403310,284.950,W,10989,360.0,150.0,visa,166.0,debit,205.0,87.0,17.0,-999.0,gmail.com,-999,5.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,7.0,4.0,42.0,42.0,41.0,242.0,41.0,-999.0,-999.0,-999.000000,-999.000000,242.0,242.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
3663553,18403317,67.950,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,87.0,6.0,-999.0,gmail.com,-999,6.0,6.0,0.0,0.0,2.0,5.0,0.0,0.0,5.0,0.0,6.0,0.0,14.0,6.0,22.0,22.0,0.0,22.0,0.0,-999.0,-999.0,-999.000000,-999.000000,22.0,22.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,34214279,94.679,C,13832,375.0,185.0,mastercard,224.0,debit,284.0,60.0,-999.0,-999.0,gmail.com,gmail.com,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,0.0,-999.0,-999.000000,-999.000000,0.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
4170236,34214287,12.173,C,3154,408.0,185.0,mastercard,224.0,debit,-999.0,-999.0,-999.0,157.0,hotmail.com,hotmail.com,1.0,3.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0,2.0,1.0,1.0,3.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,-999.000000,-999.000000,0.0,-999.0,...,-45.0,266704.0,-999.0,-999.0,-3.0,-10.0,-999.0,-999.0,-999.0,-999.0,100.0,NotFound,27.0,-999.0,New,NotFound,225.0,15.0,176.0,507.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,New,NotFound,-999,chrome 43.0 for android,-999.0,-999,-999,F,F,T,F,mobile,ALE-L23 Build/HuaweiALE-L23
4170237,34214326,49.000,W,16661,490.0,150.0,visa,226.0,debit,327.0,87.0,-999.0,-999.0,hotmail.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999
4170238,34214337,202.000,W,16621,516.0,150.0,mastercard,224.0,debit,177.0,87.0,-999.0,-999.0,hotmail.com,-999,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.000000,-999.000000,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999,-999,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999,-999.0,-999.0,-999.0,-999,-999,-999,-999,-999,-999.0,-999,-999,-999,-999,-999,-999,-999,-999


In [15]:
for column in X_test:
  if '-' in column:
    string1 = column
    string2 = column[0] + column[1] + '_' + column[3] + column[4]
    X_test.rename(columns = {string1:string2}, inplace = True)

In [16]:
for trainCol in X_train.columns:
    if (X_train[trainCol].dtype == 'object') or (X_test[trainCol].dtype == 'object'): 
        lbl = preprocessing.LabelEncoder()
        encodeLab = list(X_train[trainCol].values) + list(X_test[trainCol].values)
        lbl.fit(encodeLab)
        X_train[trainCol] = lbl.transform(list(X_train[trainCol].values))
        X_test[trainCol] = lbl.transform(list(X_test[trainCol].values))

In [17]:
clf = xgb.XGBClassifier(tree_method='gpu_hist')

clf.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [18]:
finPreds = clf.predict_proba(X_test)[:,1]

In [19]:
predYtrain = clf.predict(X_train)

In [20]:
print(accuracy_score(Y_train, predYtrain))

0.9737748501371626


In [21]:
from sklearn.model_selection import train_test_split
feature_train, feature_test, label_train, label_test = train_test_split(X_train, Y_train)

clf2 = xgb.XGBClassifier(tree_method='gpu_hist')
clf2.fit(feature_train, label_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [22]:
predYtest = clf2.predict(feature_test)
print(accuracy_score(label_test, predYtest))

0.9738612117722762
