# Find Similar Weeks
- Train Separate Models Based on Week Similarities

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pylab as plt
%matplotlib inline

# Load the identity and transaction tables and key each one by TransactionID
# so they can be joined on their index.
train_id = pd.read_csv('../../input/train_identity.csv').set_index('TransactionID')
test_id = pd.read_csv('../../input/test_identity.csv').set_index('TransactionID')
train_tr = pd.read_csv('../../input/train_transaction.csv').set_index('TransactionID')
test_tr = pd.read_csv('../../input/test_transaction.csv').set_index('TransactionID')

# The sample submission keeps its default integer index.
ss = pd.read_csv('../../input/sample_submission.csv')

In [3]:
# Outer-join each transaction table with its identity table on the shared
# TransactionID index.
train = pd.merge(train_tr, train_id, how='outer', left_index=True, right_index=True)
test = pd.merge(test_tr, test_id, how='outer', left_index=True, right_index=True)

In [7]:
# Write the merged frames to parquet files in the same input directory the
# CSVs were read from.
train.to_parquet('../../input/train.parquet')
test.to_parquet('../../input/test.parquet')

In [8]:
# Inspect the column dtypes of the merged training frame.
train.dtypes

isFraud             int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
card1               int64
                   ...   
id_36              object
id_37              object
id_38              object
DeviceType         object
DeviceInfo         object
Length: 433, dtype: object

In [3]:
# https://www.kaggle.com/kevinbonnes/transactiondt-starting-at-2017-12-01
# TransactionDT is interpreted as seconds elapsed since START_DATE (see the
# kernel linked above for the rationale).
START_DATE = '2017-12-01'
startdate = dt.datetime.strptime(START_DATE, '%Y-%m-%d')
# Vectorised conversion: build one timedelta column per frame instead of
# calling a Python lambda once per row.
train['date'] = startdate + pd.to_timedelta(train['TransactionDT'], unit='s')
test['date'] = startdate + pd.to_timedelta(test['TransactionDT'], unit='s')

In [4]:
# Index both frames by the derived timestamp; ts_features reads it back from
# df.index.
# NOTE(review): set_index replaces the current index, so the TransactionID
# index set when the CSVs were loaded is not retained in either frame.
train = train.set_index('date')
test = test.set_index('date')

In [5]:
def ts_features(df, label=None):
    """
    Creates time series features from datetime index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetimes (the ``.dt`` accessor is applied
        to a column copied from the index).
    label : optional
        Unused. Kept so callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the added columns ``date``, ``hour``,
        ``dayofweek``, ``quarter``, ``month``, ``year``, ``dayofyear``,
        ``dayofmonth`` and ``weekofyear``. The input frame is not modified.
    """
    df = df.copy()
    df['date'] = df.index
    stamps = df['date'].dt
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day
    if hasattr(stamps, 'isocalendar'):
        # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in
        # 2.0; isocalendar().week is its replacement. It yields a nullable
        # UInt32 column, so cast back to a plain numeric dtype (float only
        # when missing timestamps make an integer cast impossible).
        iso_week = stamps.isocalendar().week
        if iso_week.isna().any():
            df['weekofyear'] = iso_week.astype('float64')
        else:
            df['weekofyear'] = iso_week.astype('int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old accessor.
        df['weekofyear'] = stamps.weekofyear
    return df

In [6]:
# Add the calendar feature columns to both frames; ts_features works on a
# copy, so the result is rebound to the same names.
train = ts_features(train)
test = ts_features(test)

In [16]:
# Display the sample submission frame read from sample_submission.csv.
ss

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5
...,...,...
506686,4170235,0.5
506687,4170236,0.5
506688,4170237,0.5
506689,4170238,0.5


In [11]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [14]:
# List the distinct values (including NaN) found in card6.
train['card6'].unique()

array(['credit', 'debit', nan, 'debit or credit', 'charge card'],
      dtype=object)

In [8]:
# Mean and total TransactionAmt per (weekofyear, dayofweek), unstacked so that
# each week is one row and each (statistic, dayofweek) pair is one column.
# Only the grouping keys and TransactionAmt are read, so the frame is grouped
# directly rather than first copying it to drop unrelated columns.
train_wk = (
    train
    .groupby(['weekofyear', 'dayofweek'])['TransactionAmt']
    .agg(['mean', 'sum'])
    .unstack()
)

In [9]:
# Mean and total TransactionAmt per (weekofyear, dayofweek) for the test set,
# unstacked so that each week is one row and each (statistic, dayofweek) pair
# is one column. Only the grouping keys and TransactionAmt are read, so the
# frame is grouped directly rather than first copying it to drop unrelated
# columns.
test_wk = (
    test
    .groupby(['weekofyear', 'dayofweek'])['TransactionAmt']
    .agg(['mean', 'sum'])
    .unstack()
)

In [10]:
# Tag every week label with the table it came from ('tr_' for train, 'te_'
# for test) so rows from the two tables can be told apart.
train_wk.index = ['tr_{}'.format(week) for week in train_wk.index]
test_wk.index = ['te_{}'.format(week) for week in test_wk.index]

In [109]:
# Stack the train and test weekly tables row-wise into one frame.
wks = pd.concat([train_wk, test_wk])

In [110]:
from sklearn.neighbors import NearestNeighbors

In [113]:
from sklearn.impute import SimpleImputer
# Fill missing cells with the mean of their column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Each fit_transform call refits `imp` on the data passed to it, so the three
# results below are imputed with three different sets of column means.
# `wks` is rebound to the imputer's output here and no longer refers to the
# concatenated DataFrame.
wks = imp.fit_transform(wks)
wks_tr = imp.fit_transform(train_wk)
wks_te = imp.fit_transform(test_wk)

from sklearn.preprocessing import StandardScaler

In [114]:
# NOTE(review): this rebinds `ss`, which previously held the sample submission
# DataFrame read from sample_submission.csv.
ss = StandardScaler()

In [115]:
# Standardise the imputed train and test arrays. The scaler is refit on each
# call, so each array is scaled with its own statistics.
# NOTE(review): wks_tr / wks_te are not referenced by any later visible cell;
# the nearest-neighbour search is run on `wks` instead.
wks_tr = ss.fit_transform(wks_tr)
wks_te = ss.fit_transform(wks_te)

In [150]:
# Build a ball-tree neighbour index over the combined weekly table.
# NOTE(review): `wks` was mean-imputed but never passed through the
# StandardScaler, so the much larger `sum` columns presumably dominate the
# distance - confirm this is intended.
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(wks)

In [151]:
# For every row of `wks`, fetch its n_neighbors (10) closest rows; `indices`
# holds their row positions and `distances` the matching distances.
distances, indices = nbrs.kneighbors(wks)

In [152]:
# Rebuild the labelled train+test frame: `wks` was rebound to the imputer's
# output, so row labels for the neighbour positions are looked up here.
wks_combined = pd.concat([train_wk, test_wk])

In [157]:
# For every week, show the labels of the first 8 neighbours returned for it.
[[wks_combined.index[pos] for pos in row[:8]] for row in indices]

[['tr_1', 'tr_22', 'tr_3', 'tr_2', 'tr_5', 'tr_11', 'te_31', 'te_44'],
 ['tr_2', 'te_1', 'tr_11', 'tr_13', 'tr_3', 'tr_12', 'te_31', 'tr_6'],
 ['tr_3', 'te_1', 'tr_19', 'tr_17', 'tr_2', 'tr_16', 'tr_4', 'tr_11'],
 ['tr_4', 'tr_19', 'tr_16', 'tr_21', 'te_1', 'tr_17', 'tr_15', 'te_28'],
 ['tr_5', 'tr_13', 'tr_2', 'tr_18', 'te_50', 'tr_8', 'tr_49', 'te_1'],
 ['tr_6', 'te_1', 'tr_11', 'tr_12', 'te_31', 'tr_2', 'tr_14', 'tr_21'],
 ['tr_7', 'tr_12', 'tr_11', 'te_31', 'te_1', 'tr_2', 'tr_6', 'te_29'],
 ['tr_8', 'te_44', 'tr_22', 'tr_13', 'tr_3', 'tr_2', 'te_1', 'tr_5'],
 ['tr_9', 'te_51', 'tr_50', 'tr_48', 'tr_5', 'tr_49', 'tr_13', 'tr_51'],
 ['tr_10', 'tr_49', 'te_49', 'tr_18', 'tr_12', 'tr_11', 'te_50', 'te_1'],
 ['tr_11', 'tr_12', 'te_1', 'tr_20', 'tr_2', 'tr_21', 'tr_6', 'tr_3'],
 ['tr_12', 'tr_11', 'te_1', 'tr_21', 'tr_2', 'tr_7', 'tr_6', 'tr_20'],
 ['tr_13', 'tr_2', 'te_1', 'te_50', 'tr_12', 'tr_11', 'tr_4', 'te_35'],
 ['tr_14', 'tr_6', 'tr_18', 'tr_12', 'te_49', 'te_31', 'te_29', 'te_5

# Public Test
Weeks 27-32

     ['te_27', 'te_45', 'tr_21', 'tr_6', 'tr_15', 'tr_20', 'tr_4', 'tr_16'],
     ['te_28', 'tr_16', 'te_45', 'tr_15', 'tr_19', 'te_37', 'te_30', 'tr_4'],
     ['te_29', 'tr_6', 'tr_15', 'te_32', 'tr_7', 'te_31', 'te_40', 'tr_17'],
     ['te_30', 'te_37', 'te_42', 'te_46', 'te_41', 'te_43', 'tr_15', 'te_33'],
     ['te_31', 'tr_6', 'tr_2', 'te_1', 'tr_3', 'tr_7', 'te_40', 'tr_11'],
     ['te_32', 'te_42', 'te_38', 'te_43', 'tr_15', 'te_41', 'te_33', 'te_34'],



In [11]:
# Training weeks to keep - presumably picked by hand from the neighbour lists
# of test weeks 27-32 shown above (confirm with the author).
pub_test_like_weeks = [21, 16, 6, 15, 3]
train_for_pub_test = train[train['weekofyear'].isin(pub_test_like_weeks)]

In [12]:
# Row and column count of the selected training subset.
train_for_pub_test.shape

(98469, 442)

# Train Model only on subset of training data that looks like public test
https://www.kaggle.com/pipboyguy/catboost-and-eda

In [15]:

# NOTE(review): numpy and pandas are already imported in the first cell, and
# pandas is imported twice in this cell.
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random

import pandas as pd
# Raise pandas display limits so wide/long frames are not truncated as early.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import auc
import shap

from tqdm import tqdm

import math
# Suppress every warning from this point on.
warnings.filterwarnings('ignore')

# Seed passed as random_state to train_test_split.
SEED = 10



In [16]:
# Target vector and feature matrix from the selected training weeks; the test
# frame is copied so later column drops do not modify `test`.
y = train_for_pub_test["isFraud"]
X = train_for_pub_test.drop(columns=["isFraud"])
X_Test = test.copy()

#X_Test.drop(['TransactionID', 'isFraud'],axis=1,inplace=True) #getting rid of the trans.ID that

In [17]:
print(f"Before dropna, top missing columns:\n{X.isna().sum().sort_values(ascending = False).head(5)}\n")

# Largest share of missing values tolerated in a column (the author's own
# judgement call: more than 80% missing is considered too much).
thresh = 0.80

# dropna keeps a column only when it has at least `min_present` non-missing rows.
min_present = X.shape[0]*(1-thresh)
X_less_nas = X.dropna(axis='columns', thresh=min_present)

# Columns removed by the filter; the same columns are removed from the test frame.
cols_dropped = list(set(X.columns).difference(X_less_nas.columns))

X_Test = X_Test.drop(columns=cols_dropped)

# X_less_nas = reduce_mem_usage(X_less_nas)
# X_Test = reduce_mem_usage(X_Test)

print(f"After dropna, top missing columns:\n{X_less_nas.isna().sum().sort_values(ascending = False).head(5)}")

print(f"\nNo. of cols dropped = {len(set(X.columns)-set(X_less_nas.columns))}, or {len(set(X.columns)-set(X_less_nas.columns))/len(X.columns)*100:.2f}% of columns")

# Release the unfiltered frame; the collector's return value is the cell output.
del X
gc.collect()

Before dropna, top missing columns:
id_24    97937
id_25    97891
id_21    97890
id_08    97890
id_07    97890
dtype: int64

After dropna, top missing columns:
M5       54849
dist1    54326
M7       52673
M8       52672
M9       52672
dtype: int64

No. of cols dropped = 208, or 47.17% of columns


24

In [18]:
#according to https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-607486
# Columns to treat as categorical: ProductCD, card1-card6, addr1-addr2, the two
# e-mail domains, M1-M9, the device columns and id_12-id_38.
Catfeats = (
    ['ProductCD']
    + [f"card{i}" for i in range(1, 7)]
    + [f"addr{i}" for i in range(1, 3)]
    + ["P_emaildomain", "R_emaildomain"]
    + [f"M{i}" for i in range(1, 10)]
    + ["DeviceType", "DeviceInfo"]
    + [f"id_{i}" for i in range(12, 39)]
)

# Remove the columns discarded by the missing-value filter. Filtering in place
# of a set difference keeps the declared order, so the list is identical on
# every kernel restart instead of depending on string hash randomisation.
_dropped_lookup = set(cols_dropped)
Catfeats = [name for name in Catfeats if name not in _dropped_lookup]

In [19]:
# Every remaining column that is neither dropped nor listed as categorical.
Numfeats = list(set(X_less_nas.columns).difference(cols_dropped, Catfeats))


In [20]:
# Preview the categorical columns of the filtered training frame.
X_less_nas[Catfeats].head()

Unnamed: 0_level_0,card6,M7,card2,M5,M9,card1,P_emaildomain,M6,addr1,card4,M3,card3,M8,M4,M2,addr2,card5,ProductCD,M1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-01-15 00:00:05,debit,,111.0,T,,7207,yahoo.com,F,204.0,visa,,150.0,,M0,,87.0,226.0,W,
2018-01-15 00:01:00,debit,,105.0,,,13534,gmail.com,T,512.0,visa,,150.0,,,,87.0,226.0,W,
2018-01-15 00:01:11,debit,,268.0,,,12577,gmail.com,T,325.0,visa,T,150.0,,,T,87.0,166.0,W,T
2018-01-15 00:01:24,credit,,296.0,,,5009,gmail.com,,,visa,,185.0,,M2,,,102.0,C,
2018-01-15 00:01:35,debit,F,555.0,T,T,14408,gmail.com,F,420.0,visa,T,150.0,F,M0,T,87.0,226.0,W,T


In [21]:
# Replace every remaining missing value with the sentinel -10000. This applies
# to all columns, so text columns end up holding the integer sentinel next to
# their string values.
X_less_nas = X_less_nas.fillna(-10000)
X_Test = X_Test.fillna(-10000)

In [180]:
# Preview the training frame after the sentinel fill.
X_less_nas.head()


Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,...,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1
2018-01-15 00:00:05,3888005,280.0,W,7207,111.0,150.0,visa,226.0,debit,204.0,87.0,-10000.0,yahoo.com,313.0,275.0,0.0,0.0,104.0,323.0,0.0,0.0,193.0,0.0,253.0,1.0,641.0,195.0,173.0,172.0,1.0,103.0,1.0,340.0,-10000.0,340.0,-10000,-10000,-10000,M0,T,F,-10000,-10000,-10000,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,19.0,8.0,0.0,0.0,0.0,1.0,19.0,8.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,390.0,6891.0,2875.0,0.0,0.0,0.0,390.0,6891.0,2875.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,1.0,4.0,2.0,1.0,21.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,390.0,8935.0,3155.0,0.0,985.0,0.0,280.0,0.0,0.0,0.0,390.0,7950.0,2875.0,0.0,0.0,0.0,2018-01-15 00:00:05,0,0,1,1,2018,15,15,3
2018-01-15 00:01:00,3888060,57.95,W,13534,105.0,150.0,visa,226.0,debit,512.0,87.0,-10000.0,gmail.com,2.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,17.0,2.0,380.0,380.0,49.0,379.0,49.0,379.0,-10000.0,379.0,-10000,-10000,-10000,-10000,-10000,T,-10000,-10000,-10000,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-15 00:01:00,0,0,1,1,2018,15,15,3
2018-01-15 00:01:11,3888071,47.95,W,12577,268.0,150.0,visa,166.0,debit,325.0,87.0,0.0,gmail.com,2.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,11.0,1.0,524.0,524.0,15.0,524.0,15.0,72.0,498.0,524.0,T,T,T,-10000,-10000,T,-10000,-10000,-10000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,57.950001,0.0,0.0,57.950001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,57.950001,0.0,0.0,57.950001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-15 00:01:11,0,0,1,1,2018,15,15,3
2018-01-15 00:01:24,3888084,30.199,C,5009,296.0,185.0,visa,102.0,credit,-10000.0,-10000.0,-10000.0,gmail.com,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-10000.0,0.0,1.0,0.0,0.0,-10000.0,1.0,-10000,-10000,-10000,M2,-10000,-10000,-10000,-10000,-10000,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.1994,30.1994,30.1994,0.0,0.0,0.0,30.1994,30.1994,30.1994,0.0,0.0,0.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,30.1994,30.1994,30.1994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.1994,30.1994,30.1994,0.0,0.0,0.0,2018-01-15 00:01:24,0,0,1,1,2018,15,15,3
2018-01-15 00:01:35,3888095,49.0,W,14408,555.0,150.0,visa,226.0,debit,420.0,87.0,54.0,gmail.com,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,-10000.0,-10000.0,0.0,-10000.0,0.0,0.0,0.0,T,T,T,M0,T,F,F,F,T,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-15 00:01:35,0,0,1,1,2018,15,15,3


In [None]:
## quick test with AUC

# Stratified 80/20 hold-out split of the selected training weeks.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_less_nas, y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Log-loss objective with AUC reported as an additional metric. The iteration
# cap is set very high, so early stopping on the validation set is what ends
# training in practice.
cat_params = dict(
    loss_function='Logloss',
    custom_loss=['AUC'],
    logging_level='Silent',
    task_type='CPU',
    early_stopping_rounds=100,
    num_boost_round=5000000,
)

simple_model = CatBoostClassifier(**cat_params)

simple_model.fit(
    X_tr, y_tr,
    eval_set=(X_val, y_val),
    cat_features=Catfeats,
    plot=False,
);

# cv_params = model.get_params()

# cv_data = cv(
#     Pool( X.iloc[:2000,:5], y[:2000], `=[1]),
#     cv_params,nfold=4,
#     plot=True
# )


In [198]:
# Row and column count of the training part of the hold-out split.
X_tr.shape

(78775, 233)

In [199]:
# Best metric values recorded by the fitted model.
simple_model.best_score_

{'learn': {'Logloss': 0.04512705208681134},
 'validation_0': {'Logloss': 0.055547764193315835, 'AUC': 0.9591917910877962}}

In [200]:
# Iteration index reported as best by the fitted model.
simple_model.best_iteration_

999

In [194]:
# Final training on the whole selected training subset (no hold-out split).
# NOTE(review): no eval_set is passed to fit here, so early_stopping_rounds
# presumably has nothing to monitor - confirm.
cat_params = dict(
    loss_function='Logloss',
    custom_loss=['AUC'],
    logging_level='Silent',
    task_type='CPU',
    early_stopping_rounds=100,
)

simple_model = CatBoostClassifier(**cat_params)
simple_model.fit(X_less_nas, y, cat_features=Catfeats);



In [196]:
# Re-read the sample submission from '../../input/', the directory every other
# read in this notebook uses. (The earlier `ss` name can no longer be used for
# this: it was rebound to a StandardScaler.)
submission = pd.read_csv('../../input/sample_submission.csv')
# predict_proba returns one column per class; column 1 is taken as the fraud
# probability. The values are assigned positionally.
# NOTE(review): this assumes X_Test rows are in the same order as the
# submission rows - confirm, since X_Test no longer carries TransactionID.
submission['isFraud'] = simple_model.predict_proba(X_Test)[:,1] # you must predict a probability for the isFraud variable
submission.to_csv('simple_model_Catboost.csv', index=False)