In [1]:
import os
import pathlib

import numpy as np
import pandas as pd

import fastai
from fastai import metrics
from fastai.tabular import FillMissing, Categorify, Normalize, DatasetType
from fastai.tabular import TabularDataBunch, tabular_learner, TabularList, load_learner
# Log the installed fastai version next to the results; the recorded run of this
# notebook printed 1.0.60.
print(fastai.__version__)

# Show every file available under the Kaggle input mount so the dataset and
# model paths used below can be checked against what is actually attached.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Root of the attached inputs and the folder holding the exported learner.
INPUT = pathlib.Path('/kaggle/input')
MODELS = INPUT.joinpath('ieee-fastai-wd')

# Row cap applied when loading the test data; None keeps every row.
NROWS = None

1.0.60
/kaggle/input/ieee-reduce-mem/__results__.html
/kaggle/input/ieee-reduce-mem/custom.css
/kaggle/input/ieee-reduce-mem/test.pkl
/kaggle/input/ieee-reduce-mem/__notebook__.ipynb
/kaggle/input/ieee-reduce-mem/__output__.json
/kaggle/input/ieee-reduce-mem/train.pkl
/kaggle/input/ieee-reduce-mem/__results___files/__results___28_2.png
/kaggle/input/ieee-fastai-wd/__results__.html
/kaggle/input/ieee-fastai-wd/400-100-model_1.pkl
/kaggle/input/ieee-fastai-wd/custom.css
/kaggle/input/ieee-fastai-wd/__notebook__.ipynb
/kaggle/input/ieee-fastai-wd/__output__.json
/kaggle/input/ieee-fastai-wd/__results___files/__results___12_2.png
/kaggle/input/ieee-fastai-wd/__results___files/__results___10_2.png
/kaggle/input/ieee-fastai-wd/models/tmp.pth


In [2]:
def load_test_data():
    """Read the pickled test frame from the ``ieee-reduce-mem`` input folder.

    Returns:
        The object stored in ``test.pkl``. When the module-level ``NROWS`` is
        truthy only the first ``NROWS`` rows are kept; otherwise the full
        frame is returned.
    """
    frame = pd.read_pickle(INPUT / 'ieee-reduce-mem' / 'test.pkl')
    return frame[:NROWS] if NROWS else frame

def test_pipeline(test):
    """Wrap the test frame in a ``TabularList`` for use as a learner's test set.

    Args:
        test: DataFrame holding the test rows.

    Returns:
        The ``TabularList`` built from ``test`` (with a fresh 0..n-1 index),
        using the categorical/continuous split from ``get_cat_names`` and the
        ``FillMissing``, ``Categorify`` and ``Normalize`` procs.
    """
    print("==test_pipeline==")
    frame = test.reset_index(drop=True)
    cat_names, cont_names = get_cat_names(frame)
    return TabularList.from_df(
        frame,
        cat_names=cat_names,
        cont_names=cont_names,
        procs=[FillMissing, Categorify, Normalize],
    )

def get_cat_names(df):
    """Split the columns of ``df`` into categorical and continuous feature names.

    A fixed list of known categorical columns (transaction fields, ``card1``-
    ``card6``, ``M1``-``M9``, device fields and ``id_12``-``id_38``) is
    intersected with the columns actually present in ``df``. Every remaining
    column is treated as continuous. ``TransactionID``, ``TransactionDT`` and
    ``isFraud`` are dropped from both lists.

    Args:
        df: DataFrame whose ``columns`` are to be classified.

    Returns:
        ``(cat_names, cont_names)``: categorical names in the order of the
        fixed candidate list, continuous names in ``df`` column order.
    """
    print("\n==get_cat_names==")
    present = df.columns

    # Candidate categorical columns: transaction-level first, then identity-level.
    candidates = ["ProductCD", "addr1", "addr2", "P_emaildomain", "R_emaildomain"]
    candidates += [f"card{n}" for n in range(1, 7)]
    candidates += [f"M{n}" for n in range(1, 10)]
    candidates += ["DeviceType", "DeviceInfo"]
    candidates += [f"id_{n}" for n in range(12, 39)]

    cat_names = [name for name in candidates if name in present]
    cont_names = [name for name in present if name not in cat_names]

    # These columns are excluded from the feature lists.
    for unwanted in ('TransactionID', 'TransactionDT', 'isFraud'):
        for names in (cat_names, cont_names):
            if unwanted in names:
                names.remove(unwanted)

    print(f"cat_names: {len(cat_names)} {cat_names[:10]}...")
    print(f"cont_names: {len(cont_names)} {cont_names[:10]}...")
    return cat_names, cont_names

def get_test_learner():
    """Load the exported learner with the test data attached as its test set.

    Returns:
        The learner restored by ``load_learner`` from ``400-100-model_1.pkl``
        in ``MODELS``, given the output of ``test_pipeline`` as ``test``.
    """
    test_items = test_pipeline(load_test_data())
    return load_learner(MODELS, '400-100-model_1.pkl', test=test_items)

def save_test_set_preds(preds, indexes):
    """Write two submission CSVs, one per entry of ``preds``.

    ``submission1.csv`` takes its ``isFraud`` column from ``preds[0]`` and
    ``submission2.csv`` from ``preds[1]``; both use ``indexes`` as the
    ``TransactionID`` column and are written without the DataFrame index.

    Args:
        preds: Indexable with at least two entries, each an iterable of values
            convertible with ``float()``.
            NOTE(review): presumably entry 0 / entry 1 are the per-row scores
            for the negative / positive class; confirm which file to submit.
        indexes: Values for the ``TransactionID`` column, one per prediction.
    """
    for position, filename in enumerate(('submission1.csv', 'submission2.csv')):
        scores = list(map(float, preds[position]))
        submission = pd.DataFrame({'TransactionID': indexes, 'isFraud': scores})
        submission.to_csv(filename, index=False)

## TEST AGAINST KNOWN LABELS
* Let's save the validation test output for later analysis.

In [3]:
# Restore the trained learner with the test data attached.
learn = get_test_learner()

# Predict on the attached test set; `y` is returned alongside the predictions
# and is not used below.
raw_preds, y = learn.get_preds(ds_type=DatasetType.Test)

# Transpose so that preds[k] holds the k-th value of every row
# (assumes raw_preds iterates row by row; TODO confirm).
preds = list(zip(*raw_preds))

# TransactionID values taken from the frame behind the test items.
test_frame = learn.data.test_ds.x.inner_df
indexes = test_frame['TransactionID']

==test_pipeline==

==get_cat_names==
cat_names: 49 ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3', 'card4', 'card5']...
cont_names: 382 ['TransactionAmt', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7']...




In [4]:
# Write submission1.csv and submission2.csv to the working directory.
save_test_set_preds(preds, indexes)

### Concluding Thoughts
The score against the validation set was 0.93. When tested against the public test set, it scored significantly worse (~0.87). I don't think this is due to overfitting. Most of the other kernels experienced a similar deterioration in score, and it's likely because the test dataset and the train dataset were split by time. There were other significant differences between the train and test datasets as well, such as the proportion of missing or null values in each column. Future research should focus on how to handle these differences. Figuring out which features to exclude or manipulate will likely improve performance more than tweaking the model hyperparameters.