Training a simple random forest, mostly to examine feature importance but also as a benchmark to measure other models against.

In [1]:
import os
import gc
import sys
import random
from pathlib import Path
from pprint import pprint, pformat
from functools import partial

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

py_version = sys.version.replace('\n', ' ')
print(f"Python version: {py_version}\n")
print(f"Sklearn version: {sklearn.__version__}")

INPUT = Path('/kaggle/input/ieee-reduce-mem')
pprint(list(INPUT.glob('*')))

Python version: 3.6.6 |Anaconda, Inc.| (default, Oct  9 2018, 12:34:16)  [GCC 7.3.0]

Sklearn version: 0.21.3
[PosixPath('/kaggle/input/ieee-reduce-mem/test.pkl'),
 PosixPath('/kaggle/input/ieee-reduce-mem/train.pkl'),
 PosixPath('/kaggle/input/ieee-reduce-mem/__results___files'),
 PosixPath('/kaggle/input/ieee-reduce-mem/custom.css'),
 PosixPath('/kaggle/input/ieee-reduce-mem/__notebook__.ipynb'),
 PosixPath('/kaggle/input/ieee-reduce-mem/__results__.html'),
 PosixPath('/kaggle/input/ieee-reduce-mem/__output__.json')]


In [2]:
# Cleaning categorical data
def get_cat_names(df):
    """Split up the categorical and numeric columns"""
    cards = [f"card{i}" for i in range(1, 7)]
    matches = [f"M{i}" for i in range(1, 10)]
    trx_cats = ["ProductCD", "addr1", "addr2", "P_emaildomain", "R_emaildomain", *cards, *matches]
    ids = [f"id_{i}" for i in range(12, 39)]
    id_cats = ["DeviceType", "DeviceInfo", *ids]
    cat_names = trx_cats + id_cats
    cat_names = [x for x in cat_names if x in df.columns]
    cont_names = [x for x in df.columns if x not in cat_names]
    return cat_names, cont_names

def find_null_cols(cat_cols, train, test):
    """List all columns with at least 1 null value"""
    list1 = train.columns[train.isna().any()].tolist()
    list2 = test.columns[test.isna().any()].tolist()
    null_num_cols = list(set(list1 + list2) - set(cat_cols))
    return null_num_cols

def get_all_allowed_categories(cat_names, train, test):
    """
    Returns a list of all categorical values for all categorical columns.
    Note: cat columns should already be category dtype
    """
    allowed_cats = dict()
    for col in cat_names:
        list1 = train[col].cat.categories.tolist()
        list2 = test[col].cat.categories.tolist()
        values = list(set(list1 + list2))
        allowed_cats[col] = values
    return allowed_cats

def create_is_null_cols(null_num_cols, df):
    """Creates new 'is_na_x' columns for each column that has na values"""
    for col in null_num_cols:
        new_col = f'{col}_isna'
        median = df[col].median()
        df[new_col] = df[col].isnull()
        df[col] = df[col].fillna(median)
    return df

def convert_na_values(cat_cols, df):
    """Convert na values into a 'missing' category value"""
    for col in cat_cols:
        if not 'missing' in df[col].cat.categories:
            df[col].cat.add_categories('missing', inplace=True)
        df[col] = df[col].fillna("missing")
    return df

def add_unknown_categories(allowed_categories, test_df):
    """
    Unknown values are values in the test set which do not appear in the training set,
    we have to categorize them differently from missing values
    """    
    for cat, allowed_values in allowed_categories.items():
        df_test[cat][~df_test[cat].isin(allowed_values)] = 'unknown'
    return test_df

def convert_cats_to_numeric(cat_cols, df):
    """Explicitely convert categorical columns into integer codes"""
    for col in cat_cols:
        df[col] = df[col].cat.codes
    return df

def convert_category_type(cat_cols, df):
    """Explicitely convert categorical columns into integer codes"""
    for col in cat_cols:
        df[col] = df[col].astype('category')
    return df

In [3]:
train = pd.read_pickle(INPUT / 'train.pkl')
test = pd.read_pickle(INPUT / 'test.pkl')

In [4]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [5]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.950001,W,10409,111.0,150.0,visa,226.0,debit,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0
3,3663552,18403310,284.950012,W,10989,360.0,150.0,visa,166.0,debit,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
4,3663553,18403317,67.949997,W,18018,452.0,150.0,mastercard,117.0,debit,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G9650 Build/R16NW


In [6]:
test.columns, train.columns

(Index(['TransactionID', 'TransactionDT', 'TransactionAmt', 'ProductCD',
        'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
        ...
        'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
        'DeviceType', 'DeviceInfo'],
       dtype='object', length=433),
 Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
        'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
        ...
        'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
        'DeviceType', 'DeviceInfo'],
       dtype='object', length=434))

In [7]:
# Pipeline
def make_pipeline(train, test):
    cat_cols, _ = get_cat_names(train)

    train = convert_category_type(cat_cols, train)
    test = convert_category_type(cat_cols, test)

    allowed_categories = get_all_allowed_categories(cat_cols, train, test)
    null_cols = find_null_cols(cat_cols, train, test)

    train = create_is_null_cols(null_cols, train)
    test = create_is_null_cols(null_cols, test)

    train = convert_na_values(cat_cols, train)
    test = convert_na_values(cat_cols, test)

    train = convert_cats_to_numeric(cat_cols, train)
    test = convert_cats_to_numeric(cat_cols, test)
    return train, test

train, test = make_pipeline(train, test)

In [8]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V43_isna,V312_isna,V146_isna,id_04_isna,V66_isna,V207_isna,V17_isna,V154_isna,V19_isna,V39_isna
0,2987000,0,86400,68.5,4,10095,500,42,1,38,...,True,False,True,True,False,True,False,True,False,True
1,2987001,0,86401,29.0,4,1372,303,42,2,2,...,False,False,True,True,False,True,False,True,False,False
2,2987002,0,86469,59.0,4,2833,389,42,3,58,...,False,False,True,False,False,True,False,True,False,False
3,2987003,0,86499,50.0,4,13341,466,42,2,14,...,False,False,True,True,False,True,False,True,False,False
4,2987004,0,86506,50.0,1,2712,413,42,2,2,...,True,False,False,False,True,False,True,False,True,True


In [9]:
test.tail()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V43_isna,V312_isna,V146_isna,id_04_isna,V66_isna,V207_isna,V17_isna,V154_isna,V19_isna,V39_isna
506686,4170235,34214279,94.679001,0,9746,273,74,2,93,2,...,False,False,True,True,False,True,False,True,False,False
506687,4170236,34214287,12.173,0,1630,306,74,2,93,2,...,False,False,True,True,False,False,False,True,False,False
506688,4170237,34214326,49.0,4,11938,388,45,3,94,2,...,False,False,True,True,False,True,False,True,False,False
506689,4170238,34214337,202.0,4,11908,414,45,2,93,2,...,False,False,True,True,False,True,False,True,False,False
506690,4170239,34214345,24.346001,0,3552,68,39,3,38,1,...,False,False,True,True,False,False,False,True,False,False


In [10]:
# Save Output
train.to_pickle("clean_train.pkl")
test.to_pickle("clean_test.pkl")
del test
gc.collect()

0

## DATA CLEANING COMPLETE!

In [11]:
pct = 0.2
train, valid = train_test_split(train, test_size=pct)

In [12]:
train_idx = train['TransactionID']
train = train.drop('TransactionID', axis=1)
train = train.drop('TransactionDT', axis=1)

valid_idx = valid['TransactionID']
valid = valid.drop('TransactionID', axis=1)
valid = valid.drop('TransactionDT', axis=1)

In [13]:
train_y = train['isFraud']
train_x = train.drop('isFraud', axis=1)
valid_y = valid['isFraud']
valid_x = valid.drop('isFraud', axis=1)

In [14]:
del train, valid
gc.collect()

10

In [15]:
model = RandomForestRegressor(n_estimators=200, max_features=0.3,
                              min_samples_leaf=20, n_jobs=-1, verbose=1,
                              oob_score=False)

In [16]:
model.fit(train_x, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 47.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 49.6min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=0.3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=20, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=1,
                      warm_start=False)

In [17]:
valid_preds = model.predict(valid_x)
auc_score = roc_auc_score(valid_y, valid_preds)
print(f"AUC: {int(auc_score * 100)}%")

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    3.9s finished


AUC: 92%


In [18]:
len(valid_preds), len(valid_idx), len(valid_y), len(valid_x)

(118108, 118108, 118108, 118108)

In [19]:
validation = pd.DataFrame({'TransactionID':valid_idx,
                           'isFraud_preds':valid_preds, 
                           'isFraud_actual':valid_y})
validation.head(20)

Unnamed: 0,TransactionID,isFraud_preds,isFraud_actual
105383,3092383,0.003303,0
19382,3006382,0.002657,0
256058,3243058,0.000179,0
408920,3395920,0.051556,0
224308,3211308,0.011187,0
46352,3033352,0.0,0
252953,3239953,0.005366,0
392368,3379368,0.006021,0
330497,3317497,0.02746,0
137293,3124293,0.008791,0


In [20]:
del train_y, valid_x, valid_y, valid_preds, validation
gc.collect()

12

In [21]:
def pred_and_save(pkl):
    df = pd.read_pickle(pkl)
    idx = df['TransactionID']
    df = df.drop('TransactionID', axis=1)
    df = df.drop('TransactionDT', axis=1)
    if 'isFraud' in df.columns:
        actual = df['isFraud']
        df = df.drop('isFraud', axis=1)
    else:
        actual = list()
    preds = model.predict(df)
    data = {'TransactionID': idx, 
            'isFraud_preds': preds}
    if len(actual):
        data['isFraud_actual'] = actual
    
    data = pd.DataFrame(data)
    fname = pkl.rsplit('.', 1)[-1] + '.csv'
    data.to_csv(fname, index=False)
    data.head()

In [22]:
pred_and_save("clean_train.pkl")

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   18.3s finished


In [23]:
pred_and_save("clean_test.pkl")

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   15.1s finished


In [24]:
import joblib
joblib.dump(model, "random_forest.sav")

['random_forest.sav']