In [1]:
import os 
import gc

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

In [None]:
%%time

# LOAD TRAIN
# Transactions and identity records live in separate files keyed by TransactionID.
X_train = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')

# Row counts and column names are captured before the merge; later cells use
# them to compute identity coverage and to choose columns to drop.
train_tr_len = X_train.shape[0]
train_id_len = train_id.shape[0]

tr_cols = X_train.columns
id_cols = train_id.columns

print("Shape of train transactions =", X_train.shape)
print("Shape of train identity =", train_id.shape)

# Left join keeps every transaction, including those without an identity record.
X_train = X_train.merge(train_id, how='left', on='TransactionID')

# LOAD TEST
X_test = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

test_tr_len = X_test.shape[0]
test_id_len = test_id.shape[0]

# These two lines previously said "train", which mislabelled the test shapes.
print("Shape of test transactions =", X_test.shape)
print("Shape of test identity =", test_id.shape)

# Rename the test identity columns positionally to the train identity names.
# This relies on both files listing their columns in the same order.
fix = {o:n for o, n in zip(test_id.columns, train_id.columns)}
test_id.rename(columns=fix, inplace=True)

X_test = X_test.merge(test_id, how='left', on='TransactionID')

# TARGET

# Split the label off the feature frame, then free the identity frames.
y_train = X_train['isFraud'].copy()
del train_id, test_id, X_train['isFraud']; x = gc.collect()

In [None]:
# Show the shapes of the merged train/test frames and of the target.
X_train.shape, X_test.shape, y_train.shape

Check if all columns have been named correctly

In [None]:
# True when train and test expose exactly the same column names (order ignored).
set(X_train.columns) == set(X_test.columns)

### Drop columns based on number of null values

In [None]:
# One minus (identity rows / transaction rows) in the training files.
# NOTE(review): reads as "share of transactions without identity data" only if
# each transaction has at most one identity row — confirm.
id_ratio = 1 - train_id_len/train_tr_len
id_ratio

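Approximately $76\%$ of the training transactions have no identity information, so every identity column is at least $76\%$ null. The null-ratio threshold for dropping columns must therefore be set above $76\%$; otherwise every identity column would be dropped.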

In [None]:
# Per-column fraction of missing values in the training frame; the last line
# displays the largest of those fractions.
X_train_nulls = X_train.isna().sum() / len(X_train)
X_train_nulls.max()

In [None]:
# Column names of the train identity file, captured before the merge.
id_cols

In [None]:
# Flag columns whose null ratio in train is at least 0.9, then collect the
# flagged names separately for identity and transaction columns.
too_sparse = X_train_nulls >= 0.9

drop_id_cols = [col for col in id_cols if too_sparse.loc[col]]
# 'isFraud' was removed from X_train, so it is skipped before the lookup.
drop_tr_cols = [col for col in tr_cols if col != 'isFraud' and too_sparse.loc[col]]

drop_null_cols = drop_id_cols + drop_tr_cols

In [None]:
# Remove the mostly-null columns from both frames so they stay aligned.
X_train = X_train.drop(columns=drop_null_cols)
X_test = X_test.drop(columns=drop_null_cols)

In [None]:
# Shapes after dropping the mostly-null columns.
X_train.shape, X_test.shape

### Drop Correlated Columns

In [None]:
def make_corr(df, cols, title=''):
    """Plot an annotated correlation heatmap of ``TransactionDT`` plus ``cols``.

    df: DataFrame containing ``TransactionDT`` and every column in ``cols``.
    cols: list of column names; ``TransactionDT`` is prepended before plotting.
    title: figure title. When empty, the title is built from the first and
        last plotted column names (the first is always ``TransactionDT``).
    """
    plotted = ['TransactionDT'] + cols
    heading = title if title != '' else plotted[0] + ' - ' + plotted[-1]

    plt.figure(figsize=(15, 15))
    sns.heatmap(df[plotted].corr(), cmap='RdBu_r', annot=True, center=0.0)
    plt.title(heading, fontsize=14)
    plt.show()

In [None]:
def detect_corr_groups(df, col_subset, threshold=0.9):
    """Return the column pairs in ``col_subset`` correlated above ``threshold``.

    Only the strictly-lower triangle of the correlation matrix is considered,
    so each unordered pair is reported once as ``(row_column, col_column)``.
    Entries on and above the diagonal are zeroed rather than removed.
    """
    corr_matrix = df[col_subset].corr()
    below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
    pairwise = corr_matrix.where(below_diagonal, 0.0).stack()
    return pairwise[pairwise > threshold].index.tolist()

In [None]:
def chunks(lst, n):
    """Return a list of consecutive slices of ``lst``, each at most ``n`` long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` gives an empty list.
    """
    return [lst[start:start + n] for start in range(0, len(lst), n)]

In [None]:
# Collect the columns whose name starts with 'V' and split them into groups of 20.
v_cols = chunks([name for name in X_train if name.startswith('V')], 20)

In [None]:
# Correlation heatmap for the first group of V columns.
make_corr(X_train, v_cols[0])

In [None]:
def reduce_corr_tuple(df, col1, col2):
    """Choose which column of a correlated pair to remove.

    Returns ``col1`` when it has strictly more distinct values than ``col2``;
    otherwise (including ties) returns ``col2``.
    """
    return col1 if df[col1].nunique() > df[col2].nunique() else col2

In [None]:
# First pass: look for correlated pairs inside each group of V columns and
# collect one column per pair for removal.
remove_sets = set()

for chunk_cols in tqdm(v_cols):
    chunk_removed = set()

    for first, second in detect_corr_groups(X_train, chunk_cols):
        # Skip a pair once either member is already scheduled for removal.
        if first not in chunk_removed and second not in chunk_removed:
            chunk_removed.add(reduce_corr_tuple(X_train, first, second))

    remove_sets |= chunk_removed

In [None]:
# Drop the columns selected by the per-group correlation pass from both frames.
first_pass_drops = list(remove_sets)
X_train = X_train.drop(columns=first_pass_drops)
X_test = X_test.drop(columns=first_pass_drops)

In [None]:
# Shapes after the per-group correlation drop.
X_train.shape, X_test.shape

In [None]:
# Re-chunk the surviving V columns and plot the second group of 20.
remaining_v_cols = [col for col in X_train.columns if col.startswith('V')]
test_cols = chunks(remaining_v_cols, 20)[1]

make_corr(X_train, test_cols)

We can see that there are still columns which are correlated with each other. This is problematic and we should make one more pass to remove any other correlated columns.

In [None]:
%%time

# Second pass: compare every remaining V column against every other one,
# not just the columns within the same group of 20.
v_cols = [col for col in X_train.columns if col.startswith('V')]

remove_set = set()

corr_pairs = detect_corr_groups(X_train, v_cols)

for left, right in corr_pairs:
    already_handled = left in remove_set or right in remove_set
    if not already_handled:
        remove_set.add(reduce_corr_tuple(X_train, left, right))

In [None]:
# Drop the columns selected by the second correlation pass from both frames.
second_pass_drops = list(remove_set)
X_train = X_train.drop(columns=second_pass_drops)
X_test = X_test.drop(columns=second_pass_drops)

In [None]:
# Shapes after the second correlation drop.
X_train.shape, X_test.shape

### Encode Categorical Variables

- One-hot encode: `ProductCD`, `card4`,`card6`, `M4`, `id_12`, `id_15`, `id_34`
- Freq encode: `P_emaildomain`, `R_emaildomain`, `id_30`, `id_31`, `id_33`, `DeviceInfo`
- Binary encode: `M1`, `M2`, `M3`, `M5`, `M6`, `M7`, `M8`, `M9`, `id_16`, `id_28`, `id_29`, `id_35`, `id_36`, `id_37`, `id_38`, `DeviceType`

In [None]:
# Categorical columns, grouped by the encoding applied to each group in the
# cells that follow (one-hot, binary 0/1, frequency).
one_hot_cols = ['ProductCD', 'card4', 'card6', 'M4', 'id_12', 'id_15', 'id_34']
binary_cols = ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_16', 'id_28',
               'id_29', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType']
freq_cols = ['P_emaildomain', 'R_emaildomain', 'id_30', 'id_31', 'id_33', 'DeviceInfo']

In [None]:
# True when train and test still expose the same column names (order ignored).
set(X_train.columns) == set(X_test.columns)

Ensure that binary variables have the same values in the test set

In [None]:
for col in binary_cols:
    # Compare only the non-null levels: NaN never compares equal to itself, so
    # sets that contain NaN can differ even when the real levels agree.
    train_values = set(X_train[col].dropna().unique())
    test_values = set(X_test[col].dropna().unique())
    # The tuple message names the offending column and both level sets.
    assert train_values == test_values, (col, train_values, test_values)

### Binary Encoding

In [None]:
for col in binary_cols:
    # Sort the non-null levels so the 0/1 assignment is deterministic and does
    # not depend on which level happens to appear first in X_train.
    levels = sorted(X_train[col].dropna().unique())
    # Fail early with the column name instead of silently encoding only the
    # first two levels of a column that is not actually binary.
    assert len(levels) <= 2, '{} is not binary: {}'.format(col, levels)

    mapping = {level: code for code, level in enumerate(levels)}
    # Missing values are left untouched by replace and stay NaN after the cast.
    X_train[col] = X_train[col].replace(mapping).astype(float)
    X_test[col] = X_test[col].replace(mapping).astype(float)

In [None]:
# Shapes after binary encoding (columns are overwritten in place, none added).
X_train.shape, X_test.shape

In [None]:
# Toy example: one-hot encode column 'x' and append the indicator columns
# next to the original column.
df = pd.DataFrame({'x': ['a', 'b', 'a', 'b']})

# Encode the column itself (not the literal string 'x'), then join the
# indicators with concat; get_dummies has no `axis` argument.
dum = pd.get_dummies(df['x'])
df = pd.concat([df, dum], axis=1)

### One-Hot Encoding

In [None]:
# get_dummies replaces the columns it encodes, so first keep a copy of each
# one under a temporary "<col>_one_hot" name.
for frame in (X_train, X_test):
    for col in one_hot_cols:
        frame[col + '_one_hot'] = frame[col].copy()

X_train = pd.get_dummies(X_train, columns=one_hot_cols)
X_test = pd.get_dummies(X_test, columns=one_hot_cols)

X_train.shape, X_test.shape

In [None]:
# Indicator columns created for train but absent from test, i.e. category
# levels that never occur in the test set.
missing_cols = set(X_train.columns) - set(X_test.columns)

# Give X_test exactly X_train's columns, in X_train's order. Train-only columns
# are added filled with 0. Test-only indicator columns, which train has no
# counterpart for, are dropped so both frames keep the same feature set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Give the temporary "<col>_one_hot" copies their original column names back.
restored_names = {col + '_one_hot': col for col in one_hot_cols}
X_train = X_train.rename(columns=restored_names)
X_test = X_test.rename(columns=restored_names)

In [None]:
# Shapes after one-hot encoding and column alignment.
X_train.shape, X_test.shape

### Frequency Encoder

In [None]:
class FreqEncoder:
    """Expanding (online) frequency encoder for one column.

    ``encode`` is meant to be called once per row, in order. It returns how
    often the row's value occurred among the rows seen *before* it, then
    updates the counts. ``predict_values`` uses the final counts without
    updating them.
    """

    def __init__(self, col):
        self.col = col
        # Number of rows passed to encode(), including rows with a null value.
        self.total_rows = 0
        # value -> number of times encode() has seen it.
        self.value_map = {}

    def freq_map(self):
        """Return ``{value: count / total_rows}`` for every value seen so far."""
        return {k: v/self.total_rows for k, v in self.value_map.items()}

    def predict_values(self, row):
        """Return the learned frequency of ``row[col]`` without updating counts.

        Null values are returned unchanged. Unseen values give 0, and so does
        any value when no row has been encoded yet (previously this case
        raised ZeroDivisionError).
        """
        value = row[self.col]

        if pd.isnull(value):
            return value

        if self.total_rows == 0:
            return 0.0

        return self.value_map.get(value, 0)/self.total_rows

    def encode(self, row):
        """Return the frequency of ``row[col]`` among earlier rows, then count it.

        Null values are returned unchanged; they increase the row total but
        are not added to ``value_map``.
        """
        self.total_rows += 1

        value = row[self.col]

        if pd.isnull(value):
            return value

        curr_freq = self.value_map.get(value, 0)
        # Rows seen before this one; the very first row divides by 1 instead
        # of 0, which yields a frequency of 0.
        curr_rows = self.total_rows - 1 if self.total_rows != 1 else 1

        self.value_map[value] = curr_freq + 1

        return curr_freq/curr_rows

In [None]:
for col in tqdm(freq_cols):
    fe = FreqEncoder(col)
    # The encoder reads only `col`, so apply it to a one-column frame. This
    # avoids building a full-width row Series for every row of the wide frame.
    # Rows are encoded in frame order.
    # NOTE(review): the expanding frequency assumes X_train is sorted by time — confirm.
    X_train[col + '_fe'] = X_train[[col]].apply(fe.encode, axis=1)
    X_test[col + '_fe'] = X_test[[col]].apply(fe.predict_values, axis=1)

### Expanding Window Aggregations on Identity Features

In [None]:
class TimeBasedExpandingAggregation:
    """Expanding share of ``col2``'s value inside each ``col1`` group.

    ``encode`` is meant to be called once per row, in order. For a row with
    group ``row[col1]`` and value ``row[col2]`` it returns the share of the
    earlier rows of that group that carried the same value, then updates the
    counts. Nothing in the class looks at timestamps: the window follows time
    only if the rows are supplied in chronological order.
    """

    # Was spelled `_init_`, so it never ran as the constructor and
    # `TimeBasedExpandingAggregation(col1, col2)` raised TypeError.
    def __init__(self, col1, col2):
        self.col1 = col1
        self.col2 = col2
        # group -> number of encoded rows in that group.
        self.agg_map = {}
        # (value, group) -> number of encoded rows with that value in that group.
        self.value_map = {}

    def freq_map(self):
        """Return ``{(value, group): count / rows_in_group}`` for all seen pairs."""
        # Keys are (value, group) tuples, so the group is the second element.
        # The previous version indexed an undefined `row` here.
        return {k: v/self.agg_map.get(k[1], 1) for k, v in self.value_map.items()}

    def predict_values(self, row):
        """Return the learned share for the row's (value, group) pair.

        Counts are not updated. A null in either column is returned unchanged;
        an unseen pair or group gives 0.
        """
        if pd.isnull(row[self.col1]):
            return row[self.col1]

        if pd.isnull(row[self.col2]):
            return row[self.col2]

        agg = row[self.col1]
        value = row[self.col2]
        return self.value_map.get((value, agg), 0)/self.agg_map.get(agg, 1)

    def encode(self, row):
        """Return the share among earlier rows of the group, then count the row.

        A null in either column is returned unchanged and the row is not
        counted. The first row of a group returns 0.
        """
        if pd.isnull(row[self.col1]):
            return row[self.col1]

        if pd.isnull(row[self.col2]):
            return row[self.col2]

        agg = row[self.col1]
        value = row[self.col2]

        curr_freq = self.value_map.get((value, agg), 0)
        curr_rows = self.agg_map.get(agg, 0)

        self.value_map[(value, agg)] = curr_freq + 1
        self.agg_map[agg] = curr_rows + 1

        # No earlier rows in this group: curr_freq is necessarily 0 here.
        if curr_rows == 0:
            return curr_freq

        return curr_freq/curr_rows

In [None]:
# Columns whose combinations are enumerated into `id_combinations` further down.
id_columns = ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'card4', 'card6']

In [None]:
# Put the target back on the training frame (it was split off into y_train after loading).
X_train['isFraud'] = y_train

In [None]:
# Shapes before saving; X_train now also carries the 'isFraud' column.
X_train.shape, X_test.shape

In [None]:
# index=False: the default would also write the row index, which comes back
# as a spurious "Unnamed: 0" feature column when the file is read again.
X_train.to_csv("train_set_without_id_features.csv", index=False)
X_test.to_csv("test_set_without_id_features.csv", index=False)

In [None]:
# Files saved with the row index contain an extra unnamed first column that
# pandas loads as "Unnamed: 0". Drop it when present so it is not used as a
# feature; errors='ignore' keeps this working for files saved without it.
X_train = pd.read_csv(
    "../input/frauddatawithoutaggregations/train_set_without_id_features.csv"
).drop(columns=['Unnamed: 0'], errors='ignore')
X_test = pd.read_csv(
    "../input/frauddatawithoutaggregations/test_set_without_id_features.csv"
).drop(columns=['Unnamed: 0'], errors='ignore')

X_train.shape, X_test.shape

In [None]:
from itertools import chain, combinations

def powerset(iterable):
    """Lazily iterate over every subset of ``iterable`` as a tuple.

    Subsets come out ordered by size, starting with the empty tuple:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = tuple(iterable)
    subsets_by_size = (combinations(pool, size) for size in range(len(pool) + 1))
    return chain.from_iterable(subsets_by_size)


# Every combination of 1 to 4 of the id_columns, each as a list of column names.
id_combinations = [list(subset) for subset in powerset(id_columns) if 1 <= len(subset) <= 4]

In [None]:
agg_cols = ['P_emaildomain', 'R_emaildomain', 'card4', 'card6']

# Expanding fraud rate per category value.
#
# The previous version keyed the feature on each row's own 'isFraud' value.
# That leaked the label into the training feature, and it raised KeyError on
# X_test, which has no 'isFraud' column. Here the training feature uses only
# the labels of earlier rows with the same category value, and the test
# feature uses the overall training fraud rate of that value.
# NOTE(review): "earlier" means earlier in frame order; this assumes X_train
# is sorted by time — confirm.
for col in tqdm(agg_cols):
    fraud_by_value = X_train.groupby(col)['isFraud']

    # Counts over the rows that precede the current one within its group.
    prior_rows = fraud_by_value.cumcount()
    prior_frauds = fraud_by_value.cumsum() - X_train['isFraud']

    # The first occurrence of a value has no history and gets 0; rows with a
    # missing category stay NaN.
    train_rate = (prior_frauds / prior_rows).where(prior_rows > 0, 0.0)
    X_train[col + '_fa'] = train_rate.where(X_train[col].notna())

    # Values never seen in train get 0; missing categories stay NaN.
    test_rate = X_test[col].map(fraud_by_value.mean())
    X_test[col + '_fa'] = test_rate.fillna(0.0).where(X_test[col].notna())

X_train = X_train.drop(columns=agg_cols)
X_test = X_test.drop(columns=agg_cols)

# index=False keeps the row index out of the saved files.
X_train.to_csv("train_set_without_id_features_with_agg.csv", index=False)
X_test.to_csv("test_set_without_id_features_with_agg.csv", index=False)

In [None]:
# index=False: the default would also write the row index, which comes back
# as a spurious "Unnamed: 0" column when the file is read again.
X_train.to_csv("train_set_without_id_features_with_agg.csv", index=False)
X_test.to_csv("test_set_without_id_features_with_agg.csv", index=False)