In [67]:
import re
import math

import numpy as np
import pandas as pd
import pickle

In [68]:
# Read the raw train and test transaction tables from ./data.
transactions_train, transactions_test = map(
    pd.read_csv,
    ('./data/train_transaction.csv', './data/test_transaction.csv'),
)

In [69]:
def extract_match_count(m4_value):
    """Convert one raw ``M4`` cell into a float.

    Parameters
    ----------
    m4_value : str or float
        A string containing ``M`` followed by a digit (e.g. ``'M2'``), or a
        float (including the NaN used for missing cells).

    Returns
    -------
    float
        The digit following ``M`` as a float, or ``m4_value`` itself when it
        is already a float (so NaN stays NaN).

    Raises
    ------
    ValueError
        If a string does not contain ``M`` followed by a digit.
    TypeError
        If the value is neither a float nor a string (raised by ``re.search``).
    """
    # isinstance (not ``type(...) == float``) so float subclasses such as
    # numpy.float64 are passed through instead of being handed to the regex.
    if isinstance(m4_value, float):
        return m4_value

    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    match = re.search(r'M(\d)', m4_value)
    if match is None:
        raise ValueError(f'unrecognized M4 value: {m4_value!r}')
    return float(match[1])

In [70]:
# List every training column whose dtype is neither float64 nor int64.
for col_name, col_dtype in transactions_train.dtypes.items():
    if col_dtype in ('float64', 'int64'):
        continue
    print(f'{col_name}: {col_dtype}')

ProductCD: object
card4: object
card6: object
P_emaildomain: object
R_emaildomain: object
M1: object
M2: object
M3: object
M4: object
M5: object
M6: object
M7: object
M8: object
M9: object


In [71]:
# Vesta's description of the columns:

# TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
# TransactionAMT: transaction payment amount in USD
# ProductCD: product code, the product for each transaction
# card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
# addr: address
# dist: distance
# P_emaildomain and R_emaildomain: purchaser and recipient email domain
# C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
# D1-D15: timedelta, such as days between previous transaction, etc.
# M1-M9: match, such as names on card and address, etc.
# Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

# Categorical Features:
# ProductCD
# card1 - card6
# addr1, addr2
# P_emaildomain, R_emaildomain
# M1 - M9

def clean(transaction_orig, verbose=False):
    """Return a cleaned copy of a raw transaction table, indexed by ``TransactionID``.

    The input frame is not modified. On the copy:

    * every column whose name starts with ``M`` followed by a digit, except
      ``M4``, has 'T' replaced by True and 'F' by False and is then cast to
      float (T -> 1.0, F -> 0.0, missing values stay NaN);
    * ``M4`` is converted cell by cell with ``extract_match_count``;
    * ``TransactionID`` becomes the index.

    Parameters
    ----------
    transaction_orig : pandas.DataFrame
        Raw transactions; must contain ``M4`` and ``TransactionID`` columns.
    verbose : bool, default False
        When True, print one line for each converted column.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If a T/F column holds a value that cannot be cast to float.
    """
    transactions = transaction_orig.copy()

    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    flag_columns = [
        col_name for col_name in transactions
        if re.match(r'M\d', col_name) and col_name != 'M4'
    ]
    for col_name in flag_columns:
        trues = transactions[col_name] == 'T'
        falses = transactions[col_name] == 'F'
        transactions.loc[trues, col_name] = True
        transactions.loc[falses, col_name] = False
        # Cast to float rather than bool so missing values survive as NaN.
        transactions[col_name] = transactions[col_name].astype('float')
        if verbose:
            print(f'{col_name}: converted T/F to 1.0/0.0')

    # M4 is not a T/F flag; it presumably counts some kind of match
    # (meaning unconfirmed), so only its digit is kept.
    transactions['M4'] = transactions['M4'].apply(extract_match_count)
    if verbose:
        print('M4: extracted digit as float')

    return transactions.set_index('TransactionID')

In [72]:
# Clean each raw table and pickle the result into ./data.
for raw_transactions, clean_path in (
    (transactions_train, './data/train_transaction_clean.pkl'),
    (transactions_test, './data/test_transaction_clean.pkl'),
):
    with open(clean_path, 'wb') as clean_file:
        pickle.dump(clean(raw_transactions, verbose=True), clean_file)