In [16]:
from __future__ import division
import sys
import os
import gc

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stat

In [17]:
# Files and Paths
files_ = {
    "historical_transactions": "data/historical_transactions.csv",
    "merchants": "data/merchants.csv",
    "new_merchant_transactions": "data/new_merchant_transactions.csv",
    "train": "data/train.csv",
    "test": "data/test.csv"
}

output_dir = "output/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Deleted as it was an interim step (saving space)
prepped_data_dir = 'preprocessed/'
if not os.path.exists(prepped_data_dir):
    os.makedirs(prepped_data_dir)

binned_data_dir = 'assembled_bins/'
if not os.path.exists(binned_data_dir):
    os.makedirs(binned_data_dir)
    
df_train = pd.read_csv(files_.get('train'))
df_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [18]:
def write_bin(data, path, fname):
    if fname in os.listdir(path):
        data.to_csv(
            path+fname, 
            mode='a', 
            header=False, 
            index=False
        )
    else:
        data.to_csv(
            path+fname,
            index=False
        )

In [None]:
# Split into Bins for Processing
df = pd.read_csv(files_.get("train"), usecols=['card_id'])
card_ids = list(pd.Series.unique(df['card_id']))
df = pd.read_csv(files_.get('test'), usecols=['card_id'])
card_ids.extend(list(pd.Series.unique(df['card_id'])))
card_ids = list(set(card_ids))
print("No. Unique Card IDs: {}".format(str(len(card_ids))))
assert len(set(card_ids)) == 325540

print(len(set([card_id[:6] for card_id in card_ids])))

# Add Merchant Details to Historical Transactions

In [19]:
# Read in tables as provided
i_ = 1
df_merchants = pd.read_csv(files_.get("merchants"))
for chunk in pd.read_csv(files_.get("historical_transactions"), chunksize=1000000):
    chunk = pd.merge(chunk, df_merchants, how='left', on=['merchant_id'], suffixes=['', '_merch'])
    write_bin(data=chunk, path=prepped_data_dir, fname="historical_transactions_preprocessed.csv")
    print("Chunk {} Complete".format(str(i_)))
    i_ += 1

Chunk 1 Complete
Chunk 2 Complete
Chunk 3 Complete
Chunk 4 Complete
Chunk 5 Complete
Chunk 6 Complete
Chunk 7 Complete
Chunk 8 Complete
Chunk 9 Complete
Chunk 10 Complete
Chunk 11 Complete
Chunk 12 Complete
Chunk 13 Complete
Chunk 14 Complete
Chunk 15 Complete
Chunk 16 Complete
Chunk 17 Complete
Chunk 18 Complete
Chunk 19 Complete
Chunk 20 Complete
Chunk 21 Complete
Chunk 22 Complete
Chunk 23 Complete
Chunk 24 Complete
Chunk 25 Complete
Chunk 26 Complete
Chunk 27 Complete
Chunk 28 Complete
Chunk 29 Complete
Chunk 30 Complete


# Historical Transactions Binning

In [22]:
if not os.path.exists(binned_data_dir):
    os.makedirs(binned_data_dir)
i_ = 1
for chunk in pd.read_csv(
    prepped_data_dir + "historical_transactions_preprocessed.csv",
    chunksize=1000000,
    dtype=str):
    chunk['binId'] = chunk['card_id'].str[:6]
    for bin_ in pd.Series.unique(chunk.binId):
        write_bin(data=chunk.loc[chunk['binId'] == bin_, :],
                  path=binned_data_dir,
                  fname="hist_bin_{}.csv".format(str(bin_)))
        print("Chunk {} Complete".format(str(i_)))
        i_ += 1

Chunk 1 Complete
Chunk 2 Complete
Chunk 3 Complete
Chunk 4 Complete
Chunk 5 Complete
Chunk 6 Complete
Chunk 7 Complete
Chunk 8 Complete
Chunk 9 Complete
Chunk 10 Complete
Chunk 11 Complete
Chunk 12 Complete
Chunk 13 Complete
Chunk 14 Complete
Chunk 15 Complete
Chunk 16 Complete
Chunk 17 Complete
Chunk 18 Complete
Chunk 19 Complete
Chunk 20 Complete
Chunk 21 Complete
Chunk 22 Complete
Chunk 23 Complete
Chunk 24 Complete
Chunk 25 Complete
Chunk 26 Complete
Chunk 27 Complete
Chunk 28 Complete
Chunk 29 Complete
Chunk 30 Complete
Chunk 31 Complete
Chunk 32 Complete
Chunk 33 Complete
Chunk 34 Complete
Chunk 35 Complete
Chunk 36 Complete
Chunk 37 Complete
Chunk 38 Complete
Chunk 39 Complete
Chunk 40 Complete
Chunk 41 Complete
Chunk 42 Complete
Chunk 43 Complete
Chunk 44 Complete
Chunk 45 Complete
Chunk 46 Complete
Chunk 47 Complete
Chunk 48 Complete
Chunk 49 Complete
Chunk 50 Complete
Chunk 51 Complete
Chunk 52 Complete
Chunk 53 Complete
Chunk 54 Complete
Chunk 55 Complete
Chunk 56 Complete
C

Chunk 438 Complete
Chunk 439 Complete
Chunk 440 Complete
Chunk 441 Complete
Chunk 442 Complete
Chunk 443 Complete
Chunk 444 Complete
Chunk 445 Complete
Chunk 446 Complete
Chunk 447 Complete
Chunk 448 Complete
Chunk 449 Complete
Chunk 450 Complete
Chunk 451 Complete
Chunk 452 Complete
Chunk 453 Complete
Chunk 454 Complete
Chunk 455 Complete
Chunk 456 Complete
Chunk 457 Complete
Chunk 458 Complete
Chunk 459 Complete
Chunk 460 Complete
Chunk 461 Complete
Chunk 462 Complete
Chunk 463 Complete
Chunk 464 Complete
Chunk 465 Complete
Chunk 466 Complete
Chunk 467 Complete
Chunk 468 Complete
Chunk 469 Complete
Chunk 470 Complete
Chunk 471 Complete
Chunk 472 Complete
Chunk 473 Complete
Chunk 474 Complete
Chunk 475 Complete
Chunk 476 Complete
Chunk 477 Complete
Chunk 478 Complete
Chunk 479 Complete
Chunk 480 Complete
Chunk 481 Complete
Chunk 482 Complete
Chunk 483 Complete
Chunk 484 Complete
Chunk 485 Complete
Chunk 486 Complete
Chunk 487 Complete
Chunk 488 Complete
Chunk 489 Complete
Chunk 490 Co

# Add Merchant Details to New Merchant Transactions

In [24]:
# Read in tables as provided
i_ = 1
df_merchants = pd.read_csv(files_.get("merchants"))
for chunk in pd.read_csv(files_.get("new_merchant_transactions"), chunksize=1000000):
    chunk = pd.merge(chunk, df_merchants, how='left', on=['merchant_id'], suffixes=['', '_merch'])
    write_bin(data=chunk, path=prepped_data_dir, fname="new_merchant_transactions_preprocessed.csv")
    print("Chunk {} Complete".format(str(i_)))
    i_ += 1

Chunk 1 Complete
Chunk 2 Complete


# New Merchant Transactions Binning

In [25]:
i_ = 1
header = True
for chunk in pd.read_csv(
    prepped_data_dir + "new_merchant_transactions_preprocessed.csv",
    chunksize=1000000,
    dtype=str):
    chunk['binId'] = chunk['card_id'].str[:6]
    for bin_ in pd.Series.unique(chunk.binId):
        write_bin(data=chunk.loc[chunk['binId'] == bin_, :],
                  path=binned_data_dir,
                  fname="new_merch_bin_{}.csv".format(str(bin_)))
    print("Chunk {} Complete".format(str(i_)))
    i_ += 1
        
        

Chunk 1 Complete
Chunk 2 Complete


# Train Binning

In [27]:
i_ = 1
for chunk in pd.read_csv(
    files_.get('train'),
    chunksize=1000000,
    dtype=str):
    chunk['binId'] = chunk['card_id'].str[:6]
    for bin_ in pd.Series.unique(chunk.binId):
        write_bin(data=chunk.loc[chunk['binId'] == bin_, :],
                  path=binned_data_dir,
                  fname='train_bin_{}.csv'.format(str(bin_)))
        print("Chunk {} Complete".format(str(i_)))
        i_ += 1

Chunk 1 Complete
Chunk 2 Complete
Chunk 3 Complete
Chunk 4 Complete
Chunk 5 Complete
Chunk 6 Complete
Chunk 7 Complete
Chunk 8 Complete
Chunk 9 Complete
Chunk 10 Complete
Chunk 11 Complete
Chunk 12 Complete
Chunk 13 Complete
Chunk 14 Complete
Chunk 15 Complete
Chunk 16 Complete


# Test Binning

In [29]:
i_ = 1
for chunk in pd.read_csv(
    files_.get('test'),
    chunksize=1000000,
    dtype=str):
    chunk['binId'] = chunk['card_id'].str[:6]
    for bin_ in pd.Series.unique(chunk.binId):
        write_bin(data=chunk.loc[chunk['binId'] == bin_, :],
                  path=binned_data_dir, 
                  fname='test_bin_{}.csv'.format(str(bin_)))
        print("Chunk {} Complete".format(str(i_)))
        i_ += 1

Chunk 1 Complete
Chunk 2 Complete
Chunk 3 Complete
Chunk 4 Complete
Chunk 5 Complete
Chunk 6 Complete
Chunk 7 Complete
Chunk 8 Complete
Chunk 9 Complete
Chunk 10 Complete
Chunk 11 Complete
Chunk 12 Complete
Chunk 13 Complete
Chunk 14 Complete
Chunk 15 Complete
Chunk 16 Complete
