In [41]:
import csv
from itertools import islice
import os
import pickle
import time

import pandas as pd
import tqdm
from sklearn import linear_model
from tpot import TPOTClassifier

from models.dummy import DummyModel

In [12]:
# Header row written as the first line of every predictions CSV.
PREDICTION_FILE_HEADERS = 'TransactionID,isFraud\n'

# Transaction columns fed to the model (quick and dirty selection).
# Deliberately left out: TransactionID, isFraud, ProductCD, card4, card6,
# P_emaildomain and R_emaildomain. Everything after M9 is also skipped
# (original note: "ignore their features for now").
FEATURE_COLUMNS = [
    'TransactionDT',
    'TransactionAmt',
    'card1', 'card2', 'card3', 'card5',
    'addr1', 'addr2',
    'dist1', 'dist2',
    *(f'C{number}' for number in range(1, 15)),  # C1 .. C14
    *(f'D{number}' for number in range(1, 16)),  # D1 .. D15
    *(f'M{number}' for number in range(1, 10)),  # M1 .. M9
]

In [13]:
# Load the cleaned train and test transaction objects from local pickle files
# (they are used as DataFrames further down).
# NOTE(review): pickle.load can run arbitrary code while deserializing, so
# these files must only ever come from a trusted source.
with open('./data/train_transaction_clean.pkl', 'rb') as train_file:
    transactions_train = pickle.load(train_file)

with open('./data/test_transaction_clean.pkl', 'rb') as test_file:
    transactions_test = pickle.load(test_file)

In [14]:
def features(cleaned_transactions, columns=None, fill_values=None):
    """Select the model input columns and fill in their missing values.

    Args:
        cleaned_transactions: DataFrame of transactions that contains every
            requested column.
        columns: Column labels to keep, in order. Defaults to the
            module-level ``FEATURE_COLUMNS``.
        fill_values: Replacement for missing entries, in any form accepted by
            ``DataFrame.fillna`` (for example a Series of medians computed on
            the training data, so train and test are imputed consistently).
            Defaults to the medians of the selected columns of
            ``cleaned_transactions`` itself.

    Returns:
        A new DataFrame holding only the selected columns with missing values
        filled. The input frame is not modified.
    """
    if columns is None:
        columns = FEATURE_COLUMNS
    selected = cleaned_transactions.loc[:, list(columns)]
    if fill_values is None:
        # Default behaviour: impute each column with its own median.
        fill_values = selected.median()
    return selected.fillna(fill_values)

In [15]:
def write_predictions(
    transaction_ids,
    predictions,
    filename='predictions',
    directory='./data/submissions'
):
    """Write a submission CSV pairing each transaction id with its prediction.

    The file is written to ``{directory}/{filename}.csv``. Its first line is
    ``PREDICTION_FILE_HEADERS``; every following line is ``id,prediction``.

    Args:
        transaction_ids: Sized iterable of transaction identifiers.
        predictions: Sized iterable of predictions, in the same order and of
            the same length as ``transaction_ids``.
        filename: Name of the output file without the ``.csv`` extension.
        directory: Folder that receives the file. It is created (including
            parents) when it does not exist yet.

    Raises:
        TypeError: If the two inputs differ in length. This is checked before
            anything is written.
    """
    if len(transaction_ids) != len(predictions):
        raise TypeError("labels don't match predictions")
    # Create the folder if needed so open() below cannot fail with
    # FileNotFoundError just because the directory is missing.
    if directory:
        os.makedirs(directory, exist_ok=True)
    output_path = os.path.join(directory, f'{filename}.csv')
    with open(output_path, 'w') as predictions_file:
        predictions_file.write(PREDICTION_FILE_HEADERS)
        predictions_file.writelines(
            f'{transaction_id},{prediction}\n'
            for transaction_id, prediction in zip(transaction_ids, predictions)
        )

In [16]:
# Build the model inputs and the target labels from the training data.
train_features = features(transactions_train)
train_targets = transactions_train.isFraud
# Drop the name for the full training object now that features and targets
# have been extracted (presumably to free memory).
# NOTE: this makes the cell non-idempotent -- running it a second time raises
# NameError unless the data-loading cell is executed again first.
del transactions_train

In [60]:
# Configure the TPOT pipeline search that is fitted in a later cell.
# The values are small (presumably to keep the search fast while
# experimenting -- TODO confirm before a real run).
pipeline_optimizer = TPOTClassifier(
    generations=2,
    population_size=4,
    cv=3,
    random_state=951,  # fixed seed so repeated runs are comparable
    verbosity=2  # the recorded run shows a progress bar and per-generation best CV score
)

In [None]:
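# Timing notes from earlier runs, written as "<a>/<b>/<c> <rows> -> runtime".
# NOTE(review): a/b/c are presumably the TPOT settings (generations,
# population_size and cv, order not recorded) and <rows> the number of
# training rows used -- TODO confirm.
# 2/2/3 1_000 -> 3.4s
# 2/2/3 10_000 -> 34.6s
# 4/2/3 1_000 -> 4.8s
# 4/10/3 1_000 -> 27.8s
# 6/12/5 1_000 -> 45.2s
# 1/1/2 all -> 1258  (unit not recorded; presumably seconds)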

In [61]:
# Number of leading training rows used for this search run.
N_TRAIN_ROWS = 10_000

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump when the system clock changes.
start_time = time.perf_counter()
pipeline_optimizer.fit(train_features[:N_TRAIN_ROWS], train_targets[:N_TRAIN_ROWS])
print(f'runtime: {time.perf_counter() - start_time}s')

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=12, style=ProgressStyle(descripti…

Generation 1 - Current best internal CV score: 0.9763001795080527
Generation 2 - Current best internal CV score: 0.9767002195120531

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.5, min_samples_leaf=20, min_samples_split=6, n_estimators=100)
runtime: 34.758265018463135s


In [124]:
# Predict on the test set with the fitted pipeline and write a submission file.
# NOTE(review): features() is called without extra arguments, so missing test
# values are filled with medians computed on the test data itself, not with
# the medians of the training data the model was fitted on.
test_features = features(transactions_test)
predictions = pipeline_optimizer.predict(test_features)
# Assumes the index of test_features holds the TransactionID values -- TODO confirm.
# NOTE(review): the output name says 'linear_model_1' although the predictions
# come from pipeline_optimizer (TPOT), not from a linear model.
write_predictions(test_features.index, predictions, 'linear_model_1')

# Earlier linear-regression experiment, left disabled:
# lm = linear_model.LinearRegression()
# lm.fit(train_features, train_targets)