In [1]:
import numpy as np
import pandas as pd
import glob 
import os
import datetime
pd.options.display.max_columns=50

In [2]:
# results = glob.glob('./results/query_indices/*')
# list1, list2 = zip(*sorted(zip([os.stat(result).st_size for result in results], results))) 
# df = pd.read_csv(list2[-15], skiprows = 1)

### Result of Hybrid Experiments

In [3]:
import csv

run_id_saver = [('hybrid', 1602420493.392)]
results = {}
num_weeks = 5 # ToDo: to be automatically determined

for strategy, run_id in run_id_saver:
    results[run_id] = {}
    for week in range(num_weeks):
        filename = f'results/query_indices/{run_id}/{run_id}-{strategy}-5.0-scratch-week-{week}.csv'
        with open(filename, "r") as f:
            reader = csv.reader(f, delimiter=",")
            expid = next(reader)[1]
            dataset = next(reader)[1]
            episode = next(reader)[1]
            start_day = next(reader)[1]
            end_day = next(reader)[1]
            
            start_day = datetime.date(int(start_day[:4]), int(start_day[5:7]), int(start_day[8:10])).strftime('%y-%m-%d')
            end_day = datetime.date(int(end_day[:4]), int(end_day[5:7]), int(end_day[8:10])).strftime('%y-%m-%d')
            
            if week == 0:
                if dataset == 'real-n':
                    df = pd.read_csv('data/ndata.csv')
                elif dataset == 'real-t':
                    df = pd.read_csv('data/tdata.csv')
                elif dataset == 'real-m':
                    df = pd.read_csv('data/mdata.csv')
                elif dataset == 'synthetic':
                    df = pd.read_csv('data/synthetic-imports-declarations.csv')
                
            alldata = df[(df['sgd.date'] < end_day) & (df['sgd.date'] >= start_day)].loc[:, ['illicit', 'revenue']]
            
            while True:
                try:
                    indices = next(reader)
                    samp = indices[0]
                    indices = indices[1:]
                    indices = list(map(int, indices))

                    if week == 0:
                        results[run_id][f'{samp}-pre'] = []
                        results[run_id][f'{samp}-rec'] = []
                        results[run_id][f'{samp}-rev'] = []

                    chosen = df.iloc[indices].loc[:, ['illicit', 'revenue']]
                    # Recall and revenue
                    if chosen.empty:
                        pre = rec = rev = 0
                    else:
                        pre = sum(chosen['illicit'])/chosen['illicit'].count()
                        rec = sum(chosen['illicit'])/sum(alldata['illicit'])
                        rev = sum(chosen['revenue'])/sum(alldata['revenue'])

                    print(f'Week {week}, subsampler {samp}: Precision {round(pre, 4)} Recall {round(rec, 4)}, Revenue {round(rev, 4)}')
                    results[run_id][f'{samp}-pre'].append(pre)
                    results[run_id][f'{samp}-rec'].append(rec)
                    results[run_id][f'{samp}-rev'].append(rev)
                
                except StopIteration:
                    break

Week 0, subsampler DATE: Precision 0.0506 Recall 0.0571, Revenue 0.0265
Week 0, subsampler badge: Precision 0.0286 Recall 0.0036, Revenue 0.0019
Week 1, subsampler DATE: Precision 0.0512 Recall 0.0588, Revenue 0.0758
Week 1, subsampler badge: Precision 0.087 Recall 0.0112, Revenue 0.0237
Week 2, subsampler DATE: Precision 0.1105 Recall 0.1238, Revenue 0.0889
Week 2, subsampler badge: Precision 0.1795 Recall 0.0222, Revenue 0.005
Week 3, subsampler DATE: Precision 0.0382 Recall 0.0379, Revenue 0.0014
Week 3, subsampler badge: Precision 0.0789 Recall 0.0087, Revenue 0.0001
Week 4, subsampler DATE: Precision 0.0226 Recall 0.0346, Revenue 0.0044
Week 4, subsampler badge: Precision 0.0455 Recall 0.0077, Revenue 0.0004


In [4]:
run_id = 1602420493.392
pd.DataFrame.from_dict(results[run_id])

Unnamed: 0,DATE-pre,DATE-rec,DATE-rev,badge-pre,badge-rec,badge-rev
0,0.050633,0.057143,0.026515,0.028571,0.003571,0.001932
1,0.05122,0.058824,0.075837,0.086957,0.011204,0.023658
2,0.110482,0.12381,0.088867,0.179487,0.022222,0.00498
3,0.038235,0.037901,0.001412,0.078947,0.008746,5.2e-05
4,0.022556,0.034615,0.004383,0.045455,0.007692,0.00042


### Result to find novel frauds

In [5]:
run_id_saver = [('hybrid', 1602420493.392)]

novelty = {}
num_weeks = 5 # ToDo: to be automatically determined
old_IID = set()

for strategy, run_id in run_id_saver:
    novelty[run_id] = {}
    for week in range(num_weeks):
        filename = f'results/query_indices/{run_id}/{run_id}-{strategy}-5.0-scratch-week-{week}.csv'
        with open(filename, "r") as f:
            reader = csv.reader(f, delimiter=",")
            expid = next(reader)[1]
            dataset = next(reader)[1]
            episode = next(reader)[1]
            start_day = next(reader)[1]
            end_day = next(reader)[1]
            
            start_day = datetime.date(int(start_day[:4]), int(start_day[5:7]), int(start_day[8:10])).strftime('%y-%m-%d')
            end_day = datetime.date(int(end_day[:4]), int(end_day[5:7]), int(end_day[8:10])).strftime('%y-%m-%d')
            
            if week == 0:
                if dataset == 'real-n':
                    df = pd.read_csv('data/ndata.csv')
                elif dataset == 'real-t':
                    df = pd.read_csv('data/tdata.csv')
                elif dataset == 'real-m':
                    df = pd.read_csv('data/mdata.csv')
                elif dataset == 'synthetic':
                    df = pd.read_csv('data/synthetic-imports-declarations.csv')
                
            alldata = df[(df['sgd.date'] < end_day) & (df['sgd.date'] >= start_day)].loc[:, ['illicit', 'revenue', 'importer.id']]
            alldata = alldata[~alldata['importer.id'].isin(old_IID)]
            
            if alldata.empty:
                continue
            
            while True:
                try:
                    indices = next(reader)
                    samp = indices[0]
                    indices = indices[1:]
                    indices = list(map(int, indices))

                    if week == 0:
                        novelty[run_id][f'{samp}-pre'] = []
                        novelty[run_id][f'{samp}-rec'] = []
                        novelty[run_id][f'{samp}-rev'] = []

                    chosen = df.iloc[indices]
                    chosen = chosen[~chosen['importer.id'].isin(old_IID)]
                    
                    # Recall and revenue
                    if chosen.empty:
                        pre = rec = rev = 0
                    else:
                        pre = sum(chosen['illicit'])/chosen['illicit'].count()
                        rec = sum(chosen['illicit'])/sum(alldata['illicit'])
                        rev = sum(chosen['revenue'])/sum(alldata['revenue'])

                    print(f'Week {week}, subsampler {samp}: Precision {round(pre, 4)} Recall {round(rec, 4)}, Revenue {round(rev, 4)}')
                    novelty[run_id][f'{samp}-pre'].append(pre)
                    novelty[run_id][f'{samp}-rec'].append(rec)
                    novelty[run_id][f'{samp}-rev'].append(rev)
                
                    old_IID = old_IID.union(set(alldata['importer.id'].values))
                    
                except StopIteration:
                    break


Week 0, subsampler DATE: Precision 0.0506 Recall 0.0571, Revenue 0.0265
Week 0, subsampler badge: Precision 0.0357 Recall 0.0036, Revenue 0.0019
Week 1, subsampler DATE: Precision 0.0449 Recall 0.0591, Revenue 0.0493
Week 1, subsampler badge: Precision 0.0857 Recall 0.0127, Revenue 0.0375
Week 2, subsampler DATE: Precision 0.1287 Recall 0.1444, Revenue 0.1356
Week 2, subsampler badge: Precision 0.2632 Recall 0.0278, Revenue 0.0109
Week 3, subsampler DATE: Precision 0.04 Recall 0.0536, Revenue 0.0
Week 3, subsampler badge: Precision 0.0476 Recall 0.006, Revenue 0.0
Week 4, subsampler DATE: Precision 0.0118 Recall 0.0309, Revenue 0.0162
Week 4, subsampler badge: Precision 0.037 Recall 0.0103, Revenue 0.0018


In [6]:
run_id = 1602420493.392
pd.DataFrame.from_dict(novelty[run_id])

Unnamed: 0,DATE-pre,DATE-rec,DATE-rev,badge-pre,badge-rec,badge-rev
0,0.050633,0.057143,0.026515,0.035714,0.003571,0.001932
1,0.044872,0.059072,0.049329,0.085714,0.012658,0.037461
2,0.128713,0.144444,0.135574,0.263158,0.027778,0.010897
3,0.04,0.053571,0.0,0.047619,0.005952,0.0
4,0.011765,0.030928,0.016174,0.037037,0.010309,0.001826
