In [1]:
print('Downloading data locally')
#![ -f "accounts.csv" ] && echo "Already downloaded" || curl https://raw.githubusercontent.com/PaulMercerAI/AML19/master/data/20191111_Accounts.csv.xz -L | xzcat > accounts.csv
![ -f "transactions.csv" ] && echo "Already downloaded" || curl https://raw.githubusercontent.com/PaulMercerAI/AML19/master/data/20191111_Transactions.csv.xz -L | xzcat > transactions.csv

Downloading data locally
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5022k  100 5022k    0     0   692k      0  0:00:07  0:00:07 --:--:-- 1343k


In [0]:
import pandas as pd #Functions to manage multidimensional structured data sets
import numpy as np  #Fast numeric functions

In [0]:
tx = pd.read_csv("transactions.csv")

In [0]:
#Get just the sender accounts and amounts where we don't know if the transactions are suspicious
txUnknown = tx[tx.is_suspx!='Unknown'][['sender_account', 'amount']]

In [5]:
# Create a function using https://docs.scipy.org/doc/numpy/reference/generated/numpy.std.html and np.mean
# to get a normalised standard deviation for an array of transactions
def nsd(a):
  return np.std(a) / np.mean(a) #normalisedStandardDeviation of amount

txAmountOutliers = txUnknown.groupby(txUnknown.sender_account).aggregate(nsd)
txAmountOutliers.columns = ['normalisedStandardDeviation']
txAmountOutliers.head(2)

Unnamed: 0_level_0,normalisedStandardDeviation
sender_account,Unnamed: 1_level_1
000cd21a-65a7-4a30-b5b7-0dd9728f5897,0.092881
001e7006-29d3-44d6-9a59-6f63141ec659,0.20967


In [0]:
#Get the transaction sender_account, id and amount with the sender_account being the index
#to make it quick and easy to join these with the outliers
txSummary = tx[['sender_account', 'id', 'amount']].set_index('sender_account')

In [0]:
#Model assumes the riskiest 1000 are the ones with the highest normalised standard deviation amount, then amount
txRiskiest = txSummary.join(txAmountOutliers).nlargest(1000, ['normalisedStandardDeviation', 'amount'])

In [0]:
#Model assumes there is definitely an outlier at the top that is money laundering, and after the top 1000 nothing is money laundering
#and the probability scale is linear in between
txRiskiest['P'] = np.arange(1.0,0.0, -0.001)

#but we don't know what the behaviour is of the laundering, so make a guess
txRiskiest['alert_type'] = "A1"

#remove the fields we are not submitting
txRiskiest=txRiskiest[['id', 'P', 'alert_type']].set_index('id')

In [0]:
#Save to file
txRiskiest.to_csv('result.tsv', sep='\t')

In [10]:
#Sanity check
!head result.tsv

id	P	alert_type
474140	1.0	A1
408881	0.999	A1
343413	0.998	A1
278611	0.997	A1
83017	0.996	A1
148219	0.995	A1
213217	0.994	A1
17218	0.993	A1
245367	0.992	A1


Download the ipynb notebook from the File menu and submit as the solution along with result.tsv at https://aml19.herokuapp.com/