In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('HI-Small_Trans.csv')

def mapValueToCategories(df, processed_df, columns):
    unique_values = pd.Series(df[columns].values.ravel('K')).unique()
    value_to_category_mapping = {value: category for category, value in enumerate(unique_values)}
    for column in columns:
        processed_df[column] = df[column].map(value_to_category_mapping)

In [2]:
processed_df = pd.DataFrame()
df['FromBankAcc'] = df.iloc[:,1].astype(str) + '_' + df.iloc[:,2]
df['ToBankAcc'] = df.iloc[:,3].astype(str) + '_' + df.iloc[:,4]
mapValueToCategories(df, processed_df, ['FromBankAcc', 'ToBankAcc'])
mapValueToCategories(df, processed_df, ['From Bank', 'To Bank'])
mapValueToCategories(df, processed_df, ['Receiving Currency'])
processed_df['xcur'] = df['Receiving Currency'] != df['Payment Currency']
mapValueToCategories(df, processed_df, ['Payment Format'])
processed_df['fee'] = (df['Amount Paid'] - df['Amount Received']).apply(lambda x: 0 if x == 0 else 1)
processed_df['TimestampCategory'] = pd.cut(pd.to_datetime(df['Timestamp']), bins=240, labels=False, right=False)
processed_df['Amount Received'] = df['Amount Received']
#processed_df['Amount Received'] = df['Amount Paid']
processed_df['Is Laundering'] = df['Is Laundering']

#print(processed_df)

In [10]:
from sklearn.utils import resample
from sklearn.preprocessing import quantile_transform

x, y = processed_df.iloc[:, :-1], processed_df.iloc[:, [-1]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

train_df = pd.concat([pd.DataFrame(x_train), pd.DataFrame(y_train)],axis=1)
class_counts = train_df['Is Laundering'].value_counts()
df_majority = train_df[train_df['Is Laundering'] == 0]
df_minority = train_df[train_df['Is Laundering'] == 1]
df_majority_downsampled = resample(df_majority, replace=False, n_samples=class_counts[1])
df_balanced = pd.concat([df_majority_downsampled, df_minority])
#print(df_balanced)
x_train, y_train = df_balanced.iloc[:, :-1], df_balanced.iloc[:, [-1]]

x_train_scaled = quantile_transform(x_train, n_quantiles=10, random_state=0, copy=True)


In [11]:
clf = LogisticRegression(random_state=0).fit(x_train_scaled, y_train.values.ravel())

In [12]:
from sklearn.metrics import confusion_matrix, f1_score, roc_curve, auc

x_test_scaled = quantile_transform(x_test, n_quantiles=10, random_state=0, copy=True)
for i in range(1,10):
    y_pred_new_thresh = (clf.predict_proba(x_test_scaled)[:,1] >= i / 10.0).astype(int)

    CM = confusion_matrix(y_test, y_pred_new_thresh)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    print('\nthreshold: ', i / 10.0)
    print('TN: ', TN, '\nFN: ', FN, '\nTP: ', TP, '\nFP: ', FP)
    print('F1 score: ', f1_score(y_test, y_pred_new_thresh))

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_new_thresh)
    oc_auc = auc(fpr, tpr)
    print('auc: ', oc_auc)



threshold:  0.1
TN:  73240 
FN:  0 
TP:  1087 
FP:  941342
F1 score:  0.002304147465437788
auc:  0.5360936819300953

threshold:  0.2
TN:  178357 
FN:  8 
TP:  1079 
FP:  836225
F1 score:  0.0025739780126456512
auc:  0.5842169362165864

threshold:  0.3
TN:  286826 
FN:  18 
TP:  1069 
FP:  727756
F1 score:  0.002929120222711779
auc:  0.63307213912342

threshold:  0.4
TN:  401097 
FN:  37 
TP:  1050 
FP:  613485
F1 score:  0.0034111841357196463
auc:  0.6806468132292881

threshold:  0.5
TN:  523336 
FN:  59 
TP:  1028 
FP:  491246
F1 score:  0.00416733385897953
auc:  0.7307682828062825

threshold:  0.6
TN:  644220 
FN:  95 
TP:  992 
FP:  370362
F1 score:  0.005327018239130493
auc:  0.7737822472884393

threshold:  0.7
TN:  760683 
FN:  167 
TP:  920 
FP:  253899
F1 score:  0.00719014012957883
auc:  0.7980581443814992

threshold:  0.8
TN:  873923 
FN:  362 
TP:  725 
FP:  140659
F1 score:  0.010177509809013763
auc:  0.7641679657410434

threshold:  0.9
TN:  973353 
FN:  903 
TP:  184 
FP: 