In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [3]:
# Load the transaction table. Later cells use its row index as the transaction
# id and read the `Amount` and `Class` columns (Class == 1 is treated as fraud).
df_cc = pd.read_csv("data/creditcard.csv")
df_cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Load the relation list. It is read with header=None, so the column names are
# supplied explicitly here.
df_rel = pd.read_csv(
    "data/relations.csv",
    header=None,
    names=["from_id", "distance", "inverse", "priority", "to_id", "type"],
)
df_rel.head()

Unnamed: 0,from_id,distance,inverse,priority,to_id,type
0,0,0.0,0.0,0,0,0
1,0,1.57631,0.634393,1,107821,SIMILAR_TO
2,0,2.148596,0.46542,2,41901,SIMILAR_TO
3,0,2.814823,0.355262,3,22656,SIMILAR_TO
4,0,2.918754,0.342612,4,96012,SIMILAR_TO


In [5]:
# Lookup tables from transaction id (the row index of df_cc) to its class,
# keyed once as the source and once as the target of a relation.
df_from = pd.DataFrame({"from_id": list(df_cc.index), "from_class": df_cc["Class"]})
df_to = pd.DataFrame({"to_id": list(df_cc.index), "to_class": df_cc["Class"]})

# Left-join both classes onto the relations (the join key is the shared id
# column), then drop relations that point from a transaction to itself.
df_rel = (
    df_rel
    .merge(df_from, how="left")
    .merge(df_to, how="left")
    .loc[lambda rel: rel["from_id"] != rel["to_id"]]
)
df_rel.head()

Unnamed: 0,from_id,distance,inverse,priority,to_id,type,from_class,to_class
1,0,1.57631,0.634393,1,107821,SIMILAR_TO,0,0
2,0,2.148596,0.46542,2,41901,SIMILAR_TO,0,0
3,0,2.814823,0.355262,3,22656,SIMILAR_TO,0,0
4,0,2.918754,0.342612,4,96012,SIMILAR_TO,0,0
5,0,2.937022,0.340481,5,66118,SIMILAR_TO,0,0


In [6]:
def fractionise(d, k, df, ids=None):
    """Return, per transaction, the share of its related transactions in class 1.

    Parameters
    ----------
    d : float or None
        Keep only relations with ``distance <= d``. Ignored when ``k`` is given.
    k : int or None
        Keep only relations with ``priority <= k``. Takes precedence over ``d``.
    df : pandas.DataFrame
        Relations with the columns ``from_id``, ``from_class`` and ``to_class``
        plus ``priority`` / ``distance`` when the matching filter is used.
    ids : list-like, optional
        Transaction ids to report on, in output order. When omitted, the index
        of the notebook-level ``df_cc`` frame is used.

    Returns
    -------
    pandas.Series
        Named ``fraction``, with a positional 0..n-1 index and one value per
        entry of ``ids``. Ids that have no relation left after filtering get 0.
    """
    if k is not None:
        df = df[df['priority'] <= k]
    elif d is not None:
        df = df[df['distance'] <= d]

    if ids is None:
        ids = df_cc.index

    # Number of relations per source transaction and target class.
    sizes = df.groupby(['from_id', 'from_class', 'to_class']).size()
    if sizes.empty:
        # Nothing survived the filter: every id has a fraction of 0.
        return pd.Series(0.0, index=pd.RangeIndex(len(ids)), name='fraction')

    # Pivot the target class into columns. The reindex guarantees that both
    # class columns exist even when the filtered relations only ever point at
    # one class; without it the division below raises a KeyError.
    counts = sizes.unstack(fill_value=0).reindex(columns=[0, 1], fill_value=0)
    counts['fraction'] = counts[1] / (counts[0] + counts[1])
    counts = counts.reset_index()

    # Bring the result into the order of `ids`; ids without relations are
    # missing after the left join and are reported as 0.
    df_index = pd.DataFrame({'from_id': ids})
    merged = df_index.merge(counts, how='left', on='from_id')

    return merged['fraction'].fillna(0)

In [7]:
def combine_metrics(d, k, df_relations, df_credit, topology=False):
    """Build one row of neighbourhood metrics per transaction.

    Parameters
    ----------
    d : float or None
        Keep only relations with ``distance <= d``. Ignored when ``k`` is given.
    k : int or None
        Keep only relations with ``priority <= k``. Takes precedence over ``d``.
    df_relations : pandas.DataFrame
        Relations with at least ``from_id`` and ``distance`` (and ``priority``
        when ``k`` is used), plus the class columns ``fractionise`` needs.
    df_credit : pandas.DataFrame
        Transactions; its index is used as the transaction id and its
        ``Class`` and ``Amount`` columns are copied into the result.
    topology : bool, default False
        When true, also load the ``lcc``, ``pr`` and ``ev`` columns from the
        ``topology/*_{k<k>|d<d>}.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``class``, ``mean``, ``std``, ``min``, ``max``
        (distance statistics per ``from_id``), ``frac``, ``amount`` and, if
        requested, the topology columns. Missing values are replaced by 0.

    Raises
    ------
    ValueError
        If ``topology`` is requested while both ``k`` and ``d`` are None, as
        the topology file names are derived from the active filter.
    """
    if k is not None:
        df_relations = df_relations[df_relations["priority"] <= k]
        suffix = f"k{k}"
    elif d is not None:
        df_relations = df_relations[df_relations["distance"] <= d]
        suffix = f"d{d}"
    else:
        suffix = None

    df = pd.DataFrame({"id": list(df_credit.index), "class": df_credit.Class})

    # A single pass over the groups yields all four distance statistics; the
    # column assignments align on the frame's index (the transaction id).
    dist_stats = df_relations.groupby("from_id")["distance"].agg(["mean", "std", "min", "max"])
    for stat in ("mean", "std", "min", "max"):
        df[stat] = dist_stats[stat]

    df["frac"] = fractionise(d=d, k=k, df=df_relations)

    # Read the amount from the frame that was passed in, not from a global.
    df["amount"] = df_credit["Amount"]

    if topology:
        if suffix is None:
            raise ValueError("topology=True requires either k or d to select the topology files")
        df["lcc"] = pd.read_csv(f"topology/lcc_{suffix}.csv")["lcc"]
        df["pr"] = pd.read_csv(f"topology/pr_{suffix}.csv")["PageRank"]
        df["ev"] = pd.read_csv(f"topology/ev_{suffix}.csv")["eigenvector"]
        #df["dw"] = pd.read_csv(f"topology/dw_{suffix}.csv")["degree"]

    # Transactions without any remaining relation (and the std of a single
    # relation) come out as NaN; report them as 0.
    df = df.fillna(0)

    return df

In [8]:
# Neighbourhood sizes to evaluate: 2, 4, ..., 256.
top_k = [2 ** power for power in range(1, 9)]

for k in top_k:
    # Raw metrics for this k, extended with log-transformed std and amount.
    df_met = combine_metrics(
        d=None, k=k, df_relations=df_rel, df_credit=df_cc, topology=False
    ).assign(
        log_std=lambda m: np.log(m["std"]),
        log_amount=lambda m: np.log(m["amount"] + 1),
    )
    # log(0) evaluates to -inf; store 0 for those entries instead.
    df_met[np.isneginf(df_met)] = 0

    # Standardise every column after `id` and `class`, then write one file per k.
    scaled_values = StandardScaler().fit_transform(df_met.iloc[:, 2:])
    df_met.iloc[:, 2:] = scaled_values
    df_met.to_csv(f"results/metrics/metrics_scale_k{k}.csv", index=False)

In [8]:
# Preview whatever `df_met` currently holds. When the notebook is run top to
# bottom, that is the scaled frame from the last iteration (k = 256) of the
# export loop above.
df_met.head()

Unnamed: 0,id,class,mean,std,min,max,frac,amount,log_std,log_amount
0,0,0,-0.062856,-0.107007,-0.014203,-0.073559,-0.045225,0.244964,-0.521719,1.124303
1,1,0,-0.114042,-0.12623,-0.061865,-0.120827,-0.045225,-0.342475,-1.013388,-1.114639
2,2,0,0.084726,0.039915,0.078157,0.076624,-0.045225,1.160686,0.95777,1.682368
3,3,0,-0.015963,-0.048163,-0.032288,-0.024166,-0.045225,0.140534,0.306786,1.009339
4,4,0,-0.03543,-0.094164,0.039437,-0.048029,-0.045225,-0.073403,-0.282381,0.670241


In [9]:
# Neighbourhood sizes to test: 2, 4, ..., 256.
top_k = [2 ** power for power in range(1, 9)]

# Result grids for the t-tests: one row per k and one initially empty (NaN)
# column per metric. df_t receives the t statistics, df_p the p-values.
df_t = pd.DataFrame({"k": top_k}).reindex(columns=["k", "mean", "std", "min", "max", "frac"])
df_p = df_t.copy()

In [10]:
for k in top_k:
    # Pass `topology` explicitly: the distance/fraction metrics tested here do
    # not need the topology columns, and omitting the argument makes the call
    # fail when combine_metrics requires it.
    df_met = combine_metrics(d=None, k=k, df_relations=df_rel, df_credit=df_cc, topology=False)
    df_met = df_met.drop(["id"], axis=1)

    df_met[df_met == np.inf] = 0

    # Split by class once per k instead of once per tested column.
    legit = df_met[df_met["class"] == 0]
    fraud = df_met[df_met["class"] == 1]
    row = df_t["k"] == k

    for col in df_t.columns.drop("k"):
        # Welch's t-test (unequal variances); one-sided with the alternative
        # that the class-0 mean is smaller than the class-1 mean.
        t, p = ttest_ind(legit[col], fraud[col], equal_var=False, alternative="less")

        df_t.loc[row, col] = t
        df_p.loc[row, col] = p

In [11]:
# Persist the t statistics and p-values collected in df_t / df_p (one row per k)
# without writing the row index.
df_t.to_csv("results/significance/t-value_distance_fraction_less.csv", index=False)
df_p.to_csv("results/significance/p-value_distance_fraction_less.csv", index=False)

In [12]:
# Neighbourhood sizes to test: 2, 4, ..., 256.
top_k = [2 ** power for power in range(1, 9)]

# Fresh result grids for the topology metrics: one row per k and one initially
# empty (NaN) column per metric. df_t receives t statistics, df_p the p-values.
df_t = pd.DataFrame({"k": top_k}).reindex(columns=["k", "lcc", "pr", "ev"])
df_p = df_t.copy()

In [13]:
for k in top_k:
    # The lcc / pr / ev columns tested below are only added by combine_metrics
    # when it is asked to load the topology files, so request them explicitly.
    df_met = combine_metrics(d=None, k=k, df_relations=df_rel, df_credit=df_cc, topology=True)
    df_met = df_met.drop(["id"], axis=1)

    df_met[df_met == np.inf] = 0

    # Split by class once per k instead of once per tested column.
    legit = df_met[df_met["class"] == 0]
    fraud = df_met[df_met["class"] == 1]
    row = df_t["k"] == k

    for col in df_t.columns.drop("k"):
        # Welch's t-test (unequal variances), two-sided.
        t, p = ttest_ind(legit[col], fraud[col], equal_var=False, alternative="two-sided")

        df_t.loc[row, col] = t
        df_p.loc[row, col] = p

In [14]:
# Persist the topology t statistics and p-values (one row per k) without the
# row index.
# NOTE(review): the file names end in "_less", but the preceding cell calls
# ttest_ind with alternative="two-sided" - confirm which of the two is intended.
df_t.to_csv("results/significance/t-value_topology_less.csv", index=False)
df_p.to_csv("results/significance/p-value_topology_less.csv", index=False)

In [15]:
# Unscaled metrics for k = 256 with the two log-transformed features added.
# Nothing is clipped here, so a std of 0 shows up as -inf in log_std.
df_met = combine_metrics(
    d=None, k=256, df_relations=df_rel, df_credit=df_cc, topology=False
).assign(
    log_std=lambda m: np.log(m["std"]),
    log_amount=lambda m: np.log(m["amount"] + 1),
)

df_met.head()

Unnamed: 0,id,class,mean,std,min,max,frac,amount,log_std,log_amount
0,0,0,4.715419,0.758605,1.57631,5.665165,0.0,149.62,-0.276274,5.01476
1,1,0,0.667401,0.49152,2.315799e-07,1.336406,0.0,2.69,-0.710253,1.305626
2,2,0,16.386885,2.79999,4.630892,19.418667,0.0,378.66,1.029616,5.939276
3,3,0,8.423946,1.576199,0.9781743,10.18851,0.0,123.5,0.455016,4.824306
4,4,0,6.884413,0.937049,3.350314,8.003181,0.0,69.99,-0.065019,4.262539


In [16]:
# neighbourhood sizes k (used as the `priority <= k` cut-off in combine_metrics)
top_k = [2**exp for exp in range(1, 9)]
# polynomial degrees of the feature expansion
polynomial = range(1, 3)
# repetitions per (k, degree) combination
iteration = range(1, 51)
# outcome variables; empty placeholders until the modelling loop fills them in
recall = [None]
precision = [None]
AUC = [None]

# One row per (k, p, i) combination, with the first factor varying slowest.
# Built with the public MultiIndex.from_product rather than the private
# pandas.core.reshape.util.cartesian_product helper, which is not part of the
# supported pandas API.
bm = (
    pd.MultiIndex.from_product([top_k, polynomial, iteration], names=["k", "p", "i"])
    .to_frame(index=False)
    .assign(recall=recall[0], precision=precision[0], AUC=AUC[0])
)

In [17]:
# Feature sets to benchmark; each one produces its own results file.
variable = [["frac"], ["frac", "std"], ["frac", "std", "amount"], ["frac", "log_std"], ["frac", "log_std", "log_amount"]]

for v in variable:
    var_name = '_'.join(v)

    for k in top_k:
        df_met = combine_metrics(d=None, k=k, df_relations=df_rel, df_credit=df_cc, topology=False)

        df_met["log_std"] = np.log(df_met["std"])
        df_met["log_amount"] = np.log(df_met["amount"] + 1)

        # The class split does not change between repetitions; only the
        # legitimate sample is redrawn.
        fraud_train = df_met[df_met["class"] == 1]
        legit_pool = df_met[df_met["class"] == 0]

        for i in iteration:

            # Balanced subset: as many legitimate rows as there are fraud rows.
            # The draw is unseeded, so results vary from run to run.
            legit_train = legit_pool.sample(len(fraud_train))
            subset = pd.concat([fraud_train, legit_train])

            # log(0) evaluates to -inf; use 0 for those entries. `.replace`
            # returns a new frame, so `subset` is never written through a slice.
            X = subset.loc[:, v].replace(-np.inf, 0)
            y = subset['class']

            scaler = StandardScaler().fit(X)
            X = scaler.transform(X)

            for p in polynomial:
                poly = PolynomialFeatures(degree=p, interaction_only=False, include_bias=False)
                # Always expand the scaled base features. Overwriting X here
                # would hand the next degree an already-expanded matrix.
                X_poly = poly.fit_transform(X)

                loocv = LeaveOneOut()
                # NOTE(review): penalty="none" is deprecated from scikit-learn
                # 1.2 in favour of penalty=None - check the installed version.
                model = LogisticRegression(penalty="none", solver="lbfgs", max_iter=10000)

                # One leave-one-out pass yields the probabilities for the AUC.
                # Thresholding them at 0.5 gives the same class labels as the
                # model's own predict (decision score > 0), so the second,
                # identical cross-validation run is not needed.
                y_proba = cross_val_predict(model, X_poly, y, cv=loocv, method='predict_proba')[:, 1]
                y_pred = (y_proba > 0.5).astype(int)

                cell = (bm["k"] == k) & (bm["p"] == p) & (bm["i"] == i)
                bm.loc[cell, "recall"] = recall_score(y, y_pred)
                bm.loc[cell, "precision"] = precision_score(y, y_pred)
                bm.loc[cell, "AUC"] = roc_auc_score(y, y_proba)

    bm.to_csv(f"results/bm_log2_model_{var_name}.csv", index=False)

In [19]:
# Fit the degree-2 model on frac / log_std / log_amount and collect
# leave-one-out fraud probabilities.
# This cell reuses the `df_met` left behind by an earlier cell and needs its
# `log_std` and `log_amount` columns; when the notebook is run top to bottom
# that is the k = 256 frame from the benchmark loop.
fraud_train = df_met[df_met["class"] == 1]
# Balanced subset: as many legitimate rows as there are fraud rows (unseeded draw).
legit_train = df_met[df_met["class"] == 0].sample(len(fraud_train))
subset = pd.concat([fraud_train, legit_train])

# log(0) evaluates to -inf; use 0 for those entries. `.replace` returns a new
# frame, so `subset` keeps its raw values for the output table.
X = subset.loc[:, ["frac", "log_std", "log_amount"]].replace(-np.inf, 0)
y = subset['class']

scaler = StandardScaler().fit(X)
X = scaler.transform(X)

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X = poly.fit_transform(X)

loocv = LeaveOneOut()
model = LogisticRegression(penalty="none", solver="lbfgs", max_iter=10000)

# Column 1 of predict_proba is the probability of class 1.
y_pred = cross_val_predict(model, X, y, cv=loocv, method='predict_proba')[:,1]

In [20]:
# Put the predicted probabilities next to the rows they belong to. Both frames
# are numbered 0..n-1 first, so they line up by position.
predictions = pd.DataFrame({"prediction": y_pred})
output = subset.reset_index(drop=True).join(predictions)
output

Unnamed: 0,id,class,mean,std,min,max,frac,amount,log_std,log_amount,prediction
0,541,1,61.946367,18.865323,18.971895,77.619972,0.378906,0.00,2.937325,0.000000,1.000000
1,623,1,23.635497,2.849756,10.556773,26.878016,0.000000,529.00,1.047233,6.272877,0.267709
2,4920,1,82.593451,15.933916,33.111908,95.224800,0.281250,239.93,2.768450,5.484506,1.000000
3,6108,1,191.980662,69.300368,28.421581,307.939301,0.792969,59.00,4.238450,4.094345,0.999977
4,6329,1,74.039379,20.798343,8.838511,92.735092,0.078125,1.00,3.034873,0.693147,0.999975
...,...,...,...,...,...,...,...,...,...,...,...
979,43383,0,2.643348,0.960356,0.023786,4.147828,0.000000,84.00,-0.040451,4.442651,0.072671
980,142905,0,12.287047,1.774945,4.351657,14.153531,0.000000,0.76,0.573769,0.565314,0.263772
981,111181,0,2.617977,0.966008,0.009770,3.758038,0.000000,0.01,-0.034584,0.009950,0.233156
982,85931,0,4.699490,1.259506,1.332461,6.198644,0.000000,9.48,0.230720,2.349469,0.058912


In [21]:
# Decision thresholds 0.01, 0.02, ..., 1.00.
thresholds = [step / 100 for step in range(1, 101)]
# outcome placeholders, filled in by the threshold sweep
theft = [None]
FN = [None]

# One row per threshold. Note that this rebinds `bm`, replacing the benchmark
# grid used earlier in the notebook. Built directly instead of going through
# the private pandas.core.reshape.util.cartesian_product helper.
bm = pd.DataFrame({"t": thresholds}).assign(theft=theft[0], false_negative=FN[0])

In [22]:
for t in thresholds:

    # Label every row at the current cut-off: 1 at or above t, 0 below it.
    output.loc[output["prediction"] >= t, "class_predict"] = int(1)
    output.loc[output["prediction"] <  t, "class_predict"] = int(0)

    # Class-1 rows that were labelled 0, and class-0 rows that were labelled 1.
    missed_fraud = (output["class"] == 1) & (output["class_predict"] == 0)
    false_alarm = (output["class"] == 0) & (output["class_predict"] == 1)

    # Amount attached to each missed class-1 row, 0 everywhere else.
    output["loss_amount"] = missed_fraud.astype("int64") * output["amount"]

    # NOTE: despite its name, this column flags class-0 rows predicted as
    # class 1, i.e. false positives. The name is kept because it is also the
    # column name in `bm` and in the exported CSV.
    output["false_negative"] = false_alarm.astype("int64")

    this_threshold = bm["t"] == t
    bm.loc[this_threshold, "theft"] = output["loss_amount"].sum()
    bm.loc[this_threshold, "false_negative"] = output["false_negative"].sum()

In [24]:
# Show the per-threshold table: `theft` is the summed amount of class-1 rows
# labelled 0, `false_negative` the number of class-0 rows labelled 1.
bm

Unnamed: 0,t,theft,false_negative
0,0.01,0.0,483
1,0.02,0.0,471
2,0.03,0.0,456
3,0.04,4.49,428
4,0.05,77.16,370
...,...,...,...
95,0.96,14892.48,2
96,0.97,15180.6,2
97,0.98,15378.86,2
98,0.99,15411.17,1


In [25]:
# Persist the per-threshold table without the row index. The path contains no
# placeholders, so a plain string literal is used instead of an f-string.
bm.to_csv("results/bm_theft_safe.csv", index=False)