In [1]:
import os
import lightgbm as lgbm

from copy import deepcopy
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from lib.store import Store
from lib.naming_corrections import (
    FEATURE_COLUMNS_OTHERS,
    FEATURES_NAMES_FROM_NEW_CACHE,
    FEATURES_NAMES_FROM_PRELOADED_CACHE,
    TABLES_COLUMNS_DEFAULT_LEGACY,
    TABLES_V5_2_V4_RENAME_LEGACY,
)

FEATURE_COLUMNS = FEATURES_NAMES_FROM_PRELOADED_CACHE + FEATURE_COLUMNS_OTHERS
TABLES_V5_2_V4_RENAME = TABLES_V5_2_V4_RENAME_LEGACY
TABLES_COLUMNS_DEFAULT = TABLES_COLUMNS_DEFAULT_LEGACY

In [2]:
def fit_scaler(graph_data, cached_features_dir, datastore):
    '''Fit the scalar for each model'''
    print ("Fitting Scaler...")
    scaler = StandardScaler()
    for p in tqdm(graph_data):
        df_f = pd.read_parquet(
                    datastore.open_file(os.path.join(cached_features_dir, f"features_{p}.parquet"))
                )
            
        X = df_f[
            FEATURE_COLUMNS
        ].fillna(value=0.).values
        scaler.partial_fit(X)
    return scaler

def read_from_cache(features_partition_filepath, edges_partition_filepath, datastore):
    df_f = pd.read_parquet(
        datastore.open_file(features_partition_filepath)
    )
    df_e = pd.read_parquet(
        datastore.open_file(edges_partition_filepath)
    )
    return df_f, df_e

def augment_labels(y, rng, semi_supervised, semi_supervised_resample_negs=None, semi_supervised_resample_factor=None):
    '''Augment the labels'''
    if semi_supervised == False:
        _idx, = np.where(y == 2)
        y[_idx] = 0
    elif semi_supervised_resample_negs is None:
        # dont do anything
        pass
    elif (
        (semi_supervised_resample_negs == 'random')
        or
        (semi_supervised_resample_negs == 'candidates')
    ):
        if semi_supervised_resample_negs == 'candidates':
            raise NotImplementedError("neg-candidates not implemented for loader_v3")
        else:
            # randomize the 0 and 2 labels
            _idx, = np.where((y == 2) | (y==0))
            y[_idx] = 2 # unsup

        _n = max((y==1).sum(), 1) # at least 1

        for i in rng.choice(
            range(len(_idx)),
            size=min(
                len(_idx), 
                _n * semi_supervised_resample_factor
            ), 
            replace=False,
        ):
            y[_idx[i]] = 0 # neg class
    return y

def load_local_data_store(data_dir):   
    # build the store
    store = Store(
        base_dir=data_dir,
        protocol='file'
    )
    return store

In [3]:
# Define the directories
data_dir = "data/datasetMHRW"
labels_dir = "labels"
partitions_dir = "partitions.parquet"
cached_features_dir = "cache/features"
cached_edges_dir = "cache/edges"

# Load the datastore
datastore = load_local_data_store(data_dir)

# Read the partition
df_p = pd.read_parquet(
    datastore.open_file(partitions_dir)
).reset_index(drop=True).reset_index()

# Make the counters for selecting the splits
counters = {}
graph_data = {}
labelled = {}
for sp, A in df_p.groupby('split'):
    graph_data[sp] = {
        x: None
        for x in sorted(A['index'])
    }
    labelled[sp] = {
        x: None
        for x in sorted(A['index'])
    }
    counters[sp] = Counter()

# Fit the scalarA
scaler = fit_scaler(graph_data["train"],
                    cached_features_dir=cached_features_dir,
                    datastore=datastore)

Fitting Scaler...


  0%|          | 0/13357 [00:00<?, ?it/s]

In [4]:
print("Loading Cached Data for Training...")

# Define the splits
rng=np.random.default_rng(seed=1)
splits = ["train", "val", "test"]

# Make the splits for data collection
feature_dict = {"train" : [], "val": [], "test": []}
label_dict = {"train" : [], "val": [], "test": []}
for sp in splits:
    for p in tqdm(graph_data[sp]):
        # Load the files
        labels_partition_filepath = os.path.join(labels_dir, f"labels_{p}.parquet")
        features_partition_filepath = os.path.join(cached_features_dir, f"features_{p}.parquet")
        edges_partition_filepath = os.path.join(cached_edges_dir, f"edges_{p}.parquet")

        # Load the partition
        df_l = pd.read_parquet(
            datastore.open_file(labels_partition_filepath)
        )
             
        df_f, df_e = read_from_cache(features_partition_filepath,
                                     edges_partition_filepath,
                                     datastore)

        X = df_f[
            FEATURE_COLUMNS
        ].fillna(value=0.).values

        # Scale the values
        X = scaler.transform(X)

        # need to ensure ordering is same
        y = df_f[['txid']].merge(
            df_l[['txid', 'label']],
        )['label'].values

        # uses a negative sampling strategy
        y=augment_labels(
            y, 
            rng, 
            semi_supervised=True, 
            semi_supervised_resample_negs=None, 
            semi_supervised_resample_factor=None
        )

        # Change all labels and find indexes
        labelled[sp][p], = np.where(
            y != 2
        )

        # Update the counter
        counters[sp].update(y)

        # Build the ground Data
        feature_dict[sp].append(X)
        label_dict[sp].append(y)

Loading Cached Data for Training...


  0%|          | 0/13357 [00:00<?, ?it/s]

  0%|          | 0/1242 [00:00<?, ?it/s]

  0%|          | 0/1225 [00:00<?, ?it/s]

In [5]:
# Train
train_X = np.concatenate(feature_dict['train'])
train_y = np.concatenate(label_dict['train'])
print(f"Train shape : {train_X.shape}")

# Validation
val_X = np.concatenate(feature_dict['val'])
val_y = np.concatenate(label_dict['val'])
print(f"Val shape : {val_X.shape}")

# Test
test_X = np.concatenate(feature_dict['test'])
test_y = np.concatenate(label_dict['test'])
print(f"Test shape : {test_X.shape}")

# Create the dataset
train_dataset = lgbm.Dataset(train_X, train_y)
val_dataset = lgbm.Dataset(val_X, val_y)
test_dataset = lgbm.Dataset(test_X, test_y)

Train shape : (2192594, 51)
Val shape : (206537, 51)
Test shape : (206343, 51)


In [12]:
# Define the params
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_unbalance': True,
    'metric': 'auc',
    'is_training_metric': True,
    'learning_rate': 1e-3,
    'n_jobs': 5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1,
    'force_col_wise': 'true',
    'is_unbalance': 'true'
}

# Train the model
model = lgbm.train(params=params,
                   num_boost_round=1000,
                   train_set=train_dataset,
                   valid_sets=[train_dataset, val_dataset, test_dataset],
                   callbacks=[lgbm.early_stopping(stopping_rounds=20)])
print("Model Training Completed.....")

[LightGBM] [Info] Number of positive: 10438, number of negative: 2182156
[LightGBM] [Info] Total Bins 11726
[LightGBM] [Info] Number of data points in the train set: 2192594, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004761 -> initscore=-5.342616
[LightGBM] [Info] Start training from score -5.342616
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[858]	training's auc: 0.932478	valid_1's auc: 0.920817	valid_2's auc: 0.913897
Model Training Completed.....


In [17]:
y_pred = model.predict(test_X)
y_pred = (y_pred > 0.5) * 1

In [18]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(test_y, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])
print('\nTrue Positive Rate = ', cm[1,1] / cm[1].sum())
print('\nTrue Negative Rate = ', cm[0,0] / cm[0].sum())
print('\nbacc = ', ((cm[1,1] / cm[1].sum()) + (cm[0,0] / cm[0].sum()))/2)

Confusion matrix

 [[198915   6517]
 [   429    482]]

True Negatives(TN) =  198915

True Positives(TP) =  482

False Positives(FP) =  6517

False Negatives(FN) =  429

True Positive Rate =  0.5290889132821076

True Negative Rate =  0.9682766073445228

bacc =  0.7486827603133153


In [19]:
from sklearn.metrics import classification_report
print(classification_report(test_y, y_pred, target_names=['class 0', 'class 1']))

              precision    recall  f1-score   support

     class 0       1.00      0.97      0.98    205432
     class 1       0.07      0.53      0.12       911

    accuracy                           0.97    206343
   macro avg       0.53      0.75      0.55    206343
weighted avg       0.99      0.97      0.98    206343



In [20]:
from sklearn.metrics import f1_score
print(f"f1-score : {f1_score(test_y, y_pred)}")

f1-score : 0.12187104930467761
