Load data

In [1]:
from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_missing, eval_no_model, \
    get_y_labels, eval_with_model
from copy import deepcopy
from experiment.src.split import load_fixed_split
from experiment.src.features import prepare_data
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Concatenate
from tensorflow.keras.models import Model
from skopt.space import Integer, Categorical
import matplotlib.pyplot as plt
import gc
from datetime import datetime
from skopt.utils import use_named_args
from keras.callbacks import EarlyStopping 

from experiment.src.prepare_data import prepare_train_data
import numpy as np
import yaml
from pathlib import Path
import json 
from typing import Tuple, List


2024-02-22 09:24:58.716420: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 09:24:58.716621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 09:24:58.719146: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 09:24:58.730594: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def get_predictions_keras(model: "Model", data: List[np.ndarray],
                          threshold: float = 0.5) -> Tuple[np.ndarray, np.ndarray]:
    """Score ``data`` with ``model`` and derive binary decisions from the scores.

    Args:
        model: Object exposing ``predict(data)``. Its output is flattened to 1-D.
        data: Model inputs, passed to ``model.predict`` unchanged.
        threshold: Decision boundary. A score strictly greater than it counts
            as a positive prediction. Defaults to 0.5, the value that used to
            be hard-coded, so existing two-argument calls behave as before.

    Returns:
        ``(prediction, probability)``: a boolean array holding the decisions
        and the flattened array of raw scores they were derived from.
    """
    # np.asarray keeps the helper working when predict() returns a nested list.
    probability = np.asarray(model.predict(data)).ravel()
    prediction = probability > threshold

    return prediction, probability

In [3]:
def get_class_weights(y_data):
    """Return 'balanced' class weights for the labels in ``y_data``.

    The distinct labels are collected with ``np.unique`` (sorted order) and
    weighted with ``compute_class_weight('balanced', ...)``.

    Returns:
        dict keyed by the position (0, 1, ...) of each label in the sorted
        unique-label array, with that label's weight as the value. The keys
        equal the labels themselves only when the labels are 0..n-1.
    """
    labels = np.unique(y_data)
    weights = compute_class_weight('balanced', classes=labels, y=y_data)
    return {position: weight for position, weight in enumerate(weights)}

In [4]:
def _save_history_curve(history, metric, dst_path):
    """Plot the train and validation curves of one metric and write them to ``dst_path``."""
    # A dedicated figure is used so the caller's current pyplot figure is
    # left untouched, and it is always closed so repeated calls do not
    # accumulate open figures.
    fig, ax = plt.subplots()
    try:
        ax.plot(history.history[metric])
        ax.plot(history.history[f'val_{metric}'])
        ax.set_title(f'model {metric}')
        ax.set_ylabel(metric)
        ax.set_xlabel('epoch')
        ax.legend(['train', 'test'], loc='upper left')
        fig.savefig(dst_path)
    finally:
        plt.close(fig)


def save_history_acc_loss(history, dst_dir):
    """Save the accuracy and loss learning curves of a training run as PNG files.

    Args:
        history: Object whose ``history`` attribute is a mapping containing the
            per-epoch series 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        dst_dir: Existing directory (``str`` or ``Path``, with or without a
            trailing slash) that receives ``accuracy.png`` and ``loss.png``.

    Raises:
        KeyError: If one of the four series is missing from ``history.history``.
    """
    dst_dir = Path(dst_dir)
    for metric in ('accuracy', 'loss'):
        _save_history_curve(history, metric, dst_dir / f'{metric}.png')

In [5]:
def save_model(model, params, train_history, eval_res):
    """Persist one trained model together with its curves and run description.

    A directory ``results/ml_<F1>-<timestamp>`` is created (relative to the
    current working directory) and receives:
      * ``model``      - written by ``model.save``,
      * the plots produced by ``save_history_acc_loss``,
      * ``info.txt``   - ``params`` as JSON on the first line and ``eval_res``
        as JSON on the second (values JSON cannot encode are stringified).

    Args:
        model: Object exposing ``save(path)``.
        params: Hyper-parameters of the run.
        train_history: Training history forwarded to ``save_history_acc_loss``.
        eval_res: Evaluation results; must contain the key ``'F1'``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    f1_value = eval_res['F1']
    model_file_dir = f"results/ml_{f1_value:.6f}-{timestamp}"
    Path(model_file_dir).mkdir(parents=True, exist_ok=True)

    model.save(f"{model_file_dir}/model")
    save_history_acc_loss(train_history, f"{model_file_dir}/")

    with open(f"{model_file_dir}/info.txt", 'w') as info_file:
        info_file.write(json.dumps(params, default=str) + '\n')
        info_file.write(json.dumps(eval_res, default=str))

    print(f'Saving model {model_file_dir}')

In [6]:
# Show the GPU devices TensorFlow can see; an empty list means none is visible.
tf.config.list_physical_devices('GPU')

2024-02-22 09:25:08.457688: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:25:08.540434: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:25:08.540823: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [7]:
# Read the local settings file. The code below indexes the parsed document as
# a sequence whose first item maps 'cred_data_location' to the dataset root.
# NOTE(review): yaml.FullLoader can build more than plain data types. If
# env.yaml can ever come from an untrusted source, yaml.safe_load is the safer
# choice.
with open('env.yaml', 'r') as yaml_settings:
    args = yaml.load(yaml_settings, yaml.FullLoader)
cred_data_location = args[0]['cred_data_location']
print(cred_data_location)

/mnt/c/Users/yuliia.t/CredData


In [8]:
# Detections and metadata of the original (non-augmented) dataset.
detected_data = read_detected_data("wsl/result.json")
meta_data = read_metadata(f"{cred_data_location}/meta")

# Snapshot the originals before the augmented entries are merged in below;
# the test-subset cell builds its frames from these copies.
detected_data_copy = deepcopy(detected_data)
meta_data_copy = deepcopy(meta_data)

# Combine original and augmented data together (update() merges in place, so
# detected_data / meta_data hold original + augmented entries afterwards).
aug_detected_data = read_detected_data("wsl/result_aug_data.json", "aug_data/")
detected_data.update(aug_detected_data)
aug_metadata = read_metadata(f"{cred_data_location}/aug_data/meta", "aug_data/")
meta_data.update(aug_metadata)

Reading detections from wsl/result.json
Detected 19780 unique lines!
23805 detections in total
Reading meta from /mnt/c/Users/yuliia.t/CredData/meta
Loaded 64394 lines from meta!
64428 lines in meta in total
Reading detections from wsl/result_aug_data.json
Detected 144330 unique lines!
155787 detections in total
Reading meta from /mnt/c/Users/yuliia.t/CredData/aug_data/meta
Loaded 40331 lines from meta!
40331 lines in meta in total


In [9]:
# Build one frame from the merged (original + augmented) detections and
# metadata. Presumably join_label attaches the ground-truth label to each
# detection - confirm in experiment.src.data_loader.
df = join_label(detected_data, meta_data)

# Lists of repositories assigned to the train and test sides of the split.
train_repo_list, test_repo_list = load_fixed_split()

# Training rows are those whose "repo" belongs to the train list.
df_train = df[df["repo"].isin(train_repo_list)]

In [10]:
print(f"Train size: {len(df_train)}")

# Keep a single row per (line, ext) pair. df_train is rebound here, so
# re-running this cell alone prints the already de-duplicated size twice.
df_train = df_train.drop_duplicates(subset=["line", "ext"])
print(f"Train size after drop_duplicates: {len(df_train)}")

# Two model inputs (value encoding and extra features) plus the target labels.
X_train_value, X_train_features = prepare_data(df_train)
y_train = get_y_labels(df_train)

# The mean of the labels is the positive-class share, assuming 0/1 labels.
print(f"Class-1 prop on train: {np.mean(y_train):.2f}")

Train size: 16625
Train size after drop_duplicates: 11457
Class-1 prop on train: 0.22


In [11]:
# Display the balanced class weights computed from the training labels.
get_class_weights(y_train)

{0: 0.641633064516129, 1: 2.2651245551601424}

In [12]:
# Size of the last axis of each training input; train_model uses these to
# declare the shapes of the two Keras Input layers.
vocab_size = X_train_value.shape[-1]
feature_size = X_train_features.shape[-1]

Setup parameters

In [13]:
# Metrics handed to model.compile(). NOTE(review): the Precision/Recall objects
# are instantiated once here, so every model compiled with DEFAULT_METRICS
# shares the same metric instances.
DEFAULT_METRICS = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]

2024-02-22 09:27:09.626340: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:27:09.626597: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:27:09.626726: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:27:10.975102: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-22 09:27:10.975250: I external/local_xla/xla/stream_executor

In [14]:
# NOTE(review): in the cells shown in this notebook these four module-level
# values are never read; train_model takes lstm_units, dense_1_units,
# batch_size and epochs from the sampled `params` instead.
lstm_units = 128
dense_1_units = 128
batch_size = 64
epochs = 100
# Candidate activations for the hidden Dense layer of the model.
activation = ['softmax','relu', 'tanh', 'sigmoid']
# Hyperparameters: search dimensions for skopt. Each `name` is the keyword
# under which train_model receives the sampled value.
space = [
    # Model params
    Integer(1, 500, name="lstm_units"),
    Integer(1, 300, name="dense_1_units"),
    Categorical(activation, name="activation"),
    Categorical([0.1, 0.01, 0.001, 0.0001, 0.00001], name='learning_rate'),

    # Context params
    Integer(10, 80,  name="epochs"),
    Integer(1, 128, name="batch_size")]
    

In [15]:
# NOTE(review): neither name is referenced by any other cell shown in this
# notebook - presumably leftovers from an earlier version; confirm before
# removing.
best_loss = 1
f1 = 0

In [16]:
print("Validate results on the test subset")
# Build the frames from the snapshots taken before the augmented data was
# merged in. NOTE: this rebinds `df`, which earlier held the merged
# (original + augmented) frame.
df = join_label(detected_data_copy, meta_data_copy)
# Presumably the labelled lines that have no detection - confirm in
# experiment.src.data_loader.
df_missing = get_missing(detected_data_copy, meta_data_copy)
# Restrict to repositories on the test side of the split and drop rows whose
# "value" is missing.
df_test = df[df["repo"].isin(test_repo_list)]
df_test = df_test[df_test["value"].notna()]

df_missing_test = df_missing[df_missing["repo"].isin(test_repo_list)]
X_test_value, X_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)

print(f"Test size: {len(df_test)}")

Validate results on the test subset
Test size: 3155


In [17]:
# NOTE(review): currently unused - model.fit() below is called without callbacks.
early_stopping_monitor = EarlyStopping(patience=4)

@use_named_args(space)
def train_model(**params):
    """Objective of the hyper-parameter search: train, evaluate and save one model.

    The model has two inputs: the encoded line, fed through a bidirectional
    LSTM, and the extra feature vector. Both are concatenated, passed through
    one hidden Dense layer and a single sigmoid output unit.

    Args:
        **params: One sampled point of ``space`` with the keys ``lstm_units``,
            ``dense_1_units``, ``activation``, ``learning_rate``, ``epochs``
            and ``batch_size``.

    Returns:
        The NEGATIVE F1 score on the test subset. This function is handed to
        ``gp_minimize``, which searches for the minimum of the objective, so
        the score is negated to make the optimiser maximise F1. The
        directory name and ``info.txt`` written by ``save_model`` still carry
        the positive F1.
    """
    print(params)
    # Many models are built in one process; drop the global Keras state left
    # behind by earlier trials so memory does not keep growing.
    tf.keras.backend.clear_session()

    lstm_input = Input(shape=(None, vocab_size), name="line_input")
    lstm_branch = Bidirectional(LSTM(params['lstm_units']))(lstm_input)

    feature_input = Input(shape=(feature_size, ), name="feature_input")
    joined_features = Concatenate()([lstm_branch, feature_input])
    x = Dense(params['dense_1_units'], activation=params['activation'], name="Dense_1")(joined_features)
    x = Dense(1, activation='sigmoid', name="prediction")(x)

    model = Model([lstm_input, feature_input], x)

    adam_optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=DEFAULT_METRICS)
    # NOTE(review): the test subset doubles as validation data, and the F1
    # driving the search is computed on that same subset, so the reported
    # scores are optimistic. A split held out from the training repositories
    # would be cleaner.
    # Early stopping is not enabled here: fit() always runs the sampled number
    # of epochs.
    hist = model.fit(
        [X_train_value, X_train_features],
        y_train,
        validation_data=([X_test_value, X_test_features], y_test),
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        # Derived from the labels instead of hard-coded literals, so the
        # weights stay correct when the training data changes.
        class_weight=get_class_weights(y_train))

    print(f"Min training loss={min(hist.history['loss'])}, min validation loss={min(hist.history['val_loss'])}")
    test_predictions, _ = get_predictions_keras(model, [X_test_value, X_test_features])
    print("Results on test with model:")
    val_res = eval_with_model(df_test, df_missing_test, test_predictions)

    save_model(model, params, hist, val_res)
    del model
    gc.collect()
    # Negated: the optimiser minimises, a higher F1 is better.
    return -val_res['F1']

In [None]:
from skopt import gp_minimize
N_TRIALS = 40
SEED = 5       # 2020
# Re-create the `np.int` alias that recent NumPy releases no longer provide.
# Presumably the installed skopt still references it - confirm, and drop this
# shim once skopt is upgraded.
np.int = int

# gp_minimize looks for the point of `space` that MINIMISES the value returned
# by `func`, so the objective has to be "lower is better".
search_result = gp_minimize(func=train_model, dimensions=space, n_calls=N_TRIALS,random_state=SEED)

{'lstm_units': 29, 'dense_1_units': 250, 'activation': 'sigmoid', 'learning_rate': 0.1, 'epochs': 16, 'batch_size': 51}


2024-02-22 09:27:30.223144: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 1/16


2024-02-22 09:27:40.756415: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-02-22 09:27:41.807348: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f8740b2dd70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-22 09:27:41.807426: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce MX150, Compute Capability 6.1
2024-02-22 09:27:41.823168: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1708586861.990588    3425 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Min training loss=0.2303209900856018, min validation loss=0.2130480259656906
Results on test with model:
TP : 517, FP : 413, TN : 12573, FN : 222, FPR : 0.031803, FNR : 0.300406, PRC : 0.555914, RCL : 0.699594, F1 : 0.619533
INFO:tensorflow:Assets written to: results/ml_0.619533-20240222_093550/model/assets


2024-02-22 09:36:06,561 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.619533-20240222_093550/model/assets
2024-02-22 09:36:07,068 | DEBUG | pyplot.py:414 | Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-02-22 09:36:07,073 | DEBUG | pyplot.py:414 | Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-02-22 09:36:07,095 | DEBUG | font_manager.py:1411 | findfont: Matching sans\-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=10.0.
2024-02-22 09:36:07,100 | DEBUG | font_manager.py:1423 | findfont: score(FontEntry(fname='/home/ubuntu/.virtualenvs/CredSweeper2/lib/python3.10/site-packages/matplotlib/mpl-data/fonts/ttf/STIXSizOneSymReg.ttf', name='STIXSizeOneSym', style='normal', variant='normal', weight=400, stretch='normal', size='scalable')) = 10.05
2024-02-22 09:36:07,103 | DEBUG | font_manager.py:1423 | findfont: score(FontEntry(fname='/home/ubuntu/.virtualenvs/CredSweeper2/lib/python3.10/site-

Saving model results/ml_0.619533-20240222_093550
{'lstm_units': 178, 'dense_1_units': 147, 'activation': 'tanh', 'learning_rate': 0.01, 'epochs': 55, 'batch_size': 105}
Epoch 1/55
Epoch 2/55
Epoch 3/55
Epoch 4/55
Epoch 5/55
Epoch 6/55
Epoch 7/55
Epoch 8/55
Epoch 9/55
Epoch 10/55
Epoch 11/55
Epoch 12/55
Epoch 13/55
Epoch 14/55
Epoch 15/55
Epoch 16/55
Epoch 17/55
Epoch 18/55
Epoch 19/55
Epoch 20/55
Epoch 21/55
Epoch 22/55
Epoch 23/55
Epoch 24/55
Epoch 25/55
Epoch 26/55
Epoch 27/55
Epoch 28/55
Epoch 29/55
Epoch 30/55
Epoch 31/55
Epoch 32/55
Epoch 33/55
Epoch 34/55
Epoch 35/55
Epoch 36/55
Epoch 37/55
Epoch 38/55
Epoch 39/55
Epoch 40/55
Epoch 41/55
Epoch 42/55
Epoch 43/55
Epoch 44/55
Epoch 45/55
Epoch 46/55
Epoch 47/55
Epoch 48/55
Epoch 49/55
Epoch 50/55
Epoch 51/55
Epoch 52/55
Epoch 53/55
Epoch 54/55
Epoch 55/55
Min training loss=0.003188316011801362, min validation loss=0.09241655468940735
Results on test with model:
TP : 494, FP : 187, TN : 12799, FN : 245, FPR : 0.014400, FNR : 0.331529

2024-02-22 09:56:05,615 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.695775-20240222_095548/model/assets


Saving model results/ml_0.695775-20240222_095548
{'lstm_units': 122, 'dense_1_units': 230, 'activation': 'relu', 'learning_rate': 0.0001, 'epochs': 18, 'batch_size': 112}
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18
Min training loss=0.1196984276175499, min validation loss=0.1086200401186943
Results on test with model:
TP : 516, FP : 82, TN : 12904, FN : 223, FPR : 0.006314, FNR : 0.301759, PRC : 0.862876, RCL : 0.698241, F1 : 0.771877
INFO:tensorflow:Assets written to: results/ml_0.771877-20240222_100223/model/assets


2024-02-22 10:02:42,640 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.771877-20240222_100223/model/assets


Saving model results/ml_0.771877-20240222_100223
{'lstm_units': 262, 'dense_1_units': 148, 'activation': 'softmax', 'learning_rate': 1e-05, 'epochs': 17, 'batch_size': 106}
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Min training loss=0.6531913876533508, min validation loss=0.6202701926231384
Results on test with model:
TP : 232, FP : 83, TN : 12903, FN : 507, FPR : 0.006391, FNR : 0.686062, PRC : 0.736508, RCL : 0.313938, F1 : 0.440228
INFO:tensorflow:Assets written to: results/ml_0.440228-20240222_104009/model/assets


2024-02-22 10:40:17,584 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.440228-20240222_104009/model/assets


Saving model results/ml_0.440228-20240222_104009
{'lstm_units': 417, 'dense_1_units': 268, 'activation': 'tanh', 'learning_rate': 0.001, 'epochs': 16, 'batch_size': 127}
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
Min training loss=0.06726254522800446, min validation loss=0.09763825684785843
Results on test with model:
TP : 501, FP : 82, TN : 12904, FN : 238, FPR : 0.006314, FNR : 0.322057, PRC : 0.859348, RCL : 0.677943, F1 : 0.757943
INFO:tensorflow:Assets written to: results/ml_0.757943-20240222_115617/model/assets


2024-02-22 11:56:25,428 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.757943-20240222_115617/model/assets


Saving model results/ml_0.757943-20240222_115617
{'lstm_units': 239, 'dense_1_units': 206, 'activation': 'tanh', 'learning_rate': 0.001, 'epochs': 49, 'batch_size': 48}
Epoch 1/49
Epoch 2/49
Epoch 3/49
Epoch 4/49
Epoch 5/49
Epoch 6/49
Epoch 7/49
Epoch 8/49
Epoch 9/49
Epoch 10/49
Epoch 11/49
Epoch 12/49
Epoch 13/49
Epoch 14/49
Epoch 15/49
Epoch 16/49
Epoch 17/49
Epoch 18/49
Epoch 19/49
Epoch 20/49
Epoch 21/49
Epoch 22/49
Epoch 23/49
Epoch 24/49
Epoch 25/49
Epoch 26/49
Epoch 27/49
Epoch 28/49
Epoch 29/49
Epoch 30/49
Epoch 31/49
Epoch 32/49
Epoch 33/49
Epoch 34/49
Epoch 35/49
Epoch 36/49
Epoch 37/49
Epoch 38/49
Epoch 39/49
Epoch 40/49
Epoch 41/49
Epoch 42/49
Epoch 43/49
Epoch 44/49
Epoch 45/49
Epoch 46/49
Epoch 47/49
Epoch 48/49
Epoch 49/49
Min training loss=0.00335323135368526, min validation loss=0.09812024980783463
Results on test with model:
TP : 525, FP : 70, TN : 12916, FN : 214, FPR : 0.005390, FNR : 0.289581, PRC : 0.882353, RCL : 0.710419, F1 : 0.787106
INFO:tensorflow:Assets wri

2024-02-22 14:12:13,951 | INFO | builder_impl.py:801 | Assets written to: results/ml_0.787106-20240222_141206/model/assets


Saving model results/ml_0.787106-20240222_141206
{'lstm_units': 346, 'dense_1_units': 245, 'activation': 'tanh', 'learning_rate': 1e-05, 'epochs': 60, 'batch_size': 86}
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
 30/134 [=====>........................] - ETA: 3:08 - loss: 0.1698 - accuracy: 0.9473 - precision: 0.8368 - recall: 0.9412

In [None]:
# Run a garbage-collection pass to release objects left over from the search.
gc.collect()