In [16]:
import pandas as pd
import csv
import os
import sys
import csv
import time
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sys.path.insert(0, f"{os.path.dirname(os.getcwd())}/src")
from utils import evaluate_result
from data_imbalance_src.smote_oversampling import RandomOversampling, ADASYNOversampling, BorderlineSMOTEOversampling, SMOTEOversampling, SVMSMOTEOversampling
from data_imbalance_src.smote_oversampling import SMOTUNEDOversampling
from data_imbalance_src.dazzle import DAZZLEOversampling
from data_imbalance_src.Imbalance_Farou2022.data_generation import GANOversampling
from data_imbalance_src.random_projection import RandomProjectionOversampling

# JavaScript_Vulnerability

In [3]:
# Start a fresh results file that contains only the header row.
# The run cells further down re-open the same path in append mode.
write_path = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
# The result/ directory may be absent on a fresh checkout; open(..., "w") would raise without it.
os.makedirs(os.path.dirname(write_path), exist_ok=True)
with open(write_path, "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["oversampling_scheme", "runtime", "learner", "acc", "prec", "recall", "fpr", "f1", "auc", "g_score", "d2h"])

In [30]:
# Dataset directory, resolved relative to the parent of the current working directory.
data_path = f"{os.path.dirname(os.getcwd())}/data/JavaScript_Vulnerability/"
# Match the real ".csv" extension and sort the names: os.listdir returns entries in
# arbitrary order, and datafiles[0] is what gets loaded in the next cell.
datafiles = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))
datafiles

['JSVulnerabilityDataSet-1.0.csv']

In [31]:
# Load the first dataset file, remove the columns listed in drop_columns,
# de-duplicate the remaining rows and renumber the index from 0.
drop_columns = ["name", "longname", "path", "full_repo_path", "line", "column", "endline", "endcolumn"]
df = (
    pd.read_csv(f"{data_path}/{datafiles[0]}")
    .drop(columns=drop_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [32]:
# Every column except the last is a feature; the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Count each class once and reuse the counts for both printouts.
class_counts = y.value_counts()
print("y value counts: \n", str(class_counts))
print("y class ratio: 1:", str(round(class_counts[0] / class_counts[1])))

y value counts: 
 0    5367
1     904
Name: Vuln, dtype: int64
y class ratio: 1: 6


In [33]:
# Split first, then fit the scaler on the training rows only. Fitting it on the full
# dataset (as was done before) lets test-set mean/variance leak into the training features.
# X itself is left unscaled, so this cell can be re-run without compounding transforms.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

scaler = StandardScaler()
# Wrap the scaled arrays back into DataFrames so column names and row indices are preserved.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
# The test split is transformed with the statistics learned from the training split.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print("--- y train classes count: \n" + str(y_train.value_counts()))
print("--- y train ratio: 1:" + str(round(y_train.value_counts()[0] / y_train.value_counts()[1])))
print(" ")
print("--- y test classes count: \n" + str(y_test.value_counts()))
print("--- y test ratio: 1:" + str(round(y_test.value_counts()[0] / y_test.value_counts()[1])))

--- y train classes count: 
0    4293
1     723
Name: Vuln, dtype: int64
--- y train ratio: 1:6
 
--- y test classes count: 
0    1074
1     181
Name: Vuln, dtype: int64
--- y test ratio: 1:6


### Normal Run

In [17]:
# normal run - baseline without any oversampling technique
# inputs: X_train, y_train, X_test, y_test

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the untouched training split, then predict on the held-out split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train, y_train)

# The y_pred_* names are consumed by the results-writing cell that follows.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

In [18]:
# Append one row per learner for the baseline run; the runtime column is recorded as 0.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["No", 0, learner] + evaluate_result(y_pred, y_test))

### Random Oversampling

In [19]:
# random oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = RandomOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [20]:
# Append one row per learner for the random-oversampling run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["Random", rt, learner] + evaluate_result(y_pred, y_test))

### ADASYN Oversampling

In [21]:
# ADASYN oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = ADASYNOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [22]:
# Append one row per learner for the ADASYN run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["ADASYN", rt, learner] + evaluate_result(y_pred, y_test))

### BorderlineSMOTE Oversampling

In [23]:
# BorderlineSMOTE oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = BorderlineSMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [24]:
# Append one row per learner for the BorderlineSMOTE run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["BorderlineSMOTE", rt, learner] + evaluate_result(y_pred, y_test))

### SMOTE Oversampling

In [25]:
# SMOTE oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = SMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [26]:
# Append one row per learner for the SMOTE run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["SMOTE", rt, learner] + evaluate_result(y_pred, y_test))

### SVMSMOTE Oversampling

In [27]:
# SVMSMOTE oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = SVMSMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [28]:
# Append one row per learner for the SVMSMOTE run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["SVMSMOTE", rt, learner] + evaluate_result(y_pred, y_test))

### SMOTUNED Oversampling

In [29]:
# SMOTUNED oversampling run - one oversampled training set per learner
# inputs: X_train_new_<learner>, y_train_new_<learner> (per-learner oversampled data), X_test, y_test

# NOTE(review): X_test / y_test are handed to SMOTUNEDOversampling. If the helper uses them
# to pick its oversampling parameters, the reported test scores are optimistically biased;
# confirm against the helper and switch to a validation split if so.
learner_names = ["SVM", "KNN", "LR", "DT", "RF", "LightGBM", "Adaboost", "GBDT"]
tuned = {}
for learner in learner_names:
    # Each call returns (runtime, oversampled X, oversampled y) for that learner.
    tuned[learner] = SMOTUNEDOversampling(X_train=X_train, X_test=X_test,
                                          y_train=y_train, y_test=y_test, model=learner)
    tuned_counts = tuned[learner][2].value_counts()
    print(f"y train ratio of {learner}: 1:" + str(round(tuned_counts[0] / tuned_counts[1])))

# Keep the per-learner names; the rt_* values are consumed by the results-writing cell.
rt_SVM, X_train_new_SVM, y_train_new_SVM = tuned["SVM"]
rt_KNN, X_train_new_KNN, y_train_new_KNN = tuned["KNN"]
rt_LR, X_train_new_LR, y_train_new_LR = tuned["LR"]
rt_DT, X_train_new_DT, y_train_new_DT = tuned["DT"]
rt_RF, X_train_new_RF, y_train_new_RF = tuned["RF"]
rt_LightGBM, X_train_new_LightGBM, y_train_new_LightGBM = tuned["LightGBM"]
rt_Adaboost, X_train_new_Adaboost, y_train_new_Adaboost = tuned["Adaboost"]
rt_GBDT, X_train_new_GBDT, y_train_new_GBDT = tuned["GBDT"]

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Each learner is trained on the data that was oversampled specifically for it.
classifiers = {
    "SVM": clf_SVM,
    "KNN": clf_KNN,
    "LR": clf_LR,
    "DT": clf_DT,
    "RF": clf_RF,
    "LightGBM": clf_LightGBM,
    "Adaboost": clf_Adaboost,
    "GBDT": clf_GBDT,
}
for learner, clf in classifiers.items():
    _, X_tuned, y_tuned = tuned[learner]
    clf.fit(X_tuned, y_tuned)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers.values()]

y train ratio of SVM: 1:1
y train ratio of KNN: 1:1
y train ratio of LR: 1:1
y train ratio of DT: 1:1
y train ratio of RF: 1:1
y train ratio of LightGBM: 1:1
y train ratio of Adaboost: 1:1
y train ratio of GBDT: 1:1


In [30]:
# Append one row per learner for the SMOTUNED run; each learner has its own runtime.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, runtime, y_pred in [
        ("SVM", rt_SVM, y_pred_SVM),
        ("KNN", rt_KNN, y_pred_KNN),
        ("LR", rt_LR, y_pred_LR),
        ("DT", rt_DT, y_pred_DT),
        ("RF", rt_RF, y_pred_RF),
        ("LightGBM", rt_LightGBM, y_pred_LightGBM),
        ("Adaboost", rt_Adaboost, y_pred_Adaboost),
        ("GBDT", rt_GBDT, y_pred_GBDT),
    ]:
        csv_writer.writerow(["SMOTUNED", runtime, learner] + evaluate_result(y_pred, y_test))

### DAZZLE Oversampling

In [31]:
# DAZZLE oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = DAZZLEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

  0%|                                                                           | 0/100 [00:00<?, ?trial/s, best loss=?]

2023-09-07 14:33:25.756658: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-09-07 14:33:25.758365: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


100%|██████████████████████████████████████████████| 100/100 [00:47<00:00,  2.09trial/s, best loss: -0.9074079856732415]
Best Hyperparameters: {'batch_size': 16, 'discriminator_activation_fn': <function relu at 0x7fd4583689d0>, 'discriminator_layer_normalization': True, 'discriminator_lr': 0.005861854470139931, 'discriminator_optimizer': <class 'keras.src.optimizers.nadam.Nadam'>, 'epochs': 10, 'generator_activation_fn': <function leaky_relu at 0x7fd3c42d1550>, 'generator_layer_normalization': False, 'generator_lr': 0.0005962450444945657, 'generator_optimizer': <class 'keras.src.optimizers.rmsprop.RMSprop'>}
Best G-Measure: 0.9074079856732415
 27/157 [====>.........................] - ETA: 0s



y train ratio: 1:1


In [32]:
# Append one row per learner for the DAZZLE run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["DAZZLE", rt, learner] + evaluate_result(y_pred, y_test))

### WGAN Oversampling

In [33]:
# WGAN oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = GANOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

MAX CLASS 4293
CLASS ID 1
Epoch 1/150 completed. Gen loss: 0.026012679561972618. Desc loss_real: -0.009756787680089474. Desc loss_fake: -0.026012679561972618
Epoch 51/150 completed. Gen loss: -0.0012427280889824033. Desc loss_real: 0.003576418850570917. Desc loss_fake: 0.0012427280889824033
Epoch 101/150 completed. Gen loss: 1.1219585758226458e-05. Desc loss_real: 0.0019181367242708802. Desc loss_fake: -1.1219585758226458e-05


  new_data = new_data.append(synthetic_data)
  X_sample = X_sample.append(X_train)
  X_sample = X_sample.drop(tar, 1)


y train ratio: 1:1


In [34]:
# Append one row per learner for the WGAN run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["WGAN", rt, learner] + evaluate_result(y_pred, y_test))

### Random Projection Oversampling

In [6]:
# Random projection oversampling run - oversample the training split, then train every learner on it
# inputs: X_train_new, y_train_new (oversampled training data), X_test, y_test

# rt is written to the "runtime" column of the results CSV by the next cell.
rt, X_train_new, y_train_new = RandomProjectionOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: an unseeded tree permutes features randomly at each
# split, so its scores could differ between otherwise identical runs.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fit every learner on the oversampled data, then predict on the untouched test split.
classifiers = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]
for clf in classifiers:
    clf.fit(X_train_new, y_train_new)

(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [clf.predict(X_test) for clf in classifiers]

y train ratio: 1:1


In [7]:
# Append one row per learner for the random-projection run.
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner, y_pred in [
        ("SVM", y_pred_SVM),
        ("KNN", y_pred_KNN),
        ("LR", y_pred_LR),
        ("DT", y_pred_DT),
        ("RF", y_pred_RF),
        ("LightGBM", y_pred_LightGBM),
        ("Adaboost", y_pred_Adaboost),
        ("GBDT", y_pred_GBDT),
    ]:
        csv_writer.writerow(["RP", rt, learner] + evaluate_result(y_pred, y_test))

### Diveplane Oversampling

In [10]:
from diveplane.utilities import infer_feature_attributes
from diveplane.geminai import Geminai

In [33]:
# Generate a class-balanced synthetic training set with Diveplane Geminai.
# Reads X_train / y_train; produces X_train_new, y_train_new and rt (seconds
# spent training the synthesizer and generating the cases).
tar = y_train.name

# Alternate positive / negative conditions so the synthesized set has as many
# label-1 as label-0 cases (about as many cases as X_train has rows).
conditions = [{tar: 1},
              {tar: 0}] * (len(X_train) // 2)

# Attach the label on a copy. X_train itself must stay label-free: if it were
# mutated in place, a failure further down would leave the label column among
# the features that later cells read from X_train.
train_df = X_train.copy()
train_df[tar] = y_train

partial_features = {"CLLC": {'type': "continuous"}}
features = infer_feature_attributes(train_df, features=partial_features)
for _, f_value in features.items():
    if f_value["type"] == "nominal":
        f_value["non_sensitive"] = True

start_time = time.time()
g = Geminai()
g.train(train_df, features=features)

gen_df = g.synthesize_cases(
    n_samples=len(conditions),
    case_context_values_maps=conditions,
    generate_new_cases="no"
)

rt = time.time() - start_time

# Split by column name rather than position so the label is picked correctly
# even if the synthesizer does not return it as the last column.
X_train_new = gen_df.drop(columns=[tar])
y_train_new = gen_df[tar]



In [34]:
# Report the label-0 : label-1 ratio of the training labels produced by the
# preceding oversampling cell.
label_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(label_counts[0] / label_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other stochastic learners so the DT rows are reproducible.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

learners = (clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT)

# Fit every learner on the same oversampled training data.
for learner in learners:
    learner.fit(X_train_new, y_train_new)

# Predict on X_test; the y_pred_* names are consumed by the result-writing cell.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [
    learner.predict(X_test) for learner in learners
]

y train ratio: 1:1


In [35]:
# Append one result row per learner for the "Diveplane" scheme. Each row is
# [oversampling_scheme, runtime, learner] followed by the metric list
# returned by evaluate_result.
diveplane_predictions = [
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
]
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner_name, learner_pred in diveplane_predictions:
        csv_writer.writerow(["Diveplane", rt, learner_name] + evaluate_result(learner_pred, y_test))

### DataSynthesizer (DS) Oversampling

In [25]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
# Oversample the label-1 class with DataSynthesizer (independent attribute
# mode): describe the label-1 training rows, generate enough synthetic rows
# to match the label-0 count, and append them to the real training data.
mode = "independent_attribute_mode"

col = X_train.columns
tar = y_train.name

extra_dir = f"{os.path.dirname(os.getcwd())}/extra"
pos_csv = f"{extra_dir}/js_vuln_pos_df.csv"
syn_csv = f"{extra_dir}/js_vuln_syn_df.csv"

# Select the label-1 rows through a mask on y_train instead of temporarily
# attaching the label to X_train; X_train is never mutated, so an error in
# this cell cannot leave the label column among the features.
write_df = X_train.loc[y_train == 1]
write_df.to_csv(pos_csv, index=False)

threshold = 20
train_label_counts = y_train.value_counts()
num_tuples_to_generate = int(train_label_counts[0] - train_label_counts[1])

start_time = time.time()

description_file = f"{extra_dir}/js_vuln.json"
describer = DataDescriber(category_threshold=threshold)
describer.describe_dataset_in_independent_attribute_mode(
    dataset_file=pos_csv
)
describer.save_dataset_description_to_file(description_file)

generator = DataGenerator()
generator.generate_dataset_in_independent_mode(num_tuples_to_generate, description_file)
generator.save_synthetic_data(syn_csv)

rt = time.time() - start_time

# Real rows first, synthetic rows appended; assumes the synthetic CSV keeps
# the column order of the CSV it was described from -- TODO confirm.
X_train_new = pd.read_csv(syn_csv).to_numpy()
# All synthetic rows are label 1. Reuse y_train's dtype so the combined label
# vector is not silently upcast to float.
y_train_new = np.ones(num_tuples_to_generate, dtype=y_train.to_numpy().dtype)
X_train_new = pd.DataFrame(np.vstack((X_train.to_numpy(), X_train_new)), columns=col)
y_train_new = pd.Series(np.hstack((y_train.to_numpy(), y_train_new)), name=tar)

In [27]:
# Report the label-0 : label-1 ratio of the training labels produced by the
# preceding oversampling cell.
label_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(label_counts[0] / label_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other stochastic learners so the DT rows are reproducible.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

learners = (clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT)

# Fit every learner on the same oversampled training data.
for learner in learners:
    learner.fit(X_train_new, y_train_new)

# Predict on X_test; the y_pred_* names are consumed by the result-writing cell.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [
    learner.predict(X_test) for learner in learners
]

y train ratio: 1:1


In [28]:
# Append one result row per learner for the "DS" scheme. Each row is
# [oversampling_scheme, runtime, learner] followed by the metric list
# returned by evaluate_result.
ds_predictions = [
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
]
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner_name, learner_pred in ds_predictions:
        csv_writer.writerow(["DS", rt, learner_name] + evaluate_result(learner_pred, y_test))

### SDV (Synthetic Data Vault) Oversampling

In [34]:
from sdv.metadata import SingleTableMetadata
from sdv.lite import SingleTablePreset
from sdv.single_table import CTGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer

In [35]:
# Shared setup for the SDV synthesizer cells: feature column names, label
# name, the number of label-1 rows needed to match the label-0 count, the
# label-1 training rows to learn from, and SDV metadata detected from them.
col = X_train.columns
tar = y_train.name
train_label_counts = y_train.value_counts()
num_tuples_to_generate = int(train_label_counts[0] - train_label_counts[1])

# Select the label-1 rows through a mask on y_train instead of temporarily
# attaching the label to X_train; X_train is never mutated, so an error in
# this cell cannot leave the label column among the features.
pos_df = X_train.loc[y_train == 1]

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=pos_df)

In [40]:
# Oversample with the SDV FAST_ML preset: fit on the label-1 rows, sample
# num_tuples_to_generate synthetic rows, append them to the real training
# data, then train and evaluate every learner. rt times fit + sampling only.
start_time = time.time()
syn1 = SingleTablePreset(metadata, name="FAST_ML")
syn1.fit(data=pos_df)
X_train_new = syn1.sample(num_rows=num_tuples_to_generate).to_numpy()

rt = time.time() - start_time

# Real rows first, synthetic rows appended underneath.
X_train_new = pd.DataFrame(np.vstack((X_train.to_numpy(), X_train_new)), columns=col)
# All synthetic rows are label 1. Reuse y_train's dtype so the combined label
# vector is not silently upcast to float.
y_train_new = np.ones(num_tuples_to_generate, dtype=y_train.to_numpy().dtype)
y_train_new = pd.Series(np.hstack((y_train.to_numpy(), y_train_new)), name=tar)

label_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(label_counts[0] / label_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other stochastic learners so the DT rows are reproducible.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

learners = (clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT)

# Fit every learner on the same oversampled training data.
for learner in learners:
    learner.fit(X_train_new, y_train_new)

# Predict on X_test; the y_pred_* names are consumed by the result-writing cell.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [
    learner.predict(X_test) for learner in learners
]

y train ratio: 1:1


In [41]:
# Append one result row per learner for the "SDV_FASTML" scheme. Each row is
# [oversampling_scheme, runtime, learner] followed by the metric list
# returned by evaluate_result.
fastml_predictions = [
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
]
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner_name, learner_pred in fastml_predictions:
        csv_writer.writerow(["SDV_FASTML", rt, learner_name] + evaluate_result(learner_pred, y_test))

In [44]:
# Oversample with SDV's GaussianCopulaSynthesizer: fit on the label-1 rows,
# sample num_tuples_to_generate synthetic rows, append them to the real
# training data, then train and evaluate every learner. rt times fit +
# sampling only.
start_time = time.time()
syn2 = GaussianCopulaSynthesizer(metadata)
syn2.fit(data=pos_df)
X_train_new = syn2.sample(num_rows=num_tuples_to_generate).to_numpy()

rt = time.time() - start_time

# Real rows first, synthetic rows appended underneath.
X_train_new = pd.DataFrame(np.vstack((X_train.to_numpy(), X_train_new)), columns=col)
# All synthetic rows are label 1. Reuse y_train's dtype so the combined label
# vector is not silently upcast to float.
y_train_new = np.ones(num_tuples_to_generate, dtype=y_train.to_numpy().dtype)
y_train_new = pd.Series(np.hstack((y_train.to_numpy(), y_train_new)), name=tar)

label_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(label_counts[0] / label_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other stochastic learners so the DT rows are reproducible.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

learners = (clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT)

# Fit every learner on the same oversampled training data.
for learner in learners:
    learner.fit(X_train_new, y_train_new)

# Predict on X_test; the y_pred_* names are consumed by the result-writing cell.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [
    learner.predict(X_test) for learner in learners
]



y train ratio: 1:1


In [45]:
# Append one result row per learner for the "SDV_GC" scheme. Each row is
# [oversampling_scheme, runtime, learner] followed by the metric list
# returned by evaluate_result.
gc_predictions = [
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
]
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner_name, learner_pred in gc_predictions:
        csv_writer.writerow(["SDV_GC", rt, learner_name] + evaluate_result(learner_pred, y_test))

In [None]:
# Oversample with SDV's CTGANSynthesizer: fit on the label-1 rows, sample
# num_tuples_to_generate synthetic rows, append them to the real training
# data, then train and evaluate every learner. rt times fit + sampling only.
start_time = time.time()
syn3 = CTGANSynthesizer(metadata)
syn3.fit(data=pos_df)
X_train_new = syn3.sample(num_rows=num_tuples_to_generate).to_numpy()

rt = time.time() - start_time

# Real rows first, synthetic rows appended underneath.
X_train_new = pd.DataFrame(np.vstack((X_train.to_numpy(), X_train_new)), columns=col)
# All synthetic rows are label 1. Reuse y_train's dtype so the combined label
# vector is not silently upcast to float.
y_train_new = np.ones(num_tuples_to_generate, dtype=y_train.to_numpy().dtype)
y_train_new = pd.Series(np.hstack((y_train.to_numpy(), y_train_new)), name=tar)

label_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(label_counts[0] / label_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other stochastic learners so the DT rows are reproducible.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

learners = (clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT)

# Fit every learner on the same oversampled training data.
for learner in learners:
    learner.fit(X_train_new, y_train_new)

# Predict on X_test; the y_pred_* names are consumed by the result-writing cell.
(y_pred_SVM, y_pred_KNN, y_pred_LR, y_pred_DT,
 y_pred_RF, y_pred_LightGBM, y_pred_Adaboost, y_pred_GBDT) = [
    learner.predict(X_test) for learner in learners
]





In [None]:
# Append one result row per learner for the "SDV_GAN" scheme. Each row is
# [oversampling_scheme, runtime, learner] followed by the metric list
# returned by evaluate_result.
gan_predictions = [
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
]
with open(f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv", "a", newline="") as f:
    csv_writer = csv.writer(f)

    for learner_name, learner_pred in gan_predictions:
        csv_writer.writerow(["SDV_GAN", rt, learner_name] + evaluate_result(learner_pred, y_test))