In [1]:
import pandas as pd
import csv
import os
import sys
import csv

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

sys.path.insert(0, f"{os.path.dirname(os.getcwd())}/src")
from utils import evaluate_result
from data_imbalance_src.oversampling import RandomOversampling, ADASYNOversampling, BorderlineSMOTEOversampling, SMOTEOversampling, SVMSMOTEOversampling

# JavaScript_Vulnerability

In [2]:
# Start a fresh results file for this notebook run and write its header row.
# The evaluation cells further down reopen the same path in append mode.
write_path = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"

# open(..., "w") does not create missing parent folders, so make sure the
# result directory exists (no-op when it is already there).
os.makedirs(os.path.dirname(write_path), exist_ok=True)

# newline="" lets the csv module control line endings itself.
with open(write_path, "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["oversampling_scheme", "runtime", "learner", "acc", "prec", "recall", "fpr", "f1", "auc", "g_score", "d2h"])

In [3]:
# Locate the raw dataset files for the JavaScript vulnerability study.
data_path = f"{os.path.dirname(os.getcwd())}/data/JavaScript_Vulnerability/"
# Match the real ".csv" extension (a bare "csv" suffix would also accept names
# such as "notes_csv"), and sort the result: os.listdir returns entries in
# arbitrary order, while the loading cell picks datafiles[0].
datafiles = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))
datafiles

['JSVulnerabilityDataSet-1.0.csv']

In [4]:
# Load the first listed dataset file and reduce it to the modelling columns.
raw_csv_path = f"{data_path}/{datafiles[0]}"
# Columns removed before modelling; by their names they look like identifiers
# and source locations rather than metrics (assumption - confirm against the
# dataset description).
drop_columns = ["name", "longname", "path", "full_repo_path", "line", "column", "endline", "endcolumn"]
# Duplicates are removed after the column drop, so rows are compared on the
# remaining columns only.
df = (
    pd.read_csv(raw_csv_path)
    .drop(columns=drop_columns)
    .drop_duplicates()
)

In [5]:
# The last column of df is used as the target; everything before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Count each class once and reuse the counts for both printouts.
class_counts = y.value_counts()
print("y value counts: \n", str(class_counts))
# Rounded count of label 0 per single row of label 1.
print("y class ratio: 1:", str(round(class_counts[0] / class_counts[1])))

y value counts: 
 0    5367
1     904
Name: Vuln, dtype: int64
y class ratio: 1: 6


In [6]:
# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of y in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Per-split class counts, computed once and reused for the ratio lines.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()

print("--- y train classes count: \n" + str(train_counts))
print("--- y train ratio: 1:" + str(round(train_counts[0] / train_counts[1])))
print(" ")
print("--- y test classes count: \n" + str(test_counts))
print("--- y test ratio: 1:" + str(round(test_counts[0] / test_counts[1])))

--- y train classes count: 
0    4293
1     723
Name: Vuln, dtype: int64
--- y train ratio: 1:6
 
--- y test classes count: 
0    1074
1     181
Name: Vuln, dtype: int64
--- y test ratio: 1:6


### Normal Run

In [7]:
# normal run - without any oversampling technique
# Trains every learner on the original training split and predicts on the test split.
# inputs: X_train, y_train, X_test, y_test

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train, y_train)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

In [8]:
# Append the baseline (no oversampling) results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, runtime (0 here, no oversampling step ran),
    # learner name, then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["No", 0, learner_name] + evaluate_result(y_pred, y_test))

### Random Oversampling

In [10]:
# random oversampling run - random oversampling technique
# Resamples the training split, then trains every learner on the resampled
# data and predicts on the untouched test split.
# inputs: X_train_new, y_train_new (resampled training data), X_test, y_test

# rt is written to the "runtime" column by the results cell; presumably the
# time the oversampler took - confirm in data_imbalance_src.oversampling.
rt, X_train_new, y_train_new = RandomOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train_new, y_train_new)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

y train ratio: 1:1


In [11]:
# Append the random-oversampling results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, rt from the oversampling cell, learner name,
    # then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["Random", rt, learner_name] + evaluate_result(y_pred, y_test))

### ADASYN Oversampling

In [12]:
# ADASYN oversampling run - ADASYN oversampling technique
# Resamples the training split, then trains every learner on the resampled
# data and predicts on the untouched test split.
# inputs: X_train_new, y_train_new (resampled training data), X_test, y_test

# rt is written to the "runtime" column by the results cell; presumably the
# time the oversampler took - confirm in data_imbalance_src.oversampling.
rt, X_train_new, y_train_new = ADASYNOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train_new, y_train_new)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

y train ratio: 1:1


In [13]:
# Append the ADASYN results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, rt from the oversampling cell, learner name,
    # then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["ADASYN", rt, learner_name] + evaluate_result(y_pred, y_test))

### BorderlineSMOTE Oversampling

In [14]:
# BorderlineSMOTE oversampling run - BorderlineSMOTE oversampling technique
# Resamples the training split, then trains every learner on the resampled
# data and predicts on the untouched test split.
# inputs: X_train_new, y_train_new (resampled training data), X_test, y_test

# rt is written to the "runtime" column by the results cell; presumably the
# time the oversampler took - confirm in data_imbalance_src.oversampling.
rt, X_train_new, y_train_new = BorderlineSMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train_new, y_train_new)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

y train ratio: 1:1


In [15]:
# Append the BorderlineSMOTE results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, rt from the oversampling cell, learner name,
    # then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["BorderlineSMOTE", rt, learner_name] + evaluate_result(y_pred, y_test))

### SMOTE Oversampling

In [16]:
# SMOTE oversampling run - SMOTE oversampling technique
# Resamples the training split, then trains every learner on the resampled
# data and predicts on the untouched test split.
# inputs: X_train_new, y_train_new (resampled training data), X_test, y_test

# rt is written to the "runtime" column by the results cell; presumably the
# time the oversampler took - confirm in data_imbalance_src.oversampling.
rt, X_train_new, y_train_new = SMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train_new, y_train_new)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

y train ratio: 1:1


In [17]:
# Append the SMOTE results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, rt from the oversampling cell, learner name,
    # then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["SMOTE", rt, learner_name] + evaluate_result(y_pred, y_test))

### SVMSMOTE Oversampling

In [18]:
# SVMSMOTE oversampling run - SVMSMOTE oversampling technique
# Resamples the training split, then trains every learner on the resampled
# data and predicts on the untouched test split.
# inputs: X_train_new, y_train_new (resampled training data), X_test, y_test

# rt is written to the "runtime" column by the results cell; presumably the
# time the oversampler took - confirm in data_imbalance_src.oversampling.
rt, X_train_new, y_train_new = SVMSMOTEOversampling(X_train=X_train, y_train=y_train)

resampled_counts = y_train_new.value_counts()
print("y train ratio: 1:" + str(round(resampled_counts[0] / resampled_counts[1])))

# create models
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# Seeded like the other learners: scikit-learn's decision tree permutes the
# features at each split, so an unseeded tree can differ between reruns.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# Fixed order: it determines which prediction lands in which y_pred_* name below.
learners = [clf_SVM, clf_KNN, clf_LR, clf_DT, clf_RF, clf_LightGBM, clf_Adaboost, clf_GBDT]

# Fit every learner first, then predict, exactly as the unrolled version did.
for clf in learners:
    clf.fit(X_train_new, y_train_new)

# Keep the individual y_pred_* names; the results-writing cell reads them.
(
    y_pred_SVM,
    y_pred_KNN,
    y_pred_LR,
    y_pred_DT,
    y_pred_RF,
    y_pred_LightGBM,
    y_pred_Adaboost,
    y_pred_GBDT,
) = [clf.predict(X_test) for clf in learners]

y train ratio: 1:1


In [19]:
# Append the SVMSMOTE results, one row per learner.
results_csv = f"{os.path.dirname(os.getcwd())}/result/JS_Vuln_res.csv"
scored_learners = (
    ("SVM", y_pred_SVM),
    ("KNN", y_pred_KNN),
    ("LR", y_pred_LR),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("LightGBM", y_pred_LightGBM),
    ("Adaboost", y_pred_Adaboost),
    ("GBDT", y_pred_GBDT),
)
with open(results_csv, "a", newline="") as f:
    csv_writer = csv.writer(f)
    # Row layout: scheme label, rt from the oversampling cell, learner name,
    # then the list returned by evaluate_result.
    # NOTE(review): evaluate_result receives (predictions, y_test) in that
    # order - confirm against its signature in utils.
    for learner_name, y_pred in scored_learners:
        csv_writer.writerow(["SVMSMOTE", rt, learner_name] + evaluate_result(y_pred, y_test))