In [1]:
import pandas as pd
import csv
import os
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# Make the project's ``src`` directory (located in the parent of the current
# working directory) importable so the local ``utils`` module can be found.
# The membership check keeps this cell idempotent: re-running it does not
# prepend another duplicate entry to ``sys.path``.
src_dir = os.path.join(os.path.dirname(os.getcwd()), "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)
from utils import evaluate_result

# JavaScript_Vulnerability

In [2]:
# Directory holding the JavaScript vulnerability data set, resolved relative
# to the parent of the current working directory.
data_path = f"{os.path.dirname(os.getcwd())}/data/JavaScript_Vulnerability/"
# os.listdir() returns entries in arbitrary order, so sort them to make
# ``datafiles[0]`` deterministic. Match the ".csv" extension rather than any
# file name that merely happens to end in the letters "csv".
datafiles = sorted(f for f in os.listdir(data_path) if f.endswith(".csv"))
datafiles

['JSVulnerabilityDataSet-1.0.csv']

In [5]:
# Columns removed before modelling (listed by name; kept in a variable so the
# selection is visible in one place).
drop_columns = [
    "name",
    "longname",
    "path",
    "full_repo_path",
    "line",
    "column",
    "endline",
    "endcolumn",
]

# Load the first discovered CSV, drop the columns above, then remove
# duplicated rows.
df = (
    pd.read_csv(f"{data_path}/{datafiles[0]}")
    .drop(columns=drop_columns)
    .drop_duplicates()
)

In [6]:
# Features are every column except the last one; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Count each target label once and reuse the result for both printouts.
label_counts = y.value_counts()
print("y value counts: \n", str(label_counts))
# Ratio of label-0 rows to label-1 rows, rounded to the nearest integer.
print("y class ratio: 1:", str(round(label_counts[0] / label_counts[1])))

y value counts: 
 0    5367
1     904
Name: Vuln, dtype: int64
y class ratio: 1: 6


In [7]:
# Hold out 20% of the rows for testing; stratify on y so both partitions keep
# the same label proportions, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Per-label row counts of each partition, computed once and reused below.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()

print("--- y train classes count: \n" + str(train_counts))
print("--- y train ratio: 1:" + str(round(train_counts[0] / train_counts[1])))
print(" ")
print("--- y test classes count: \n" + str(test_counts))
print("--- y test ratio: 1:" + str(round(test_counts[0] / test_counts[1])))

--- y train classes count: 
0    4293
1     723
Name: Vuln, dtype: int64
--- y train ratio: 1:6
 
--- y test classes count: 
0    1074
1     181
Name: Vuln, dtype: int64
--- y test ratio: 1:6


## Normal Run

In [8]:
# --- Instantiate the classifiers -------------------------------------------
# Every estimator that accepts a seed is given random_state=42 so that a
# fresh "Restart & Run All" reproduces the same predictions.
clf_SVM = SVC()
clf_KNN = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
clf_LR = LogisticRegression(random_state=42, solver="saga", max_iter=20000, n_jobs=-1)
# DecisionTreeClassifier permutes the candidate features at every split, so
# without a fixed seed its fitted tree (and its predictions) can change from
# run to run; seed it like the other estimators.
clf_DT = DecisionTreeClassifier(random_state=42)
clf_RF = RandomForestClassifier(random_state=42, n_jobs=-1)
clf_LightGBM = LGBMClassifier(objective="binary", random_state=42, n_jobs=-1)
clf_Adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
clf_GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

# --- Fit each classifier on the training partition -------------------------
clf_SVM.fit(X_train, y_train)
clf_KNN.fit(X_train, y_train)
clf_LR.fit(X_train, y_train)
clf_DT.fit(X_train, y_train)
clf_RF.fit(X_train, y_train)
clf_LightGBM.fit(X_train, y_train)
clf_Adaboost.fit(X_train, y_train)
clf_GBDT.fit(X_train, y_train)

# --- Predict labels for the held-out test partition ------------------------
y_pred_SVM = clf_SVM.predict(X_test)
y_pred_KNN = clf_KNN.predict(X_test)
y_pred_LR = clf_LR.predict(X_test)
y_pred_DT = clf_DT.predict(X_test)
y_pred_RF = clf_RF.predict(X_test)
y_pred_LightGBM = clf_LightGBM.predict(X_test)
y_pred_Adaboost = clf_Adaboost.predict(X_test)
y_pred_GBDT = clf_GBDT.predict(X_test)

In [9]:
# Evaluate the SVM predictions against the test labels; the project helper
# returns twelve values, unpacked here by name.
# NOTE(review): the call passes (predictions, ground truth) in that order —
# confirm this matches the parameter order of utils.evaluate_result.
tn, fp, fn, tp, recall, fpr, prec, acc, f1, auc, g_score, d2h = evaluate_result(y_pred_SVM, y_test)