In [49]:
import pandas as pd
import numpy as np
import datetime
import time
import pickle

In [50]:
from sklearn.svm import OneClassSVM
from sklearn.svm import LinearSVC
from sklearn import preprocessing
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import KFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score, precision_score, recall_score, f1_score
from scipy.stats import uniform, expon

In [4]:
# Load the pre-processed feature table, then recode every row whose Label is 0
# to -1 so that the two classes are encoded as -1 / +1.
feature_df = pd.read_csv('../data/processed/processed.csv')
feature_df.loc[feature_df['Label'].eq(0), 'Label'] = -1

In [5]:
# Partition the rows by class label: +1 rows go to malicious_df, -1 rows to benign_df.
# Both frames keep every column; numeric feature matrices and label vectors are
# derived later with df_to_nump / df_to_labels.
malicious_df = feature_df[feature_df['Label'] == 1]
benign_df = feature_df[feature_df['Label'] == -1]

In [6]:
# Hold out 20% of the malicious rows (fixed seed). The complete frame is passed as
# the feature table, so X_train / X_test are DataFrames that still carry 'Label'
# and the other non-training columns.
X_train, X_test, y_train, y_test = train_test_split(
    malicious_df,
    malicious_df['Label'],
    test_size=0.2,
    random_state=42,
)

In [84]:
# test = pd.concat([feature_df.loc[feature_df['Label'] == 1].sample(100, random_state=2),
#                   feature_df.loc[feature_df['Label'] == -1].sample(100, random_state=2)],
#                  axis=0)
# X = test.drop(columns=cols_not_in_training).to_numpy()
# Y = test['Label'].to_numpy()
# X = feature_x
# Y = feature_y

In [51]:
def get_calibrated_clf(clf):
    """Wrap ``clf`` in a ``CalibratedClassifierCV`` built with default settings.

    The wrapper is only constructed here; it is returned without being fitted.
    """
    calibrated = CalibratedClassifierCV(clf)
    return calibrated

def get_confusion_matrix(true_label, predict_results, labels=None):
    """Return the confusion matrix flattened to one dimension.

    Args:
        true_label: Ground-truth labels.
        predict_results: Predicted labels.
        labels: Optional explicit label order forwarded to ``confusion_matrix``.
            Pass e.g. ``[-1, 1]`` to always obtain four cells (tn, fp, fn, tp),
            even when only one class occurs in the data; with the default
            (``None``) the matrix only covers the labels actually present.

    Returns:
        1-D array containing the matrix cells in row-major order.
    """
    matrix = confusion_matrix(true_label, predict_results, labels=labels)
    return matrix.ravel()

def df_to_nump(df, col_exclude_training=None):
    """Convert a feature DataFrame into the NumPy matrix used for training.

    Args:
        df: Frame that contains every column named in ``col_exclude_training``.
        col_exclude_training: Columns to drop before conversion. Defaults to the
            notebook's standard list of non-training columns (identifiers,
            categorical fields and the label).

    Returns:
        2-D array with the remaining columns, in their original order.

    Raises:
        KeyError: If a column to drop is missing from ``df``.
    """
    if col_exclude_training is None:
        col_exclude_training = ['StartTime', 'Dir', 'Proto', 'State', 'Label', 'SrcAddr',
                                'Sport', 'DstAddr', 'Dport', 'sTos', 'dTos', 'is_fwd']
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    return df.drop(columns=list(col_exclude_training)).to_numpy()

def df_to_labels(df):
    """Return the values of the frame's 'Label' column as a NumPy array."""
    label_column = df['Label']
    return label_column.to_numpy()

def fit_predict_model(clf, X, y, scaler_obj):
    """Fit the scaler and the model on ``X`` and predict the training data back.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Numeric training feature matrix.
        y: Training labels, forwarded to ``clf.fit``.
        scaler_obj: Transformer exposing ``fit(X)`` (returning the fitted
            transformer) and ``transform(X)``.

    Returns:
        dict with
            'model': the fitted estimator,
            'self_predict': predictions of the fitted estimator on the scaled
                training data,
            'scaler': the fitted scaler, needed to transform any data that is
                later passed to the model.
    """
    print('Training Model')
    fitted_scaler = scaler_obj.fit(X)
    x_scaled = fitted_scaler.transform(X)
    clf.fit(x_scaled, y)
    # scikit-learn's fit() returns the estimator, not predictions, so the
    # training-set predictions need an explicit predict() call.
    self_predict_r = clf.predict(x_scaled)
    print('Training Model Completed')
    return {'model': clf, 'self_predict': self_predict_r, 'scaler': fitted_scaler}

def save_model(model, dir_path, model_name):
    """Pickle ``model`` to ``{dir_path}{model_name}.sav``.

    ``dir_path`` is concatenated verbatim with the file name, so it must already
    end with a path separator (e.g. ``'./'``).

    NOTE: pickle files must only ever be loaded from trusted sources, because
    unpickling can execute arbitrary code.
    """
    print('Saving Model')
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling raises.
    with open(f'{dir_path}{model_name}.sav', 'wb') as model_file:
        pickle.dump(model, model_file)

In [78]:
# Metrics reported for every candidate classifier (positive class = label 1):
#   precision = TP / (TP + FP)  -> falls as false positives increase
#   recall    = TP / (TP + FN)  -> falls as false negatives increase
#   f1        = 2 * (precision * recall) / (precision + recall)
scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
# Display name for each metric, in the order the report is printed.
metric_titles = [('accuracy', 'Accuracy'), ('recall', 'Recall'), ('f1', 'F1'),
                 ('precision', 'Precision'), ('roc_auc', 'ROC')]

# Build the evaluation data in this cell so it runs on a fresh kernel.
# NOTE(review): this uses the complete feature table; sub-sample feature_df first
# if cross-validating the SVM on all rows is too slow.
X = df_to_nump(feature_df)
Y = df_to_labels(feature_df)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

clf_list = []
# Previously tried configurations, kept for reference:
# clf_list.append(make_pipeline(preprocessing.StandardScaler(), LinearSVC(C=27.534917537749216, dual=False, tol=0.0048028537307841352)))
# clf_list.append(make_pipeline(preprocessing.StandardScaler(), OneClassSVM(kernel="rbf", gamma=0.0121072443425558, cache_size=500, nu=0.11932807423095282)))
# clf_list.append(svm.OneClassSVM(kernel="rbf", gamma=1e-05, cache_size=400, nu=1e-05))
# clf_list.append(make_pipeline(preprocessing.StandardScaler(), svm.OneClassSVM(kernel="rbf", gamma=1e-02, cache_size=1000, nu=1e-05)))
# clf_list.append(make_pipeline(preprocessing.RobustScaler(), svm.OneClassSVM(kernel="rbf", gamma=1e-05, cache_size=1000, nu=1e-05)))
# clf_list.append(make_pipeline(preprocessing.RobustScaler(), svm.OneClassSVM(kernel="rbf", gamma='scale', cache_size=1000, nu=1e-05)))
clf_list.append(make_pipeline(preprocessing.StandardScaler(), OneClassSVM(kernel="rbf", gamma=1e-05, cache_size=500, nu=1e-05)))

for clf_number, clf in enumerate(clf_list, start=1):
    # Pass the metric list (not the single string 'accuracy') so the result holds
    # one 'test_<metric>' entry per metric, which is what the report reads.
    scores = cross_validate(clf, X, Y, scoring=scoring, cv=skf, n_jobs=5)
    print(f'----Classifier #{clf_number}-----')
    print(scores['test_accuracy'])
    print("Sum Fit Time: %0.5f" % (scores['fit_time'].sum()))
    for metric, title in metric_titles:
        # Mean and spread are both expressed in percent.
        fold_scores = scores[f'test_{metric}'] * 100
        print("%s: %0.2f (+/- %0.2f)" % (title, fold_scores.mean(), fold_scores.std() * 2))
    print()

In [52]:
def hyper_tuning(classifier, tuned_parameters, X, y, score='average_precision',
                 n_jobs=5, cv=5, n_iter=10, random_state=0):
    """Cross-validated hyper-parameter search for ``classifier`` on standardised features.

    The classifier is wrapped in a ``StandardScaler`` pipeline so that the scaler
    is re-fitted on the training part of every fold; scaling ``X`` once up front
    would let statistics of the held-out part leak into training.

    Args:
        classifier: Unfitted estimator to tune.
        tuned_parameters: Parameter space, a dict or a list of dicts keyed by the
            classifier's own parameter names. Values are either explicit
            candidate lists (exhaustive grid search) or objects with an ``rvs``
            method such as scipy.stats distributions (randomized search, since
            ``GridSearchCV`` only accepts explicit candidates).
        X: Raw (unscaled) feature matrix.
        y: Labels passed to the search.
        score: Scoring name used to rank candidates.
        n_jobs: Number of parallel workers.
        cv: Cross-validation specification forwarded to the search.
        n_iter: Number of sampled candidates; used by the randomized search only.
        random_state: Seed of the randomized search, for reproducible sampling.

    Returns:
        dict: Best parameter combination, keyed by the classifier's own parameter
        names (without the pipeline step prefix).
    """
    print("# Tuning hyper-parameters for %s" % score)
    print()

    pipeline = make_pipeline(preprocessing.StandardScaler(), classifier)
    # Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
    step_prefix = pipeline.steps[-1][0] + '__'

    def strip_prefix(params):
        """Map pipeline parameter names back to plain classifier parameter names."""
        return {name[len(step_prefix):] if name.startswith(step_prefix) else name: value
                for name, value in params.items()}

    grids = [tuned_parameters] if isinstance(tuned_parameters, dict) else list(tuned_parameters)
    search_space = [{step_prefix + name: candidates for name, candidates in grid.items()}
                    for grid in grids]
    if len(search_space) == 1:
        # A single dict is accepted by every version of both search classes.
        search_space = search_space[0]

    has_distribution = any(hasattr(candidates, 'rvs')
                           for grid in grids for candidates in grid.values())
    if has_distribution:
        clf = RandomizedSearchCV(pipeline, search_space, n_iter=n_iter, scoring=score,
                                 n_jobs=n_jobs, cv=cv, random_state=random_state)
    else:
        clf = GridSearchCV(pipeline, search_space, scoring=score, n_jobs=n_jobs, cv=cv)
    clf.fit(X, y)

    best_params = strip_prefix(clf.best_params_)
    print("Best parameters set found on development set:")
    print()
    print(best_params)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, strip_prefix(params)))
    print()
    return best_params

def tune_linear_svc(X, y):
    """Search ``tol`` and ``C`` of a ``LinearSVC`` (with ``dual=False``) via ``hyper_tuning``.

    ``tol`` and ``C`` are described by continuous exponential distributions, so
    ``hyper_tuning`` has to run a randomized search for this space (``GridSearchCV``
    only accepts explicit candidate lists).

    Args:
        X: Feature matrix forwarded to ``hyper_tuning``.
        y: Labels forwarded to ``hyper_tuning``.

    Returns:
        Whatever ``hyper_tuning`` returns (the best parameter set), so callers can
        build the final model from it, as with ``tune_oneclass``.
    """
    tuned_parameters = [{'tol': expon(scale=.01),
                         'C': expon(scale=100),
                         'dual': [False]
                         }]
    # Results recorded from earlier runs:
    # 0.849 (+/-0.015) for {'C': 27.534917537749216, 'dual': False, 'tol': 0.004802853730784135}
    # {'C': 34.49337686017465, 'dual': False, 'tol': 0.0004567829261173656} sample = 100000, F1.
    # 0.872 (+/-0.006) for {'C': 10.348501146284026, 'dual': False, 'tol': 0.00017506509292527104} Sample = 100k. Precision
    # 0.949 {'C': 32.560339433948236, 'dual': False, 'tol': 0.0014118363259887406} 100k, roc_auc
    # 0.952 (+/-0.002) for {'C': 194.35726726323622, 'dual': False, 'tol': 0.003035578173309056} Sample = 200k. average precision
    return hyper_tuning(LinearSVC(), tuned_parameters, X, y)
def tune_oneclass(X, y, expected_outliar_size):
    """Grid-search ``gamma`` of an RBF ``OneClassSVM`` with ``nu`` fixed up front.

    Args:
        X: Training feature matrix; ``len(X)`` is used as the sample count.
        y: Labels forwarded to ``hyper_tuning``.
        expected_outliar_size: Expected number of outliers among the rows of ``X``.
            ``nu`` is set to ``expected_outliar_size / len(X)``.

    Returns:
        Whatever ``hyper_tuning`` returns (the best parameter set).

    Raises:
        ValueError: If ``X`` is empty or the derived ``nu`` is outside (0, 1],
            the range ``OneClassSVM`` accepts.
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("X is empty; cannot derive nu for OneClassSVM")
    nu = expected_outliar_size / n_samples
    # Fail early with a clear message instead of letting every fit of the search fail.
    if not 0 < nu <= 1:
        raise ValueError(
            "nu = expected_outliar_size / len(X) = %r is outside (0, 1]; "
            "expected_outliar_size must be between 1 and len(X)" % nu
        )

    tuned_parameters = [{'kernel': ['rbf'],
                         'gamma': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
                         'nu': [nu]}]
    # Results recorded from earlier runs that sampled gamma / nu from
    # expon(scale=.1) distributions:
    # 0.500 (+/-0.846) for {'gamma': 0.0121072443425558, 'kernel': 'rbf', 'nu': 0.11932807423095282}
    # 0.351 (+/-0.589) for {'gamma': 0.5389663750979422, 'kernel': 'rbf', 'nu': 0.03258700301586109}
    # 0.477 (+/-0.807) for {'gamma': 0.03226814043225676, 'kernel': 'rbf', 'nu': 0.15264728096179458}
    # 0.490 (+/-0.829) for {'gamma': 0.03549584357487325, 'kernel': 'rbf', 'nu': 0.11916614740608962}
    # 0.486 (+/-0.822) for {'gamma': 0.04714615631138286, 'kernel': 'rbf', 'nu': 0.1208709207242509}
    return hyper_tuning(OneClassSVM(), tuned_parameters, X, y)

In [53]:
def model_performance_metrics(y_true, y_pred, y_score=None):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted (hard) labels.
        y_score: Optional continuous scores (e.g. ``decision_function`` output)
            used for the average-precision metric. When omitted, the hard
            predictions are used, which reduces the precision-recall curve to a
            single operating point.

    Returns:
        dict with the keys 'accuracy', 'recall', 'precision', 'f1',
        'average_precision' and 'confusion_matrix'.
    """
    ranking_scores = y_pred if y_score is None else y_score
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average='binary'),
        'precision': precision_score(y_true, y_pred, average='binary'),
        'f1': f1_score(y_true, y_pred, average='binary'),
        'average_precision': average_precision_score(y_true, ranking_scores),
        'confusion_matrix': get_confusion_matrix(y_true, y_pred),
    }

In [54]:
def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test, y_test)
    print(model_performance_metrics(y_test, y_pred))

In [None]:
# Tune, fit, persist and evaluate the one-class model on the malicious training split.
# X_train / X_test are DataFrames that still carry non-numeric columns, so they
# are converted with df_to_nump before reaching the scaler or the model.
train_features = df_to_nump(X_train)

# NOTE(review): nu is derived as len(benign_df) / len(X_train); OneClassSVM needs
# nu in (0, 1], so this only works while there are no more benign rows than
# training rows - confirm the intended "expected outlier" count.
best_params = tune_oneclass(train_features, y_train, len(benign_df))
clf = OneClassSVM(kernel=best_params['kernel'], nu=best_params['nu'], gamma=best_params['gamma'])
scaler = preprocessing.StandardScaler()
modeling_dict = fit_predict_model(clf, train_features, y_train, scaler)
clf = modeling_dict['model']
# NOTE(review): only the estimator is persisted; the fitted scaler is required as
# well to use the saved model on new data.
save_model(clf, './', 'oneclass')

self_predict_r = modeling_dict['self_predict']
if not isinstance(self_predict_r, np.ndarray):
    # fit_predict_model may hand back the estimator rather than predictions;
    # compute the training-set predictions explicitly in that case.
    self_predict_r = clf.predict(scaler.transform(train_features))
print(model_performance_metrics(y_train, self_predict_r))

# Evaluation data must go through the same fitted scaler as the training data,
# and predict() takes the features only.
print(model_performance_metrics(y_test, clf.predict(scaler.transform(df_to_nump(X_test)))))
print(model_performance_metrics(df_to_labels(benign_df), clf.predict(scaler.transform(df_to_nump(benign_df)))))

# Tuning hyper-parameters for average_precision



In [None]:
# 1. Linear Regression
# 2. Logistic Regression
# 3. CART
# 4. Naïve Bayes
# 5. KNN
# 6. Random Forests
# Logistic Regression, Support Vector Machine, Decision Tree, Random Forest, and Adaboosting
# RandomForest 1 0 1 1
# AdaBoostM1 1 0 1 1
# Bagging 1 0 1 1
# LogitBoost


In [None]:
# from sklearn.svm import OneClassSVM 
# train, test = train_test_split(data, test_size=.2) 
# train_normal = train[train['y']==0] 
# train_outliers = train[train['y']==1] 
# outlier_prop = len(train_outliers) / len(train_normal) 
# svm = OneClassSVM(kernel='rbf', nu=outlier_prop, gamma=0.000001) svm.fit(train_normal[['x1','x4','x5']])

In [None]:
# df_minority_upsampled = resample(df_minority, 
#                                  replace=True,     # sample with replacement
#                                  n_samples=576,    # to match majority class
#                                  random_state=123) # reproducible results

In [50]:
# def get_predict(X_train, X_test):
#     clf = svm.OneClassSVM(kernel="rbf", gamma='scale', cache_size=8000, nu=0.01)
#     scaler = preprocessing.StandardScaler().fit(X_train)
# #     scaler = preprocessing.RobustScaler().fit(X_train)
#     print(f'Training: {get_percentage(clf.fit_predict(scaler.transform(X_train)))}%')
#     benign_test = benign_df.to_numpy()
#     real_acc_results = clf.predict(scaler.transform(benign_test))
# #     real_df = pd.DataFrame(data={'Results': real_acc_results})
# #     print(real_df['Results'].value_counts())
# #     print(real_df.head(10))
#     print(f'Real Accuracy: {get_percentage(real_acc_results, False)}%' )
#     test_result = clf.predict(scaler.transform(X_test))
#     return test_result

In [None]:
# def get_decision(X_train, X_test):
#     clf = svm.OneClassSVM(kernel="rbf", gamma='scale', cache_size=5000)
#     scaler = preprocessing.StandardScaler().fit(X_train)
# #     scaler = preprocessing.RobustScaler().fit(X_train)
#     clf.fit(scaler.transform(X_train))
#     return np.sum(clf.score_samples(scaler.transform(X_test)))

In [None]:
# def get_percentage(arr, is_test=True):
#     total = len(arr)
#     count = 0
#     if is_test:
#         for item in arr:
#             if item == 1:
#                 count = count + 1
#     else:
#         for item in arr:
#             if item == -1:
#                 count = count + 1
#     return (count/total * 100)
#     print(f'Percentage: {(count/total * 100)}%')

In [None]:
# benign_df.head(10)

In [None]:
# X = feature_df.iloc[:200000].to_numpy()
# kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# add = 0
# score = []
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     start = time.time()
#     result = get_predict(X_train, X_test)
#     score.append(result)
# for single_test in score:
#     print(f'Test: {get_percentage(single_test)}%')

# for s in score:
#     add = add + get_percentage(s)
# mean = add/len(score)
# print(f'Mean Test: {mean}%')

In [None]:
# results = []
# for train_index, test_index in kf.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     results.append(get_predict(X_train, X_test))
#     print('One Done')

In [None]:
# get_percentage(clf_2.predict(scaler_2.transform(benign_df.to_numpy())), False)

In [None]:
# get_percentage(clf_2.predict(scaler_2.transform(final_data)))

In [None]:
# X = feature_x[:150000]
# Y = feature_y[:150000]

# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# for train_index, test_index in kf.split(X, Y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = Y[train_index], Y[test_index]
#     clf_2 = svm.OneClassSVM(kernel="rbf", gamma='scale', cache_size=400, nu=0.01)
#     scaler_2 = preprocessing.StandardScaler().fit(X_train)
#     clf_2.fit(scaler_2.transform(X_train))
#     result = clf_2.predict(scaler_2.transform(X_test))
#     print((len([ res for res in result if res == 1])/len(result))*100)

In [None]:
# accuracy, precision, recall, confusion matrix

In [None]:
# raw+ discretized + engineered + real label + predicted label + confidence score