# Basic data and Feature preparation
- Load parquet data 
- Select features

And basicly, prepare pandas dataframe for manipulation




## About SVM and Results
- Since the data is not linearly separable, we use SVM with kernel trick
- We use our own implementation of GridSearch to find optimal parameters. 

1. Take smaller batch of data, like 1-2% of the whole data and tune parameters
2. With tuned parameters than train the main classifier. It can take a looong time. 
   
SVM training time is not linear, it is quadratic. So, if you double the data, it will take 4 times longer to train and it really can be run on gpu.
So take a deep breath, beefed up computer and run it. (hours)
- for 10% data ~ 20min
- for 50% data ~ 2h
- for 20% data ~ 1h
- for 100% data ~ 10h


### Results
The more data you use, the better results you get.
For 1% of data, we got around ~0.6 f1 score
For 20% of the data, we got around ~0.79 f1 score


I expect around 0.9-0.96 f1 score for 100% of the data.... But it needs to be tested

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import time

phishing = pq.read_table('./floor/phishing_2307.parquet')
benign = pq.read_table('./floor/benign_2307.parquet')

from transformers.drop_nontrain import drop_nontrain_table as drop_nontrain
phishing = drop_nontrain(phishing)
benign = drop_nontrain(benign)

# realign schemas (parquet files save in nonsense orders)
benign = benign.cast(phishing.schema)

# concatentate tables
data = pa.concat_tables([phishing, benign])
df = data.to_pandas()

from transformers.cast_timestamp import cast_timestamp
df = cast_timestamp(df)

used_features = [
    
    # IP  ===============================
    # old (Adam) & still used
    "ip_mean_average_rtt",
    "ip_entropy",
    
    # new
    "ip_count", "ip_v4_count", "ip_v6_count",
    
    
    # DNS  ===============================
    # old (Adam) & still used
    "dns_A_count",
    "dns_AAAA_count",
    "dns_CNAME_count",
    "dns_MX_count",
    "dns_NS_count",
    "dns_SOA_count",
    "dns_TXT_count",
    "dns_soa_primary_ns_len",
    "dns_soa_primary_ns_level", # renamed
    "dns_soa_primary_ns_digit_count",
    "dns_soa_primary_ns_entropy",
    "dns_soa_email_len",
    "dns_soa_email_level", # renamed
    "dns_soa_email_digit_count",
    "dns_soa_email_entropy",
    "dns_soa_serial",
    "dns_soa_refresh",
    "dns_soa_retry",
    "dns_soa_expire",
    #"dns_soa_neg_resp_caching_ttl",
    "dns_mx_mean_len",
    "dns_mx_mean_entropy",
    "dns_domain_name_in_mx",
    #"dns_txt_google_verified",
    "dns_txt_spf_exists",
    "dns_txt_mean_entropy",
    
    # new
    "dns_txt_dkim_exists",
    
    # TLS  ===============================
    # old (Adam) & still used
    
    "tls_broken_chain",
    "tls_expired_chain",
    "tls_total_extension_count",
    "tls_critical_extensions",
    "tls_with_policies_crt_count",
    "tls_percentage_crt_with_policies",
    "tls_x509_anypolicy_crt_count",
    "tls_iso_policy_crt_count",
    "tls_joint_isoitu_policy_crt_count",
    "tls_iso_policy_oid",
    "tls_isoitu_policy_oid",
    #"tls_unknown_policy_crt_count", <-- Abandoned, no useful values
    "tls_subject_count",
    "tls_server_auth_crt_count",
    "tls_client_auth_crt_count",
    "tls_CA_certs_in_chain_ratio",
    "tls_unique_SLD_count",
    "tls_common_name_count",
    "tls_root_cert_validity_len",
    "tls_leaf_cert_validity_len",
    
    # new
    "tls_chain_len",
    "tls_root_cert_lifetime",
    "tls_leaf_cert_lifetime",
    
    
    # LEX ===============================
    # old (Adam) & still used
    "lex_name_len",
    #"lex_digit_count", <-- abandoned, almost the same as "lex_sub_digit_ratio"
    "lex_has_digit",
    "lex_phishing_keyword_count",
    "lex_vowel_count",
    "lex_underscore_hyphen_count",
    "lex_consecutive_chars",
    "lex_tld_len",
    "lex_sld_len",
    "lex_sub_count",
    "lex_stld_unique_char_count",
    "lex_begins_with_digit",
    "lex_www_flag",
    "lex_sub_max_consonant_len",
    "lex_sub_norm_entropy",
    "lex_sub_digit_count",
    "lex_sub_digit_ratio",
    "lex_sub_consonant_ratio",
    "lex_sub_non_alphanum_ratio",
    "lex_sub_hex_ratio",
    # new
    "lex_sld_norm_entropy", # <-- newly added feature on 24-09-29
    
    # nothing
    
    # RDAP ===============================
    # old (Adam) & still used
    "rdap_registration_period",
    "rdap_has_dnssec",
    
    # new
    "rdap_domain_age",
    "rdap_time_from_last_change",
    "rdap_domain_active_time",
    
    # GEO ===============================
    # old (Adam) & still used
    "geo_countries_count",
    "geo_continent_hash",
    "geo_countries_hash"
]

print("Number of used features:", len(used_features))

df = df[["label", *used_features]]

80


# (Optional) GPU and CUDA initialization
- Normaly svm is not ideal for GPU, but feel free to play with it

In [2]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print("Using: ", torch.cuda.get_device_name(device))

Using:  NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [11]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd


########################################################
# IMPORTANT: Dataset reduction for gridearch purposses #
########################################################
# For real training use fraction 1.0
df_mini = df.sample(frac=0.4, random_state=1)

# dump mini batch to csv
#df_mini.to_csv('mini_batch.csv', index=False)

from sklearn.model_selection import train_test_split


class_map = {"benign_2307:unknown": 0, "misp_2307:phishing": 1}

labels = df_mini['label'].apply(lambda x: class_map[x]) # y vector
features = df_mini.drop('label', axis=1).copy() # X matrix

X_train, X_test, y_train, y_test = train_test_split(
features,
labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels
)

# fill nans with 0 in X_train and X_test and y_train and y_test
    
x_train = X_train.fillna(0)
x_test = X_test.fillna(0)
    
y_train = y_train.fillna(0)
y_test = y_test.fillna(0)
    
# convert x_train to numpy array
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()
    
y_test = y_test.to_numpy()
x_test = x_test.to_numpy()
    
    # Converting False and True to 0 and 1
x_train = np.where(x_train == False, 0, x_train)
x_train = np.where(x_train == True, 1, x_train)
    
x_test = np.where(x_test == False, 0, x_test)
x_test = np.where(x_test == True, 1, x_test)
    


feature_count = x_train.shape[1]
sample_count = x_train.shape[0]

# print number of samples and features
print("Number of samples: ", sample_count)

Number of samples:  150260


# SVM Training core
1. Minmax data scale, (optimal would be do some categorical encoding...)
2. Core SVM training function

In [8]:

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler


# MinMax data scaler
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# anotate function 


'''
    @input kernel: string (rbf, linear, poly, sigmoid)
    @input class_weight: dict or 'balanced'
    @input C: float (default=1.0)
    @input gamma: float (default='scale')
    
    @return accuracy_score: float
    @return f1_score: float
'''
def fit_svm(kernel, class_weight, C, gamma):
    svm = SVC(kernel=kernel, class_weight=class_weight, C=C, gamma=gamma, verbose=False)
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)

    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

# Grid search parameters

To find optimal parameters for SVM, we use grid search.
However, it is very time consuming, so we use only 10% of data for grid search. Or less. Even 1% is enough.

In [9]:
# for rbf kernel, find optimal C and gamma using grid search and use f1 metric for scoring
param_grid = {
    'C': [50, 100, 150, 200, 300, 500, 1000],  # Centering around C=100
    'gamma': [0.1, 0.5, 1, 2, 5],  # Centering around gamma=1
    'kernel': ['rbf']  # Given 'rbf' gave the best result, let's focus on it
}

experimental_grid = {
    'C': [0.1, 1, 10, 50, 100, 150, 200, 500, 1000, 2000, 5000],  # Wide range of C values
    'gamma': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 'scale', 'auto'],  # Wide range of gamma values
    'kernel': ['rbf']  # Given 'rbf' gave the best result, let's focus on it
}

optimal_grid = {
    'C': [35, 40, 45, 50, 55, 60],  # Centering around C=100
    'gamma': [0.8, 1, 1.2, 1.3, 1.4, 1.5],  # Centering around gamma=1
    'kernel': ['rbf']  # Given 'rbf' gave the best result, let's focus on it
}

refined_grid = {
    'C': [48, 49, 50, 51, 52, 53],
    'gamma': [0.95, 0.96, 0.97, 0.98, 0.99, 1, 1.01, 1.02, 1.03, 1.04, 1.05],
    'kernel': ['rbf']
}

exploratory_grid = {
    'C': [0.1, 1, 10, 50, 75, 100, 150, 200, 500, 1000, 2000, 5000],
    'gamma': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 'scale', 'auto'],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

coarse_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['rbf']
}

# Use on your own risk and on very powerful machine
giga_grid = {
    'C': sorted(set([50, 100, 150, 200, 300, 500, 1000] + 
                    [0.1, 1, 10, 50, 100, 150, 200, 500, 1000, 2000, 5000] +
                    [35, 40, 45, 50, 55, 60] +
                    [48, 49, 50, 51, 52, 53] +
                    [0.1, 1, 10, 50, 75, 100, 150, 200, 500, 1000, 2000, 5000] +
                    [0.1, 1, 10, 100, 1000])),
    'gamma': sorted(set([0.1, 0.5, 1, 2, 5] +
                        [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 'scale', 'auto'] +
                        [0.8, 1, 1.2, 1.3, 1.4, 1.5] +
                        [0.95, 0.96, 0.97, 0.98, 0.99, 1, 1.01, 1.02, 1.03, 1.04, 1.05] +
                        [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 'scale', 'auto'] +
                        [0.001, 0.01, 0.1, 1, 10])),
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']  # Including all kernel types as per exploratory_grid
}


# select GRID that you want to use, feel free to add your own
# The default tested grid is optimal_grid
param_grid = optimal_grid


In [10]:
# Do estimation of time to complete grid search
print("Number of combinations: ", len(param_grid['kernel']) * len(param_grid['C']) * len(param_grid['gamma']))


# compute one iteration time

start = time.time()
fit_svm('rbf', 'balanced', 0.001, 0.0001)
end = time.time()

iteration_time = end - start

# total time in minutes
mins_total = (end - start) * len(param_grid['kernel']) * len(param_grid['C']) * len(param_grid['gamma']) / 60




print("Expected time to complete grid search: ", round(mins_total, 2), " minutes")



highest_f1 = 0
highest_params = []

params = []

for kernel in param_grid['kernel']:
    for C in param_grid['C']:
        for gamma in param_grid['gamma']:
            accuracy, f1 = fit_svm(kernel, 'balanced', C, gamma)
            if f1 >= highest_f1:
                highest_f1 = f1
                highest_params = [kernel, C, gamma]
                #save all params, f1 and accuracy to params list
                params.append([kernel, C, gamma, f1, accuracy])
                if(highest_f1 > 0):
                    print("F1 status:", highest_f1, "with accuracy:", accuracy, "and params: ", highest_params)
            print("=========================================")
            
            
print("DONE, highest precision: ", highest_f1, "with accuracy:", accuracy, "and params: ", highest_params)






Number of combinations:  36
Expected time to complete grid search:  0.39  minutes
F1 status: 0.6507936507936507 with accuracy: 0.9531914893617022 and params:  ['rbf', 35, 0.8]
F1 status: 0.6559999999999999 with accuracy: 0.9542553191489361 and params:  ['rbf', 35, 1]
F1 status: 0.6612903225806451 with accuracy: 0.9553191489361702 and params:  ['rbf', 40, 1]
F1 status: 0.6612903225806451 with accuracy: 0.9553191489361702 and params:  ['rbf', 45, 1]
F1 status: 0.6612903225806451 with accuracy: 0.9553191489361702 and params:  ['rbf', 50, 1]
F1 status: 0.6612903225806451 with accuracy: 0.9553191489361702 and params:  ['rbf', 55, 1]
DONE, highest precision:  0.6612903225806451 with accuracy: 0.95 and params:  ['rbf', 55, 1]


In [12]:
# Final SVM model training with best params that you will find using grid search

print(fit_svm('rbf', 'balanced', 50, 1.0))

In [71]:
# Import necessary libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


# Train Naive Bayes model and display results
print("Naive Bayes:")
var_smoothing_option = float(input("Enter var_smoothing option (default is 1e-9): ") or 1e-9)
gnb = GaussianNB(var_smoothing=var_smoothing_option)
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {acc}, F1 Score: {f1}")
print("\n")

# Train Logistic Regression model and display results
print("Logistic Regression:")
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {acc}, F1 Score: {f1}")



Naive Bayes:
Accuracy: 0.5574468085106383, F1 Score: 0.21509433962264152


Logistic Regression:
Accuracy: 0.9372340425531915, F1 Score: 0.28915662650602414
