# Model training

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import precision_score, confusion_matrix, recall_score, accuracy_score, f1_score
from statistics import mean
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from joblib import dump


In [2]:
pd.set_option('display.max_columns', None)

## 1- LOADING PROCESSED DATASET

In [3]:
data_df = pd.read_csv("../../data/processed/conn_log_labeled-processed.csv", index_col=0)

#### See the dataset information

In [4]:
data_df.shape

(23145, 31)

In [5]:
data_df.head()

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,proto_tcp,proto_udp,service_dhcp,service_dns,service_http,service_irc,conn_state_OTH,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_S3,conn_state_SF,history_C,history_D,history_Dd,history_Other,history_S,history_ShAdDaf,history_ShAdDaft,history_ShAdfDr
0,0.628686,0.001238,0.62005,0.0,0.0,0.0,0.000163,2.366458e-06,0.0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.628686,0.001238,0.620022,3.097425e-07,4.7e-05,0.0,5.4e-05,7.888192e-07,0.0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.628686,0.001238,0.620022,3.097425e-07,4.7e-05,0.0,5.4e-05,7.888192e-07,0.0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.628686,0.001238,0.221583,1.972292e-06,0.780758,0.5,0.005097,7.26371e-05,0.08972,0.823184,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.628717,0.001238,0.621946,0.0,0.0,0.0,0.000163,2.366458e-06,0.0,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 2- Let's move to make a model and train it
### a- make the data ready

In [6]:
data_X = data_df.drop("label", axis=1)
data_y = data_df["label"]

In [7]:
data_X.head()

Unnamed: 0,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_tcp,proto_udp,service_dhcp,service_dns,service_http,service_irc,conn_state_OTH,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_S3,conn_state_SF,history_C,history_D,history_Dd,history_Other,history_S,history_ShAdDaf,history_ShAdDaft,history_ShAdfDr
0,0.628686,0.001238,0.62005,0.0,0.0,0.0,0.000163,2.366458e-06,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.628686,0.001238,0.620022,3.097425e-07,4.7e-05,0.0,5.4e-05,7.888192e-07,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.628686,0.001238,0.620022,3.097425e-07,4.7e-05,0.0,5.4e-05,7.888192e-07,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.628686,0.001238,0.221583,1.972292e-06,0.780758,0.5,0.005097,7.26371e-05,0.08972,0.823184,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.628717,0.001238,0.621946,0.0,0.0,0.0,0.000163,2.366458e-06,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
data_y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

### b- Initializing classification models

In [9]:
classifiers = [("Decision Tree", DecisionTreeClassifier()),("Logistic Regression", LogisticRegression()),("Random Forest", RandomForestClassifier()),("K-Nearest Neighbors", KNeighborsClassifier())]

Cross validation technique :
- In order to obtain better representative results of the performance of each model across several iterations, we use cross-validation instead of the regular train/test split.
- Since we are dealing with imbalanced class distributions, we implement a Stratified K-Folds cross-validator instead of the random KFold sampling. This is useful to preserve the percentage of both labels in each fold.  

In [10]:
#here we choose to have 5 splits (folds) for our cross validation problem
skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

### c- Let's train

In [11]:
print("Model Training Started!")
# the results summary
classification_results = pd.DataFrame(index=[c[0] for c in classifiers], columns=["Accuracy", "TN", "FP", "FN", "TP", "Recall", "Precision", "F1"])
# Iterate over the estimators
for estimator_name, estimator_object in classifiers:
    print(f"---- [{estimator_name}]: working ...")
    # Initialize the results for each classifier
    accuracy_scores = []
    confusion_matrices = []
    recall_scores = []
    precision_scores = []
    f1_scores = []
    # Initialize best model object to be saved
    models_path = "../../models"
    best_model = None
    best_f1 = -1
    # Iterate over the obtained folds
    for train_i, test_i in skf_cv.split(data_X, data_y):
        '''
        Cross-validation is a technique used to evaluate the performance of a machine learning model.
        It involves splitting the dataset into multiple subsets (folds) and then training and evaluating the model multiple times,
        each time using a different subset as the test set and the remaining subsets as the training set.
        This helps in assessing how the model generalizes to an independent dataset.
        '''
        #split the data into training and test data
        X_train, X_test = data_X.iloc[train_i], data_X.iloc[test_i]
        y_train, y_test = data_y.iloc[train_i], data_y.iloc[test_i]
        # Train the model
        estimator_object.fit(X_train.values, y_train.values)
        # Predict the test samples
        y_pred = estimator_object.predict(X_test.values)      
        # Calculate and register accuracy metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        est_f1_score = f1_score(y_test, y_pred)
        f1_scores.append(est_f1_score)
        # Compare with best performing model
        if best_f1 < est_f1_score:
            best_model = estimator_object
            best_f1 = est_f1_score
    print("accuracy_scores = ",accuracy_score(y_test, y_pred))
    # Summarize everything
    tn, fp, fn, tp = sum(confusion_matrices).ravel()
    classification_results.loc[estimator_name] = [mean(accuracy_scores),tn,fp,fn,tp,mean(recall_scores),mean(precision_scores),mean(f1_scores)]
    # Save the best performing model
    if best_model:
        model_name = estimator_name.replace(' ', '_').replace('-', '_').lower()
        model_file = model_name + ".pkl"
        dump(best_model, models_path + "/" + estimator_name.replace(' ', '_').replace('-', '_').lower()+ ".pkl")
    
print("Model Training Finished!")   
    

Model Training Started!
---- [Decision Tree]: working ...
accuracy_scores =  1.0
---- [Logistic Regression]: working ...
accuracy_scores =  0.9948152948801037
---- [Random Forest]: working ...
accuracy_scores =  0.9997839706200043
---- [K-Nearest Neighbors]: working ...
accuracy_scores =  0.9976236768200475
Model Training Finished!


In [12]:
# Check the results
classification_results

Unnamed: 0,Accuracy,TN,FP,FN,TP,Recall,Precision,F1
Decision Tree,0.999914,1923,0,2,21220,0.999906,1.0,0.999953
Logistic Regression,0.994815,1832,91,29,21193,0.998633,0.995725,0.997177
Random Forest,0.999827,1923,0,4,21218,0.999812,1.0,0.999906
K-Nearest Neighbors,0.99771,1880,43,10,21212,0.999529,0.997977,0.998752


### So here we notice that all the models have high accuracy but the best one is "Decision Tree" and "Random Forest" , well anything related to nature :p