# Network Traffic Analysis Pipeline

## 0. Setup Environment

0.1. Install all needed dependencies


In [1]:
# Install the third-party packages for this pipeline via the %pip magic, one package per call.
# Only nest_asyncio and pandas are imported directly in this notebook; the others are
# presumably required by the project modules under src/ — confirm.
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases.
%pip install pyshark
%pip install nest_asyncio
%pip install pandas
%pip install pymongo
%pip install python-dotenv
%pip install scikit-learn
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


0.2. Allow the notebook to work asynchronously.

In [2]:
import nest_asyncio
# Apply the nest_asyncio patch once, before any other cell runs.
# NOTE(review): presumably needed so asyncio-based code used by the project (e.g. pyshark)
# can run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

0.3. Setup global variables

In [3]:
import time
import sys

# Make the parent directory importable so the `src.*` imports in later cells resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

# Seed passed to the training and validation split calls below. It is derived from the
# clock, so it changes on every run; it is printed so a given run can be reproduced by
# hard-coding the reported value here.
run_seed=int(time.time())
print(f"run_seed = {run_seed}")

# Share of the data (in percent) handed to each stage of the pipeline.
base_training_percentage = 60
online_training_percentage = 20
validation_percentage = 20

# The three shares are requested from the same preprocessor, so together they cannot
# exceed the whole dataset. Fail here rather than deep inside a split call.
if base_training_percentage + online_training_percentage + validation_percentage > 100:
    raise ValueError(
        "base_training_percentage + online_training_percentage + "
        "validation_percentage must not exceed 100"
    )

## 1. Data Gathering

1.1. Download datasets

1.2. Store datasets in a local folder

1.3. Select and label the datasets.
For each dataset, add it to the list of datasets in a tuple with the format `(dataset_path, dataset_label)`, where:
- `dataset_path` is the path to the dataset file
- `dataset_label` is the label that should be assigned to the entries; this can be either `normal`, `anomaly` or `unknown` (for unsupervised training)

In [4]:
# Datasets to ingest, as (dataset_path, dataset_label) tuples.
# Left empty here; the loading cell below skips ingestion when this list is empty.
dataset_files = []

## 2. Data Preprocessing

2.1. Import the desired preprocessor.

In [5]:
from src.preprocessors.pcap_preprocessor import PcapPreprocessor

# Single preprocessor instance shared by the loading, visualization, training and
# validation cells below.
preprocessor = PcapPreprocessor()

2.2. Load and preprocess the datasets.

    This step may depend on the preprocessor used. 
    The preprocessor should store the data and preprocess the data, providing methods to access it.
    In the case of classes that don't store the data in memory, this step can be skipped after the first execution. 


In [6]:
# Only ingest when at least one (path, label) entry has been registered;
# an empty list means there is nothing new to load.
if dataset_files:
    preprocessor.load_datasets(dataset_files)

## 3. Data Visualization

3.1. Gather all the data in a single dataframe for visualization.

In [7]:
import pandas as pd
# Fetch everything the preprocessor holds into a single object for inspection.
# NOTE(review): presumably a pandas DataFrame (a later cell calls .info() and .columns on it) — confirm.
all_data = preprocessor.get_all_data()

3.X. Add visualization steps here

In [8]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
all_data.info()

# List every column name, one per line.
for col in all_data.columns:
    print(col)

## 4. Model Training

4.1. Select and import the desired models.

In [9]:
from src.models.offline.offline1 import Offline_RandomForest
from src.models.online.online1 import Online_RandomForest


# Models to train and evaluate. The order matters: the prediction table built in the
# evaluation section adds one column per model in this order.
model_list = [Offline_RandomForest(), Online_RandomForest()]

Model list:  [<src.models.offline.offline1.Offline_RandomForest object at 0x77942cfe3ec0>, <src.models.online.online1.Online_RandomForest object at 0x7793f6ac3980>]


4.2. Gather the data for training.

When training online models, a second dataframe is provided to simulate the online training.

In [10]:
# Request the base (offline) split and the online-simulation split in one call, using
# the shared run seed.
# NOTE(review): the meaning of the positional `True` is not visible here — confirm
# against PcapPreprocessor.get_training_data and consider passing it by keyword.
training_data,online_training_data = preprocessor.get_training_data(base_training_percentage, online_training_percentage, True, seed=run_seed)

# DataFrame.info() prints its own summary and returns None; calling it directly avoids
# the stray "None" line that print(df.info()) emits after each summary.
training_data.info()
online_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Columns: 1391 entries, _id to arp.isannouncement
dtypes: float64(555), object(836)
memory usage: 1.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38000 entries, 0 to 37999
Columns: 1323 entries, _id to dhcpv6.requested_option_code
dtypes: float64(533), object(790)
memory usage: 383.6+ MB
None


4.3. Train the models using the offline data.

In [11]:
# Fit every model in model_list (offline and online alike) on the base training split.
for model in model_list:
    model.train(training_data)

Validation Results:
              precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00     10920
      normal       1.00      1.00      1.00     11880

    accuracy                           1.00     22800
   macro avg       1.00      1.00      1.00     22800
weighted avg       1.00      1.00      1.00     22800



4.4. Train the online models simulating an online environment.

In [12]:
# Replay the online split one row at a time through every model that reports itself as online.
# NOTE(review): only predict() is called here — presumably online models update themselves
# inside predict(); confirm, otherwise no training happens in this step.
# NOTE(review): iterrows() builds a Series per row, which is slow on a frame of this size
# (the saved run of this cell ended in a KeyboardInterrupt).
for model in model_list:
    if model.is_online():
        for index, row in online_training_data.iterrows():
            model.predict(row)

KeyboardInterrupt: 

## 5. Model Evaluation

5.1. Select and import the desired evaluator.

In [13]:
from src.evaluators.standard_evaluator import Evaluator

# Evaluator instance used below to score the table of model predictions.
evaluator = Evaluator()

5.2. Gather the data for evaluation.

In [20]:
# Request the validation split with the same seed that was used for the training splits.
# NOTE(review): presumably the shared seed keeps validation rows disjoint from the
# training rows — confirm in the preprocessor.
validation_data = preprocessor.get_validation_data(validation_percentage,seed=run_seed)

5.3. Get the predictions from all models into a dataframe.

In [21]:
# Header row: ground-truth column first, then one column per model, named by model id.
header = ['expected'] + [model.id for model in model_list]

# Separate labels from features WITHOUT rebinding `validation_data`, so this cell can be
# re-run: dropping 'label' from an already-stripped frame would raise KeyError.
labels = validation_data['label']
validation_features = validation_data.drop(columns=['label'])

# One batch of predictions per model, in model_list order.
model_results = [model.predict_batch(validation_features) for model in model_list]

# Read the labels by position. `labels[i]` is a lookup by index *label* and only works
# when the frame happens to carry a 0..n-1 index.
expected = labels.tolist()

# A model that returns fewer predictions than there are labels cannot be aligned
# row-by-row; report it explicitly instead of failing part-way through the table.
for model, model_result in zip(model_list, model_results):
    if len(model_result) < len(expected):
        raise ValueError(
            f"model {model.id} returned {len(model_result)} predictions "
            f"for {len(expected)} validation rows"
        )

# One table row per validation entry: [expected, prediction of model 1, prediction of model 2, ...].
validation_results = [header]
validation_results.extend(list(row) for row in zip(expected, *model_results))

# The header is deliberately kept as the first data row (columns stay 0..k) because that
# is the layout this notebook has been passing to evaluator.evaluate().
df = pd.DataFrame(validation_results)
print(df)

             0         1        2
0     expected  Offline1  Online1
1      anomaly    normal  anomaly
2      anomaly    normal  anomaly
3      anomaly    normal  anomaly
4      anomaly    normal  anomaly
...        ...       ...      ...
1996   anomaly   anomaly  anomaly
1997   anomaly   anomaly  anomaly
1998   anomaly   anomaly  anomaly
1999   anomaly   anomaly  anomaly
2000   anomaly   anomaly  anomaly

[2001 rows x 3 columns]


5.4. Evaluate the models using their predictions.

In [22]:
# Score every model's predictions against the expected labels.
# NOTE(review): `df` carries the column names in its first row rather than as real
# column labels; evaluate() presumably expects that layout — confirm.
results = evaluator.evaluate(df)

5.5. Analyze the results.

In [23]:
# Show the metrics returned by the evaluator (the saved output has one accuracy row per model column).
print(results)

   accuracy
1   0.31934
2   0.99950
