# Network Traffic Analysis Pipeline

## 0. Setup Environment

0.1. Install all needed dependencies


In [None]:
# Install the third-party packages used by this notebook and the `src` modules
# it imports. `%pip` is an IPython magic, so this cell only runs inside a
# notebook/IPython kernel, not as plain Python.
%pip install pyshark
%pip install nest_asyncio
%pip install pandas
%pip install pymongo
%pip install python-dotenv

0.2. Allow the notebook to work asynchronously.

In [13]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the pcap tooling starts its own
# loop inside the notebook's loop — confirm.
nest_asyncio.apply()

0.3. Setup global variables

In [14]:
# Seed handed to the preprocessor's split methods (training and validation).
run_seed = 0

# Percentage of the data requested for each stage of the pipeline:
# offline training, simulated online training, and validation.
(
    base_training_percentage,
    online_training_percentage,
    validation_percentage,
) = (60, 20, 20)

## 1. Data Gathering

1.1. Download datasets

1.2. Store datasets in a local folder

1.3. Select and label the datasets.
For each dataset, add it to the list of datasets in a tuple with the format `(dataset_path, dataset_label)`, where:
- `dataset_path` is the path to the dataset file
- `dataset_label` is the label that should be assigned to the entries; this can be `normal`, `anomaly`, or `unknown` (for unsupervised training).

In [15]:
# (dataset_path, dataset_label) tuples to load. While this list is empty,
# the loading cell below skips the call to the preprocessor.
dataset_files = []

## 2. Data Preprocessing

2.1. Import the desired preprocessor.

In [16]:
from src.preprocessors.pcap_preprocessor import PcapPreprocessor

# Single preprocessor instance reused by the loading, visualization,
# training and validation cells below.
preprocessor = PcapPreprocessor()

2.2. Load and preprocess the datasets.

    This step may depend on the preprocessor used. 
    The preprocessor should store and preprocess the data, providing methods to access it.
    In the case of classes that don't store the data in memory, this step can be skipped after the first execution. 


In [17]:
# Only hand datasets to the preprocessor when at least one was listed;
# an empty list leaves the preprocessor untouched.
if dataset_files:
    preprocessor.load_datasets(dataset_files)

## 3. Data Visualization

3.1. Gather all the data in a single dataframe for visualization.

In [18]:
# Everything the preprocessor holds, fetched in one object for the
# visualization cells that follow.
all_data = preprocessor.get_all_data()

3.X. Add visualization steps here

In [None]:
# Assuming `all_data` is a pandas DataFrame: info() writes its summary to
# stdout itself and returns None, so wrapping it in print() only appended a
# stray "None" line to the output.
all_data.info()

## 4. Model Training

4.1. Select and import the desired models.

In [21]:
from src.models.offline import OfflineModel
from src.models.online import OnlineModel

# Models to train and evaluate; every later cell iterates over this list,
# so adding a model here is enough to include it in the whole pipeline.
model_list = [OfflineModel(), OnlineModel()]

4.2. Gather the data for training.

When training online models, a second dataframe is also returned; it is used to simulate online training.

In [None]:
# Fetch the offline training split together with the split used to simulate
# online training.
# NOTE(review): the positional `True` presumably requests the second (online)
# split — confirm against the preprocessor's get_training_data signature.
training_data, online_training_data = preprocessor.get_training_data(
    base_training_percentage,
    online_training_percentage,
    True,
    seed=run_seed,
)

# Assuming both splits are pandas DataFrames: info() prints its own summary
# and returns None, so it is called directly rather than through print(),
# which only added a stray "None" line after each summary.
training_data.info()
online_training_data.info()

4.3. Train the models using the offline data.

In [23]:
# Every model, online or not, is first trained on the offline split.
for model in model_list:
    model.train(training_data)

4.4. Train the online models simulating an online environment.

In [24]:
# Feed the online split to the models that report themselves as online;
# all other models are skipped.
# NOTE(review): presumably online models update themselves while
# predicting — confirm against the model's predict_batch implementation.
for model in model_list:
    if not model.is_online():
        continue
    model.predict_batch(online_training_data)

## 5. Model Evaluation

5.1. Select and import the desired evaluator.

In [None]:
from src.evaluators.standard_evaluator import Evaluator

# Evaluator that later receives the table of expected labels and
# per-model predictions.
evaluator = Evaluator()

5.2. Gather the data for evaluation.

In [None]:
# Validation split, requested with the same seed as the training splits.
validation_data = preprocessor.get_validation_data(validation_percentage,seed=run_seed)

5.3. Get the predictions from all models into a dataframe.

In [None]:
# Imported here so this cell does not rely on another cell having bound `pd`.
import pandas as pd

# Header row: the ground-truth label first, then one column per model name.
header = ['expected'] + [model.get_name() for model in model_list]
validation_results = [header]

# One row per validation sample: its expected label followed by every
# model's prediction for the same sample with the label removed.
for _, row in validation_data.iterrows():
    unlabelled_row = row.drop('label')
    predictions = [model.predict(unlabelled_row) for model in model_list]
    validation_results.append([row['label'], *predictions])

# NOTE(review): the header is stored as the first data row, not as the
# DataFrame's column names; kept as-is because the evaluator's expected
# layout is not visible here — confirm against Evaluator.evaluate.
df = pd.DataFrame(validation_results)

5.4. Evaluate the models using their predictions.

In [None]:
# Hand the expected-vs-predicted table built in the previous step to the evaluator.
results = evaluator.evaluate(df)

5.5. Analyze the results.

In [None]:
# Show whatever the evaluator returned for manual inspection.
print(results)