# Network Traffic Analysis Pipeline

## 0. Setup Environment

0.1. Install all needed dependencies


In [1]:
%pip install pyshark
%pip install nest_asyncio
%pip install pandas
%pip install pymongo
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


0.2. Allow the notebook to work asynchronously.

In [2]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop.
# NOTE(review): presumably required by pyshark's own event-loop usage - confirm.
nest_asyncio.apply()

0.3. Setup global variables

In [3]:
# Seed forwarded to the preprocessor's training/validation split calls.
run_seed = 0

# Share of the data (in percent) requested for each stage: base (offline)
# training, simulated online training, and validation.
(
    base_training_percentage,
    online_training_percentage,
    validation_percentage,
) = (60, 20, 20)

## 1. Data Gathering

1.1. Download datasets

1.2. Store datasets in a local folder

1.3. Select and label the datasets.
For each dataset, add it to the list of datasets in a tuple with the format `(dataset_path, dataset_label)`, where:
- `dataset_path` is the path to the dataset file
- `dataset_label` is the label that should be assigned to the entries; this can be either `normal`, `anomaly` or `unknown` (for unsupervised training)

In [4]:
# Datasets to load, as (dataset_path, dataset_label) tuples, where the label is
# 'normal', 'anomaly' or 'unknown'. While this list is empty the loading cell
# below does not call the preprocessor.
dataset_files = []

## 2. Data Preprocessing

2.1. Import the desired preprocessor.

In [5]:
from src.preprocessors.pcap_preprocessor import PcapPreprocessor

# Single preprocessor instance used by the loading, visualization, training
# and validation cells that follow.
preprocessor = PcapPreprocessor()

2.2. Load and preprocess the datasets.

    This step may depend on the preprocessor used.
    The preprocessor should store and preprocess the data, providing methods to access it.
    For preprocessors that don't keep the data in memory, this step can be skipped after the first execution.


In [6]:
# Load and preprocess the listed datasets. Nothing is loaded when
# `dataset_files` is empty, so this cell is a no-op until datasets are added.
if dataset_files:
    preprocessor.load_datasets(dataset_files)

## 3. Data Visualization

3.1. Gather all the data in a single dataframe for visualization.

In [7]:
# Fetch everything the preprocessor exposes as one dataframe. In the recorded
# run this was 210000 rows x 1204 columns (~1.9 GB), so expect it to be heavy.
all_data = preprocessor.get_all_data()

3.X. Add visualization steps here

In [8]:
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210000 entries, 0 to 209999
Columns: 1204 entries, _id to tls.record_opaque_type
dtypes: float64(498), int64(3), object(703)
memory usage: 1.9+ GB
None


## 4. Model Training

4.1. Select and import the desired models.

In [9]:
# NOTE(review): the recorded output of this cell is
# "ImportError: cannot import name 'OfflineModel' from 'src.models.offline'".
# Confirm the class names / module paths before relying on the cells below.
from src.models.offline import OfflineModel
from src.models.online import OnlineModel

# Models that the training and evaluation cells iterate over, in this order.
model_list = [OfflineModel(), OnlineModel()]

ImportError: cannot import name 'OfflineModel' from 'src.models.offline' (unknown location)

4.2. Gather the data for training.

When training online models, a second dataframe is provided to simulate the online training.

In [10]:
# Ask the preprocessor for the base-training and online-training dataframes,
# sized by the percentages configured in the setup section.
# NOTE(review): the meaning of the positional `True` is not visible here -
# presumably it requests the separate online-training split; confirm and
# consider passing it by keyword.
training_data, online_training_data = preprocessor.get_training_data(
    base_training_percentage,
    online_training_percentage,
    True,
    seed=run_seed,
)

# DataFrame.info() prints its own summary and returns None; calling it
# directly avoids the stray "None" line that print(df.info()) produced.
training_data.info()
online_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126000 entries, 0 to 125999
Columns: 1147 entries, _id to tls.record_size_limit
dtypes: float64(472), int64(3), object(672)
memory usage: 1.1+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 1119 entries, _id to udp._ws_expert_group
dtypes: float64(459), int64(3), object(657)
memory usage: 358.6+ MB
None


4.3. Train the models using the offline data.

In [23]:
# Every model in the list is trained on the base training dataframe; there is
# no online/offline filter here, so online models get this initial fit too.
for model in model_list:
    model.train(training_data)

4.4. Train the online models simulating an online environment.

In [24]:
# Feed the online-training dataframe to each model that reports itself as
# online; every other model is skipped.
for model in model_list:
    if not model.is_online():
        continue
    model.predict_batch(online_training_data)

## 5. Model Evaluation

5.1. Select and import the desired evaluator.

In [None]:
from src.evaluators.standard_evaluator import Evaluator

# Evaluator instance that later scores the predictions dataframe.
evaluator = Evaluator()

5.2. Gather the data for evaluation.

In [None]:
# Request the validation dataframe, using the same seed as the training split.
validation_data = preprocessor.get_validation_data(
    validation_percentage,
    seed=run_seed,
)

5.3. Get the predictions from all models into a dataframe.

In [None]:
# pandas is not imported by any earlier cell, so `pd` would be undefined here.
import pandas as pd

# Build a table with one line per validation sample: the expected label
# followed by the prediction of each model, in `model_list` order.

# Header
header = ['expected'] + [model.get_name() for model in model_list]
validation_results = [header]

# Results
for _, row in validation_data.iterrows():
    # Models must not see the ground-truth label of the sample they predict.
    unlabelled_row = row.drop('label')
    predictions = [model.predict(unlabelled_row) for model in model_list]
    validation_results.append([row['label']] + predictions)

# NOTE(review): the header is stored as the first data row, so the resulting
# columns are numbered 0..N rather than named. Kept as-is because the
# evaluator's expected layout is not visible here; if it expects named
# columns, use pd.DataFrame(validation_results[1:], columns=header) instead.
df = pd.DataFrame(validation_results)

5.4. Evaluate the models using their predictions.

In [None]:
# Score the predictions table: `df` carries the expected label first, followed
# by one entry per model.
results = evaluator.evaluate(df)

5.5. Analyze the results.

In [None]:
# Display whatever the evaluator returned for the models' predictions.
print(results)