# Network Traffic Analysis Pipeline

## 0. Setup Environment

In [1]:
%pip install pyshark
%pip install nest_asyncio
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Allow asynchronous code to run inside the notebook.

In [2]:
import nest_asyncio
# Patch asyncio so that an event loop can be started while another one is already running.
# NOTE(review): presumably needed because the pcap loading below starts its own loop
# inside the notebook's running loop — confirm.
nest_asyncio.apply()

## 1. Data Gathering

In [3]:
# Capture files handed to the preprocessor's loader; one path per entry.
dataset_files = [
    'data/2024-10-23-Redline-Stealer-infection-traffic.pcap',
]

## 2. Data Preprocessing

1. Import the desired preprocessor.

In [4]:
from src.preprocessors.pcap_preprocessor import PcapPreprocessor

# One preprocessor instance is reused by the load, preprocess and split cells below.
preprocessor = PcapPreprocessor()

2. Load the datasets into a dataframe.

In [5]:
# Load every capture listed in `dataset_files` through the preprocessor.
base_data = preprocessor.load_datasets(dataset_files)
# NOTE(review): the recorded run of this cell emitted per-packet JSON and was stopped
# with a KeyboardInterrupt. It is unclear from here whether that output came from this
# print or from the loader itself; consider showing only a small preview once the type
# of `base_data` is confirmed.
print(base_data)

{
    "eth": {
        "dst": "20:e5:2a:b6:93:f1",
        "dst_resolved": "20:e5:2a:b6:93:f1",
        "dst_oui": 2155818,
        "dst_oui_resolved": "Netgear",
        "dst_lg": false,
        "dst_ig": false,
        "addr": "20:e5:2a:b6:93:f1",
        "addr_resolved": "20:e5:2a:b6:93:f1",
        "addr_oui": 2155818,
        "addr_oui_resolved": "Netgear",
        "lg": false,
        "ig": false,
        "src": "00:08:02:1c:47:ae",
        "src_resolved": "00:08:02:1c:47:ae",
        "src_oui": 2050,
        "src_oui_resolved": "Hewlett Packard",
        "src_lg": false,
        "src_ig": false,
        "type": "0x0800",
        "stream": 0
    },
    "ip": {
        "version": 4,
        "hdr_len": 20,
        "dsfield": "0x00",
        "dsfield_dscp": 0,
        "dsfield_ecn": 0,
        "len": 52,
        "id": "0x534c",
        "flags": "0x02",
        "flags_rb": false,
        "flags_df": true,
        "flags_mf": false,
        "frag_offset": 0,
        "ttl": 128,
      

KeyboardInterrupt: 

3. Preprocess the dataframe.

In [None]:
# Keep the result: the split cell below reads `processed_data`, which was never
# assigned anywhere, so a fresh "Restart & Run All" failed with a NameError.
# NOTE(review): assumes preprocess_dataframe returns the processed frame rather than
# mutating `base_data` in place — confirm against PcapPreprocessor.
processed_data = preprocessor.preprocess_dataframe(base_data)

4. Partition the dataframe into base-training, online-training, and validation sets.

In [None]:
# Fraction of the processed data passed to the splitter for each partition.
base_training_percentage = 0.6
online_training_percentage = 0.2
validation_percentage = 0.2

# A single split call; its result is unpacked into the three partitions used by
# the training and evaluation cells.
training_data, online_training_data, validation_data = preprocessor.split_dataframe(
    processed_data,
    base_training_percentage,
    online_training_percentage,
    validation_percentage,
)

## 3. Model Training

1. Select the desired models.

In [None]:
from src.models.offline import OfflineModel
from src.models.online import OnlineModel

# Instantiate each selected model class once, offline first, then online.
model_list = [model_class() for model_class in (OfflineModel, OnlineModel)]

2. Train the models according to their type.

In [None]:
# Every model is trained on the base training partition; models that report
# themselves as online are additionally run over the online-training partition.
for model in model_list:
    model.train(training_data)
    if not model.is_online():
        continue
    model.predict_batch(online_training_data)

## 4. Model Evaluation

1. Get the predictions from the models into a dataframe.

In [None]:
# `pd` is used at the end of this cell but was never imported anywhere in the
# notebook, so the cell raised a NameError on a fresh kernel.
import pandas as pd

# Header: the expected label first, then one column per model, in model_list order.
header = ['expected'] + [model.get_name() for model in model_list]
validation_results = [header]

# Results: one line per validation row, holding the row's label followed by each
# model's prediction for that row.
for index, row in validation_data.iterrows():
    line = [row['label']]
    for model in model_list:
        line.append(model.predict(row))
    validation_results.append(line)

# NOTE(review): the header is stored as the first data row, so the frame gets integer
# column labels. The layout is left unchanged because the evaluator's expectations are
# not visible here; if it wants named columns, switch to
# pd.DataFrame(validation_results[1:], columns=header).
df = pd.DataFrame(validation_results)

2. Select the desired evaluator.

In [None]:
from src.evaluators.standard_evaluator import Evaluator

# Evaluator instance used by the next cell to score the predictions frame.
evaluator = Evaluator()

3. Evaluate the models using the predictions.

In [None]:
# Score the predictions frame `df` built in the "Model Evaluation" step 1 cell.
results = evaluator.evaluate(df)

4. Analyze the results.

In [None]:
# Plain-text dump of whatever the evaluator returned.
# NOTE(review): the type of `results` is not visible here; if it is a DataFrame,
# a bare `results` as the last line would give a richer rendering.
print(results)