# Network Traffic Analysis Pipeline

## 0. Setup Environment

In [1]:
%pip install pyshark
%pip install nest_asyncio
%pip install pandas
%pip install pymongo
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Allow the notebook to work asynchronously (`nest_asyncio` lets nested event loops run inside the notebook's already-running loop).

In [2]:
import nest_asyncio

# Patch asyncio so nested event loops can run inside this notebook.
nest_asyncio.apply()

# Seed passed as `seed=` to the data-partitioning calls later in the notebook.
run_seed = 0

## 1. Data Gathering

In [3]:
# Each entry is a (pcap path, label) pair; the label is 'allow' or 'deny'.
dataset_files = [
    ('data/allow/normal_1.pcap', 'allow'),
    ('data/deny/normal_DDoS_1.pcap', 'deny'),
    ('data/deny/injection_normal1.pcap', 'deny'),
]

## 2. Data Preprocessing

1. Import the desired preprocessor.

In [4]:
from src.preprocessors.pcap_preprocessor import PcapPreprocessor

# Single preprocessor instance reused by the loading, preprocessing and
# partitioning cells that follow.
preprocessor = PcapPreprocessor()

2. Load the datasets into a dataframe.

In [5]:
# Hand the (pcap path, label) pairs to the preprocessor for loading.
preprocessor.load_datasets(dataset_files)

3. Preprocess the dataframe.

In [5]:
# Run the preprocessing step on the loaded data; the recorded cell output
# shows this call returning True.
preprocessor.preprocess_dataframe()

True

4. Partition the dataframe into base-training, online-training, and validation sets.

In [7]:
# Split sizes, expressed as percentages of the preprocessed dataset.
base_training_percentage = 60
online_training_percentage = 20
validation_percentage = 20

# NOTE(review): the meaning of the positional `True` flag is defined by
# PcapPreprocessor.get_training_data -- confirm before changing it.
training_data, online_training_data = preprocessor.get_training_data(
    base_training_percentage,
    online_training_percentage,
    True,
    seed=run_seed,
)
validation_data, labels = preprocessor.get_validation_data(
    validation_percentage,
    seed=run_seed,
)

In [10]:
# DataFrame.info() / Series.info() write their summary to stdout themselves
# and return None, so wrapping them in print() only adds a stray "None" line
# after every summary. Call them directly instead.
training_data.info()
online_training_data.info()
validation_data.info()
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Columns: 936 entries, _id to dns.unsolicited
dtypes: float64(375), int64(3), object(558)
memory usage: 128.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 18000 to 23999
Columns: 936 entries, _id to dns.unsolicited
dtypes: float64(375), int64(3), object(558)
memory usage: 42.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 24000 to 29999
Columns: 935 entries, _id to dns.unsolicited
dtypes: float64(375), int64(3), object(557)
memory usage: 42.8+ MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 6000 entries, 24000 to 29999
Series name: label
Non-Null Count  Dtype 
--------------  ----- 
6000 non-null   object
dtypes: object(1)
memory usage: 47.0+ KB
None
                            _id                           dataset  \
24000  67351a3309d842ac6bb9ab2d          data/allow/normal_1.pcap   
24001  67351a3309d842ac6bb9c147          data/allow/normal_1.pcap

## 3. Model Training

1. Select the desired models.

In [None]:
from src.models.offline import OfflineModel
from src.models.online import OnlineModel

# One instance per model under comparison; later cells iterate this list in
# order for training and prediction.
model_list = [
    OfflineModel(),
    OnlineModel(),
]

2. Train the models according to their type.

In [None]:
# Every model is trained on the base training split; models that report
# themselves as online are then also run over the online-training split.
for model in model_list:
    model.train(training_data)
    if not model.is_online():
        continue
    model.predict_batch(online_training_data)

## 4. Model Evaluation

1. Get the predictions from the models into a dataframe.

In [None]:
# pandas is needed for the final DataFrame and is not imported by any earlier
# cell, so bring it into scope here.
import pandas as pd

# Header row: the expected label followed by one column per model.
header = ['expected'] + [model.get_name() for model in model_list]
validation_results = [header]

# One result row per validation sample. The expected label comes from the
# `labels` series returned alongside `validation_data` by
# get_validation_data(): the recorded partition summary shows the validation
# frame with one column fewer than the training frames, so row['label'] is
# not a reliable lookup there. Both objects share the same index range, so
# iterating them in lockstep pairs each row with its own label.
for (index, row), expected in zip(validation_data.iterrows(), labels):
    line = [expected] + [model.predict(row) for model in model_list]
    validation_results.append(line)

# NOTE(review): the header is stored as the first data row, not as column
# names; this layout is kept unchanged because the evaluator consumes `df`
# as-is -- confirm what Evaluator.evaluate expects.
df = pd.DataFrame(validation_results)

2. Select the desired evaluator.

In [None]:
from src.evaluators.standard_evaluator import Evaluator

# Evaluator instance used below to score the prediction dataframe.
evaluator = Evaluator()

3. Evaluate the models using the predictions.

In [None]:
# Score the models from the predictions collected in `df`.
results = evaluator.evaluate(df)

4. Analyze the results.

In [None]:
# Display the evaluation results returned by the evaluator.
print(results)