## 1.1 Install Missed Packages With pip

In [4]:
import sys
import subprocess
import ensurepip

# Ensure pip exists and install required packages if missing
import importlib.util

# 1) Ensure pip is available
try:
    import pip  # noqa: F401
except Exception:
    try:
        ensurepip.bootstrap()
    except Exception:
        # fallback to get-pip.py
        import urllib.request, tempfile, os
        url = "https://bootstrap.pypa.io/get-pip.py"
        fd, tmp_path = tempfile.mkstemp(suffix=".py")
        os.close(fd)
        try:
            urllib.request.urlretrieve(url, tmp_path)
            subprocess.check_call([sys.executable, tmp_path])
        finally:
            try:
                os.remove(tmp_path)
            except Exception:
                pass

# 2) List of packages to ensure (pip-name -> import-name)
required = {
    "pandas": "pandas",
    "numpy": "numpy",
    "matplotlib": "matplotlib",
    "seaborn": "seaborn",
    "scikit-learn": "sklearn",
    "scipy": "scipy",
}

# 3) Detect missing modules
missing = [pkg for pkg, mod in required.items() if importlib.util.find_spec(mod) is None]

# 4) Install missing packages in one pip call
if missing:
    print("Installing missing packages:", missing)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade"] + missing)

# 5) Report installed versions (or failures)
for pkg, mod in required.items():
    try:
        module = __import__(mod)
        version = getattr(module, "__version__", "unknown")
        print(f"{pkg} ({mod}) version: {version}")
    except Exception:
        print(f"Failed to import {mod}")

pandas (pandas) version: 2.3.3
numpy (numpy) version: 1.26.4
matplotlib (matplotlib) version: 3.6.3
seaborn (seaborn) version: 0.13.2
scikit-learn (sklearn) version: 1.7.2
scipy (scipy) version: 1.11.4


In [3]:
# Retry installation with --break-system-packages flag if needed
if missing:
    print("Retrying installation with --break-system-packages...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "--break-system-packages"] + missing)

Retrying installation with --break-system-packages...
Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Using cached scikit_learn-1.7.2-cp312-cp

## 1.2 Import Necessary libraries

In [14]:
# Importing the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from scipy import stats

## 1.3 Helper Functions

In [15]:
def mount_google_drive():
    """
    Checks if running in Google Colab and mounts Google Drive if so.
    Returns the base path to the user's Drive content if successful, otherwise None.
    """
    try:
        from google.colab import drive
        print("Google Colab environment detected. Mounting Google Drive...")
        drive.mount('/content/drive')
        # Return the common base path for Google Drive
        return '/content/drive/MyDrive'
    except ImportError:
        # This will be the case on your local machine
        print("Not running in a Google Colab environment. Skipping Google Drive mount.")
        return None
    except Exception as e:
        print(f"An error occurred while mounting Google Drive: {e}")
        return None

## 2 Loading the dataset

In [16]:
# List to store DataFrames
dfs = []

# --- Define Paths ---
local_data_path = './data'  # The local directory to check first
# IMPORTANT: Update this path to your dataset's location in Google Drive
google_drive_folder = 'CIC-IDS-2017'  # The specific folder name in your MyDrive

def load_data_from_path(path, df_list):
    """Walks a directory and loads all found .csv files into a list of DataFrames."""
    if not (os.path.exists(path) and os.path.isdir(path)):
        print(f"Data directory not found at: {path}")
        return
    
    print(f"Searching for .csv files in: {path}")
    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith('.csv'):
                file_path = os.path.join(dirname, filename)
                print(f"  - Reading file: {file_path}")
                try:
                    df_list.append(pd.read_csv(file_path))
                except Exception as e:
                    print(f"    - Error reading {file_path}: {e}")

# --- 1. Attempt to load from local path ---
load_data_from_path(local_data_path, dfs)

# --- 2. If no local data, attempt to load from Google Drive ---
if not dfs:
    print("\nNo local data loaded. Attempting to use Google Drive.")
    drive_base_path = mount_google_drive()
    if drive_base_path:
        # Construct the full path to your data on Google Drive
        gdrive_data_path = os.path.join(drive_base_path, google_drive_folder)
        load_data_from_path(gdrive_data_path, dfs)

# --- 3. Final check ---
if not dfs:
    print("\nWarning: No data could be loaded. The 'dfs' list is empty.")
else:
    print(f"\nSuccessfully loaded {len(dfs)} DataFrame(s).")

Searching for .csv files in: ./data
  - Reading file: ./data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
  - Reading file: ./data/Friday-WorkingHours-Morning.pcap_ISCX.csv
  - Reading file: ./data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
  - Reading file: ./data/Wednesday-workingHours.pcap_ISCX.csv
  - Reading file: ./data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
  - Reading file: ./data/Monday-WorkingHours.pcap_ISCX.csv
  - Reading file: ./data/Tuesday-WorkingHours.pcap_ISCX.csv
  - Reading file: ./data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv

Successfully loaded 8 DataFrame(s).


In [17]:
# Data dimensions of each individual dataset
for i, data in enumerate(dfs, start=1):
    rows, cols = data.shape
    print(f'df{i} -> {rows} rows, {cols} columns')

df1 -> 288602 rows, 79 columns
df2 -> 191033 rows, 79 columns
df3 -> 170366 rows, 79 columns
df4 -> 692703 rows, 79 columns
df5 -> 225745 rows, 79 columns
df6 -> 529918 rows, 79 columns
df7 -> 445909 rows, 79 columns
df8 -> 286467 rows, 79 columns


## 1.2. Merging the DataFrames

In [18]:
# It's good practice to check if the list is populated before concatenation.
if dfs:
    # Concatenate all DataFrames into a single DataFrame
    data = pd.concat(dfs, axis=0, ignore_index=True)
    print("DataFrames concatenated successfully.")

    # Deleting the list of DataFrames to free up memory
    del dfs
else:
    # Handle the case where no DataFrames were created
    print("Warning: No data was found to concatenate. The 'data' DataFrame has not be created.")
    data = pd.DataFrame() # Optionally, create an empty DataFrame


DataFrames concatenated successfully.


In [19]:
# Display the first few rows
data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,166,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,60148,83,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,123,99947,1,1,48,48,48,48,48.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,123,37017,1,1,48,48,48,48,48.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,0,111161336,147,0,0,0,0,0,0.0,0.0,...,0,1753752.625,2123197.578,4822992,95,9463032.7,2657727.996,13600000,5700287,BENIGN


In [20]:
# Display 10 random rows
data.sample(n=10, random_state=42)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
746827,80,217872,3,7,382,11595,382,0,127.333333,220.547803,...,32,0.0,0.0,0,0,0.0,0.0,0,0,DoS Hulk
946912,80,84743121,5,7,355,11595,343,0,71.0,152.082215,...,20,11013.0,0.0,11013,11013,84600000.0,0.0,84600000,84600000,DoS Hulk
2216843,53,2714807,6,4,258,416,45,42,43.0,1.549193,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
699389,443,11053475,21,22,18983,8109,2389,0,903.952381,1125.348945,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1170268,80,15220,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
800686,80,99394898,8,6,339,11595,333,0,42.375,117.449001,...,20,1945.0,0.0,1945,1945,99200000.0,0.0,99200000,99200000,DoS Hulk
1434488,80,109263,3,4,26,11601,20,0,8.666667,10.263203,...,20,0.0,0.0,0,0,0.0,0.0,0,0,DDoS
1968368,53,69709,4,4,168,768,42,42,42.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
934343,80,83027396,10,6,311,11595,311,0,31.1,98.346835,...,32,1984.0,0.0,1984,1984,82900000.0,0.0,82900000,82900000,DoS Hulk
693547,80,61462028,5,5,231,4559,213,0,46.2,93.280223,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [21]:
# Get dataset dimensions
print(f"Dataset Dimensions: {data.shape}")

Dataset Dimensions: (2830743, 79)


In [22]:
# Display data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 79 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0    Destination Port             int64  
 1    Flow Duration                int64  
 2    Total Fwd Packets            int64  
 3    Total Backward Packets       int64  
 4   Total Length of Fwd Packets   int64  
 5    Total Length of Bwd Packets  int64  
 6    Fwd Packet Length Max        int64  
 7    Fwd Packet Length Min        int64  
 8    Fwd Packet Length Mean       float64
 9    Fwd Packet Length Std        float64
 10  Bwd Packet Length Max         int64  
 11   Bwd Packet Length Min        int64  
 12   Bwd Packet Length Mean       float64
 13   Bwd Packet Length Std        float64
 14  Flow Bytes/s                  float64
 15   Flow Packets/s               float64
 16   Flow IAT Mean                float64
 17   Flow IAT Std                 float64
 18   Flow IAT Max         

By checking the data types, it is safe to assume the dataset countains only metric features, considering 'Label' as 'y' (the only categorical column).

In [23]:
# Checking for missing values
missing_values = data.isna().sum()
missing_percentage = (missing_values / len(data)) * 100

# Printing columns with missing values
for column, count in missing_values.items():
    if count != 0:
        print(f"Column '{column}' has {count} missing values, which is {missing_percentage[column]:.2f}% of the total")

Column 'Flow Bytes/s' has 1358 missing values, which is 0.05% of the total


In [24]:
# Checking and counting duplicates
duplicates = data.duplicated()
duplicate_count = duplicates.sum()

# Output results
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 308381


## 3. Data Cleaning

In [25]:
# Removal of leading/trailing whitespace
col_names = {col: col.strip() for col in data.columns}
data.rename(columns = col_names, inplace = True)