In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Pandas display: show every column, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Silence all warning categories for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/numpy.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn white-grid theme and a 12x6 inch figure
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(12, 6))

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Read the cleaned CICIDS2017 CSV into memory.
# The relative path is resolved from the notebooks/ folder.
df = pd.read_csv('../data/raw/cicids2017_cleaned.csv')

# Report the dimensions of the loaded frame
row_total, column_total = df.shape
print("Dataset loaded successfully!")
print(f"\nDataset Shape: {df.shape}")
print(f"Total Rows: {row_total:,}")
print(f"Total Columns: {column_total}")

Dataset loaded successfully!

Dataset Shape: (2520751, 53)
Total Rows: 2,520,751
Total Columns: 53


In [5]:
# Print a header, a separator rule, and the full list of column names
column_names = df.columns.tolist()
print("Column Names:", "=" * 50, column_names, sep="\n")

Column Names:
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Length of Fwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'Average Packet Size', 'Subflow Fwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Max', 'Active Min', 'Idle Mean', '

In [6]:
# Per-column report of dtype, missing-value count and missing-value percentage
print("Data Types and Missing Values:")
print("="*50)

# Count nulls once and reuse the result: every df.isnull() call scans the
# whole frame, so repeating it for each derived figure is wasted work.
missing_per_column = df.isnull().sum()

info_df = pd.DataFrame({
    'Column': df.columns,
    'Data Type': df.dtypes.values,
    'Missing Values': missing_per_column.values,
    # Share of rows that are null in each column, as a percentage
    'Missing %': (missing_per_column.values / len(df) * 100).round(2)
})

# to_string() prints every row of the report regardless of display.max_rows
print(info_df.to_string(index=False))
print(f"\nTotal Missing Values: {missing_per_column.sum()}")

Data Types and Missing Values:
                     Column Data Type  Missing Values  Missing %
           Destination Port     int64               0        0.0
              Flow Duration     int64               0        0.0
          Total Fwd Packets     int64               0        0.0
Total Length of Fwd Packets     int64               0        0.0
      Fwd Packet Length Max     int64               0        0.0
      Fwd Packet Length Min     int64               0        0.0
     Fwd Packet Length Mean   float64               0        0.0
      Fwd Packet Length Std   float64               0        0.0
      Bwd Packet Length Max     int64               0        0.0
      Bwd Packet Length Min     int64               0        0.0
     Bwd Packet Length Mean   float64               0        0.0
      Bwd Packet Length Std   float64               0        0.0
               Flow Bytes/s   float64               0        0.0
             Flow Packets/s   float64               0      

In [9]:
# DIAGNOSTIC: locate the column(s) that might contain the class labels
print("Searching for potential label columns...")
print("="*50)

# Distinct-value count of every column, computed in one pass and reused by
# both checks below instead of calling nunique() repeatedly per column.
unique_counts = df.nunique()

# Check last 5 columns (labels usually at the end)
print("\nLast 5 columns in dataset:")
print(df.columns[-5:].tolist())

# Exact names (case- and whitespace-sensitive) commonly used for label columns
possible_label_cols = ['Label', 'label', 'Label ', ' Label', 'class', 'target', 'attack_cat', 'attack_type']

print("\nChecking for common label column names:")
for col in possible_label_cols:
    if col in df.columns:
        print(f"✓ Found: '{col}'")
        print(f"  Sample values: {df[col].unique()[:5]}")
        print(f"  Total unique values: {unique_counts[col]}")

# Columns with only a few distinct values are likely categorical, which makes
# them label candidates even when their name is not in the list above.
print("\nColumns with less than 20 unique values (potential labels):")
for col, n_unique in unique_counts.items():
    if n_unique < 20:
        print(f"  - {col}: {n_unique} unique values")

Searching for potential label columns...

Last 5 columns in dataset:
['Active Min', 'Idle Mean', 'Idle Max', 'Idle Min', 'Attack Type']

Checking for common label column names:

Columns with less than 20 unique values (potential labels):
  - FIN Flag Count: 2 unique values
  - PSH Flag Count: 2 unique values
  - ACK Flag Count: 2 unique values
  - Attack Type: 7 unique values


In [11]:
# Frequency of each attack type, ordered from most to least common
attack_type_col = df['Attack Type']
label_counts = attack_type_col.value_counts()
separator = "=" * 50

print("Label Distribution:")
print(separator)
print(label_counts)

# Number of distinct labels, and their names in order of first appearance
print("\n" + separator)
print(f"Number of unique labels: {attack_type_col.nunique()}")
print(f"Label names: {attack_type_col.unique()}")

Label Distribution:
Attack Type
Normal Traffic    2095057
DoS                193745
DDoS               128014
Port Scanning       90694
Brute Force          9150
Web Attacks          2143
Bots                 1948
Name: count, dtype: int64

Number of unique labels: 7
Label names: ['Normal Traffic' 'Port Scanning' 'Web Attacks' 'Brute Force' 'DDoS'
 'Bots' 'DoS']


In [16]:
# Create binary labels (Benign vs Attack)
# Show the raw class names first so the benign class name can be verified
print("Unique Attack Types:")
print(df['Attack Type'].unique())
print("\n" + "="*50)

# This dataset names its benign class 'Normal Traffic'. Change this constant if
# another export of the data uses a different name (e.g. 'BENIGN').
BENIGN_CLASS = 'Normal Traffic'

# Binary label: 0 for benign, 1 for any attack. A vectorized comparison
# replaces the row-by-row apply(lambda ...) and yields the same 0/1 values.
df['Binary_Label'] = (df['Attack Type'] != BENIGN_CLASS).astype('int64')

# Tally each class once and reuse the figures in every line below
total_rows = len(df)
benign_count = (df['Binary_Label'] == 0).sum()
attack_count = (df['Binary_Label'] == 1).sum()
benign_percentage = benign_count/total_rows*100
attack_percentage = attack_count/total_rows*100

print("Binary Label Distribution:")
print("="*50)
print(df['Binary_Label'].value_counts())
print("\n" + "="*50)
print(f"Normal Traffic (0): {benign_count:,} ({benign_percentage:.2f}%)")
print(f"Attack Traffic (1): {attack_count:,} ({attack_percentage:.2f}%)")

# Verify the split: both classes must be populated, otherwise the benign
# class name above does not match the data.
print("\n" + "="*50)
if attack_percentage > 0 and benign_percentage > 0:
    print("✓ Binary labels created successfully!")
    # Ratio is expressed as attack rows per one benign row
    print(f"  Class imbalance ratio: 1:{attack_percentage/benign_percentage:.2f} (Benign:Attack)")
else:
    print("⚠ Warning: Check your label column - one class might be empty!")

Unique Attack Types:
['Normal Traffic' 'Port Scanning' 'Web Attacks' 'Brute Force' 'DDoS'
 'Bots' 'DoS']

Binary Label Distribution:
Binary_Label
0    2095057
1     425694
Name: count, dtype: int64

Normal Traffic (0): 2,095,057 (83.11%)
Attack Traffic (1): 425,694 (16.89%)

✓ Binary labels created successfully!
  Class imbalance ratio: 1:0.20 (Benign:Attack)


In [17]:
# Summary statistics for a small sample of the numeric columns
print("Statistical Summary (first 5 numerical columns as sample):")
print("="*50)

# Names of every column whose dtype is numeric
numerical_cols = list(df.select_dtypes(include=np.number).columns)
sample_cols = numerical_cols[:5]

print(f"Total numerical features: {len(numerical_cols)}")
print("\nSample statistics:")
print(df[sample_cols].describe())

Statistical Summary (first 5 numerical columns as sample):
Total numerical features: 53

Sample statistics:
       Destination Port  Flow Duration  Total Fwd Packets  \
count      2.520751e+06   2.520751e+06       2.520751e+06   
mean       8.690744e+03   1.659032e+07       1.025880e+01   
std        1.901294e+04   3.523140e+07       7.943824e+02   
min        0.000000e+00  -1.300000e+01       1.000000e+00   
25%        5.300000e+01   2.080000e+02       2.000000e+00   
50%        8.000000e+01   5.062000e+04       2.000000e+00   
75%        4.430000e+02   5.332968e+06       6.000000e+00   
max        6.553500e+04   1.200000e+08       2.197590e+05   

       Total Length of Fwd Packets  Fwd Packet Length Max  
count                 2.520751e+06           2.520751e+06  
mean                  6.065243e+02           2.311993e+02  
std                   1.011595e+04           7.563122e+02  
min                   0.000000e+00           0.000000e+00  
25%                   1.200000e+01        

In [18]:
# Scan each numeric column for +/-inf entries
print("Checking for Infinite Values:")
print("="*50)

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Count infinities one column at a time, then keep only the affected columns
per_column_inf = {name: np.isinf(df[name]).sum() for name in numerical_cols}
inf_counts = {name: hits for name, hits in per_column_inf.items() if hits > 0}

if not inf_counts:
    print("✓ No infinite values found!")
else:
    print(f"Found {len(inf_counts)} columns with infinite values:")
    for name, hits in inf_counts.items():
        print(f"  - {name}: {hits} infinite values")

# sum() over an empty dict's values is 0, so no special case is needed
print(f"\nTotal infinite values in dataset: {sum(inf_counts.values())}")

Checking for Infinite Values:
✓ No infinite values found!

Total infinite values in dataset: 0


In [19]:
# Preview the two label columns next to the first five columns of the frame
cols_to_display = ['Attack Type', 'Binary_Label', *df.columns[:5]]
preview = df[cols_to_display]
separator = "=" * 50

print("Sample Data (first 5 rows):")
print(separator)
print(preview.head())

print("\n" + separator)
print("Sample Data (last 5 rows):")
print(preview.tail())

Sample Data (first 5 rows):
      Attack Type  Binary_Label  Destination Port  Flow Duration  \
0  Normal Traffic             0                22        1266342   
1  Normal Traffic             0                22        1319353   
2  Normal Traffic             0                22            160   
3  Normal Traffic             0                22        1303488   
4  Normal Traffic             0             35396             77   

   Total Fwd Packets  Total Length of Fwd Packets  Fwd Packet Length Max  
0                 41                         2664                    456  
1                 41                         2664                    456  
2                  1                            0                      0  
3                 41                         2728                    456  
4                  1                            0                      0  

Sample Data (last 5 rows):
            Attack Type  Binary_Label  Destination Port  Flow Duration  \
2520746  No

In [20]:
# Summarize key findings from exploration
banner = "="*60

# Compute every figure once up front instead of re-deriving it inside each
# f-string (the masks and the numeric sub-frame were rebuilt several times).
total_rows = len(df)
numeric_df = df.select_dtypes(include=[np.number])
numeric_col_count = len(numeric_df.columns)
inf_count = np.isinf(numeric_df).sum().sum()
del numeric_df  # release the numeric sub-frame; only the counts are needed below
benign_count = (df['Binary_Label'] == 0).sum()
attack_count = (df['Binary_Label'] == 1).sum()

print(banner)
print("DATA EXPLORATION SUMMARY")
print(banner)

# NOTE: 'Total features' counts every column of df, so it includes the
# 'Attack Type' target and the derived 'Binary_Label' column; 'Numerical
# features' likewise includes the numeric 'Binary_Label'.
print("\n1. DATASET SIZE:")
print(f"   - Total instances: {total_rows:,}")
print(f"   - Total features: {len(df.columns)}")
print(f"   - Numerical features: {numeric_col_count}")

print("\n2. TARGET VARIABLE (Attack Type):")
print("   - Column name: 'Attack Type'")
print(f"   - Number of unique attack types: {df['Attack Type'].nunique()}")

print("\n3. BINARY CLASSIFICATION:")
print(f"   - Benign: {benign_count:,} ({benign_count/total_rows*100:.2f}%)")
print(f"   - Attack: {attack_count:,} ({attack_count/total_rows*100:.2f}%)")

print("\n4. DATA QUALITY:")
print(f"   - Missing values: {df.isnull().sum().sum()}")
print(f"   - Infinite values: {inf_count}")

print("\n" + banner)
print("Next Step: Data Preprocessing & Feature Engineering")
print(banner)

DATA EXPLORATION SUMMARY

1. DATASET SIZE:
   - Total instances: 2,520,751
   - Total features: 54
   - Numerical features: 53

2. TARGET VARIABLE (Attack Type):
   - Column name: 'Attack Type'
   - Number of unique attack types: 7

3. BINARY CLASSIFICATION:
   - Benign: 2,095,057 (83.11%)
   - Attack: 425,694 (16.89%)

4. DATA QUALITY:
   - Missing values: 0
   - Infinite values: 0

Next Step: Data Preprocessing & Feature Engineering
