In [4]:
# Core data manipulation and analysis libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and arrays

# Visualization libraries
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization

# Machine learning libraries
from sklearn import metrics  # For model evaluation metrics
from sklearn.model_selection import train_test_split  # For splitting datasets

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Enable inline plotting in Jupyter notebooks
# Fixed duplicate import and invalid syntax
%matplotlib inline

In [6]:
    
# Reading datasets
# Using list comprehension to read all csv files in 4 csv files
dataframes = [pd.read_csv(f'UNSW-NB15_{i}.csv', header=None) 
       for i in range(1,5)]
    
# Concat all to a single df name combined_data
# Resetting index is important because:
# 1. When concatenating dataframes, the original index values are preserved which can lead to duplicate indices
# 2. Duplicate indices can cause issues with data access, filtering and analysis
# 3. reset_index() creates a clean, sequential index starting from 0
combined_data = pd.concat(dataframes).reset_index(drop=True) 

In [7]:
# Converts null labels to normal.
feature_names = pd.read_csv('features2.csv' )
feature_names_list = feature_names['Name'].tolist()
combined_data.columns = feature_names_list
combined_data.loc[combined_data['attack_cat'].isnull(), 'attack_cat'] = 'Normal'

In [8]:
# Create dataframe of just sport and attack cat columns for function.
df1 = combined_data[['sport', 'attack_cat']]

In [9]:
# Label correction.
print(df1['attack_cat'].value_counts())

attack_cat
Normal              2218764
Generic              215481
Exploits              44525
 Fuzzers              19195
DoS                   16353
 Reconnaissance       12228
 Fuzzers               5051
Analysis               2677
Backdoor               1795
Reconnaissance         1759
 Shellcode             1288
Backdoors               534
Shellcode               223
Worms                   174
Name: count, dtype: int64


In [10]:
df1['attack_cat'] = df1['attack_cat'].str.strip()

In [11]:
print(df1['attack_cat'].value_counts())

attack_cat
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             1795
Shellcode            1511
Backdoors             534
Worms                 174
Name: count, dtype: int64


In [12]:
print(df1['sport'].value_counts())


sport
1043     216289
47439    198580
0         50432
47439      4689
1043       4033
          ...  
33785         1
2637          1
29900         1
3664          1
706           1
Name: count, Length: 100341, dtype: int64


In [13]:
# Get a smaller dataframe to make function.
df1 = df1.head(1000000)

In [14]:
# Check number of unique categories in sport (reduced data).
sports = df1['sport'].unique()
print(len(sports))

100287


In [15]:
print(df1['sport'].value_counts())

sport
0        8910
47439    4689
1043     4033
0        3612
47439    2250
         ... 
49266       1
39428       1
3044        1
62086       1
27909       1
Name: count, Length: 100287, dtype: int64


In [None]:
sport_value_counts = df1['sport'].value_counts()
# Filter values that appear more than 30 times
sports = sport_value_counts[sport_value_counts > 30].index.tolist()

In [23]:
print(len(sports))
print(sports)

556
[0, '47439', '1043', '0', 47439, 1043, 80, 53, 5190, 6881, 25, 143, 21, 111, 56724, 22, 59084, 31591, 1336, 1916, 15235, 6027, 49320, 7907, 36123, 1419, 30705, 30969, 1087, 1097, 46343, 43903, 1695, 1652, 47535, 1209, 55357, 38505, 1756, 61653, 50824, 35000, 47291, 40662, 58924, 1103, 2013, 1777, 1230, 1116, 48691, 8570, 21443, 1636, 1630, 46406, 1642, 1895, 35547, 1365, 1546, 1722, 1643, 20881, 1587, 1398, 1920, 31614, 1190, 39102, 1044, 1379, 13829, 1110, 37406, 1039, 1981, 1828, 1243, 1144, 33095, 1716, 27152, 52400, 1276, 1195, 5060, 1735, 1273, 1574, 1372, 1187, 1072, 1283, 65507, 1936, 1708, 1833, 61667, 1835, 1987, 42635, 1898, 1189, 1570, 1096, 1660, 1359, 50447, 1729, 2048, 1842, 1078, 34804, 1935, 1927, 10056, 1499, 27720, 2041, 30493, 1605, 1580, 7418, 1540, 1517, 9305, 1261, 1813, 11102, 1718, 15801, 1200, 31383, 1794, 259, 1650, 1886, 1298, 1934, 1669, 27312, 46650, 1309, 1744, 1356, 1769, 2024, '5060', 1322, 1170, 41750, 3793, 1148, 1874, 1377, 38409, 48837, 1295, 199

In [None]:
# The label we want to compare.
label = 'Normal'
# Dataframe to add the correlations.
corr_df = pd.DataFrame(index=df1.index)
corr_list = []
# Go through each unique category in the given column (tested on just sport but can make it a function).
for s in sports:
    # Create column name same as encoding conventions.
    col_name = f'sport_{s}'
    # Convert all values that match to 1 or else convert to 0 for given values in each column (sport is an iteration and label is static).
    corr_df[col_name] = (df1['sport'] == s).astype(int)
    attack_cat = (df1['attack_cat'] == label).astype(int)
    # Draw correlations of the converted binary columns.
    correlation = corr_df[col_name].corr(attack_cat)
    # Add correlations to list for Dataframe.
    corr_list.append({
        'Sport': s,
        'Correlation': correlation
    })
corr_table = pd.DataFrame(corr_list)
# Get the absolute correlation and sort in descending order.
corr_table['Correlation'] = corr_table['Correlation'].abs()
corr_table.sort_values(by='Correlation', ascending=False, inplace=True)
print("\nCorrelation in Sport Column:")
print(corr_table)
print(corr_table.tail())
print(df1['sport'].value_counts())


Correlation in Sport Column:
    Sport  Correlation
3       0     0.343122
1   47439     0.255831
2    1043     0.205273
4   47439     0.176715
5    1043     0.140546
..    ...          ...
86   5060     0.000095
63  20881     0.000046
51   8570     0.000023
49   1116     0.000023
41  35000     0.000022

[556 rows x 2 columns]
    Sport  Correlation
86   5060     0.000095
63  20881     0.000046
51   8570     0.000023
49   1116     0.000023
41  35000     0.000022
sport
0        8910
47439    4689
1043     4033
0        3612
47439    2250
         ... 
49266       1
39428       1
3044        1
62086       1
27909       1
Name: count, Length: 100287, dtype: int64
