In [4]:
# Core data manipulation and analysis libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and arrays

# Visualization libraries
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization

# Machine learning libraries
from sklearn import metrics  # For model evaluation metrics
from sklearn.model_selection import train_test_split  # For splitting datasets

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Enable inline plotting in Jupyter notebooks
# Fixed duplicate import and invalid syntax
%matplotlib inline

In [6]:
    
# Reading datasets
# Using list comprehension to read all csv files in 4 csv files
dataframes = [pd.read_csv(f'UNSW-NB15_{i}.csv', header=None) 
       for i in range(1,5)]
    
# Concat all to a single df name combined_data
# Resetting index is important because:
# 1. When concatenating dataframes, the original index values are preserved which can lead to duplicate indices
# 2. Duplicate indices can cause issues with data access, filtering and analysis
# 3. reset_index() creates a clean, sequential index starting from 0
combined_data = pd.concat(dataframes).reset_index(drop=True) 

In [7]:
# Converts null labels to normal.
feature_names = pd.read_csv('features2.csv' )
feature_names_list = feature_names['Name'].tolist()
combined_data.columns = feature_names_list
combined_data.loc[combined_data['attack_cat'].isnull(), 'attack_cat'] = 'Normal'

In [8]:
# Create dataframe of just sport and attack cat columns for function.
df1 = combined_data[['sport', 'attack_cat']]

In [9]:
# Label correction.
print(df1['attack_cat'].value_counts())

attack_cat
Normal              2218764
Generic              215481
Exploits              44525
 Fuzzers              19195
DoS                   16353
 Reconnaissance       12228
 Fuzzers               5051
Analysis               2677
Backdoor               1795
Reconnaissance         1759
 Shellcode             1288
Backdoors               534
Shellcode               223
Worms                   174
Name: count, dtype: int64


In [10]:
df1['attack_cat'] = df1['attack_cat'].str.strip()

In [11]:
print(df1['attack_cat'].value_counts())

attack_cat
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             1795
Shellcode            1511
Backdoors             534
Worms                 174
Name: count, dtype: int64


In [12]:
print(df1['sport'].value_counts())


sport
1043     216289
47439    198580
0         50432
47439      4689
1043       4033
          ...  
33785         1
2637          1
29900         1
3664          1
706           1
Name: count, Length: 100341, dtype: int64


In [13]:
# Get a smaller dataframe to make function.
df1 = df1.head(1000000)

In [14]:
# Check number of unique categories in sport (reduced data).
sports = df1['sport'].unique()
print(len(sports))

100287


In [15]:
print(df1['sport'].value_counts())

sport
0        8910
47439    4689
1043     4033
0        3612
47439    2250
         ... 
49266       1
39428       1
3044        1
62086       1
27909       1
Name: count, Length: 100287, dtype: int64


In [25]:
df1['sport'] = df1['sport'].astype(str)
sport_value_counts = df1['sport'].value_counts()
# Filter values that appear more than 30 times
sports = sport_value_counts[sport_value_counts > 30].index.tolist()

In [26]:
print(len(sports))
print(sports)

710
['0', '47439', '1043', '80', '53', '5190', '6881', '25', '143', '21', '111', '22', '5060', '56724', '49320', '31591', '1336', '59084', '1916', '15235', '6027', '7907', '19971', '36123', '2048', '259', '1419', '1695', '30705', '30969', '1097', '1087', '1209', '46343', '1756', '47535', '1777', '1652', '55357', '50824', '61653', '38505', '43903', '1895', '47291', '35000', '1103', '46406', '58924', '40662', '48691', '21443', '8570', '1365', '1273', '2013', '1190', '1116', '1920', '21561', '1230', '1716', '1546', '1605', '1630', '31614', '1642', '1072', '1643', '1144', '1650', '1110', '50447', '1722', '1636', '1243', '1828', '1499', '1981', '1570', '1708', '1398', '35547', '10056', '1813', '1039', '27152', '1379', '39102', '20881', '27720', '1587', '37406', '1283', '1372', '1574', '13829', '1044', '1935', '1309', '1540', '1359', '1517', '52400', '1276', '1735', '1794', '1261', '1715', '33095', '1835', '1729', '1189', '1096', '1833', '1187', '1718', '1987', '1024', '1195', '42635', '1997

In [27]:
# The label we want to compare.
label = 'Normal'
# Dataframe to add the correlations.
corr_df = pd.DataFrame(index=df1.index)
corr_list = []
# Go through each unique category in the given column (tested on just sport but can make it a function).
for s in sports:
    # Create column name same as encoding conventions.
    col_name = f'sport_{s}'
    # Convert all values that match to 1 or else convert to 0 for given values in each column (sport is an iteration and label is static).
    corr_df[col_name] = (df1['sport'] == s).astype(int)
    attack_cat = (df1['attack_cat'] == label).astype(int)
    # Draw correlations of the converted binary columns.
    correlation = corr_df[col_name].corr(attack_cat)
    # Add correlations to list for Dataframe.
    corr_list.append({
        'Sport': s,
        'Correlation': correlation
    })
corr_table = pd.DataFrame(corr_list)
# Get the absolute correlation and sort in descending order.
corr_table['Correlation'] = corr_table['Correlation'].abs()
corr_table.sort_values(by='Correlation', ascending=False, inplace=True)
print("\nCorrelation in Sport Column:")
print(corr_table)
print(corr_table.tail())
print(df1['sport'].value_counts())


Correlation in Sport Column:
     Sport   Correlation
1    47439  3.114058e-01
2     1043  2.491009e-01
0        0  2.073268e-01
164  56237  2.513476e-02
349  60986  2.089951e-02
..     ...           ...
89   20881  4.630969e-05
45   35000  2.189939e-05
47   46406  2.189939e-05
59   21561  3.287320e-07
57    1116  3.287320e-07

[710 rows x 2 columns]
    Sport   Correlation
89  20881  4.630969e-05
45  35000  2.189939e-05
47  46406  2.189939e-05
59  21561  3.287320e-07
57   1116  3.287320e-07
sport
0        12522
47439     6939
1043      5963
80         348
53         194
         ...  
225          1
867          1
281          1
862          1
851          1
Name: count, Length: 64548, dtype: int64
