In [13]:
# Core data manipulation and analysis libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and arrays

# Visualization libraries
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization

# Machine learning libraries
from sklearn import metrics  # For model evaluation metrics
from sklearn.model_selection import train_test_split  # For splitting datasets

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Enable inline plotting in Jupyter notebooks
# Fixed duplicate import and invalid syntax
%matplotlib inline

In [49]:
    
# Reading datasets
# Using list comprehension to read all csv files in 4 csv files
dataframes = [pd.read_csv(f'UNSW-NB15_{i}.csv', header=None) 
       for i in range(1,5)]
    
# Concat all to a single df name combined_data
# Resetting index is important because:
# 1. When concatenating dataframes, the original index values are preserved which can lead to duplicate indices
# 2. Duplicate indices can cause issues with data access, filtering and analysis
# 3. reset_index() creates a clean, sequential index starting from 0
combined_data = pd.concat(dataframes).reset_index(drop=True) 

In [50]:
# Converts null labels to normal.
feature_names = pd.read_csv('features2.csv' )
feature_names_list = feature_names['Name'].tolist()
combined_data.columns = feature_names_list
combined_data.loc[combined_data['attack_cat'].isnull(), 'attack_cat'] = 'Normal'

In [74]:
# Create dataframe of just sport and attack cat columns for function.
df1 = combined_data[['sport', 'attack_cat']]

In [31]:
# Label correction.
print(df1['attack_cat'].value_counts())

attack_cat
Normal              2218764
Generic              215481
Exploits              44525
 Fuzzers              19195
DoS                   16353
 Reconnaissance       12228
 Fuzzers               5051
Analysis               2677
Backdoor               1795
Reconnaissance         1759
 Shellcode             1288
Backdoors               534
Shellcode               223
Worms                   174
Name: count, dtype: int64


In [75]:
df1['attack_cat'] = df1['attack_cat'].str.strip()

In [33]:
print(df1['attack_cat'].value_counts())

attack_cat
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             1795
Shellcode            1511
Backdoors             534
Worms                 174
Name: count, dtype: int64


In [None]:
print(df1['sport'].value_counts())


sport
1043     216289
47439    198580
0         50432
47439      4689
1043       4033
          ...  
33785         1
2637          1
29900         1
3664          1
706           1
Name: count, Length: 100341, dtype: int64


In [76]:
# Get a smaller dataframe to make function.
df1 = df1.head(1000000)

In [77]:
# Check number of unique categories in sport (reduced data).
sports = df1['sport'].unique()
print(len(sports))

100287


In [78]:
print(df1['sport'].value_counts())

sport
0        8910
47439    4689
1043     4033
0        3612
47439    2250
         ... 
49266       1
39428       1
3044        1
62086       1
27909       1
Name: count, Length: 100287, dtype: int64


In [79]:
# Get only values that appear more than 30 times.
sports = df1['sport'].value_counts()[df1['sport'].value_counts() > 30]

In [80]:
print(len(sports))

556


In [81]:
# The label we want to compare.
label = 'Normal'
# Dataframe to add the correlations.
corr_df = pd.DataFrame(index=df1.index)
corr_list = []
# Go through each unique category in the given column (tested on just sport but can make it a function).
for s in sports:
    # Create column name same as encoding conventions.
    col_name = f'sport_{s}'
    # Convert all values that match to 1 or else convert to 0 for given values in each column (sport is an iteration and label is static).
    corr_df[col_name] = (df1['sport'] == s).astype(int)
    attack_cat = (df1['attack_cat'] == label).astype(int)
    # Draw correlations of the converted binary columns.
    correlation = corr_df[col_name].corr(attack_cat)
    # Add correlations to list for Dataframe.
    corr_list.append({
        'Sport': s,
        'Correlation': correlation
    })
corr_table = pd.DataFrame(corr_list)
# Get the absolute correlation and sort in descending order.
corr_table['Correlation'] = corr_table['Correlation'].abs()
corr_table.sort_values(by='Correlation', ascending=False, inplace=True)
print("\nCorrelation in Sport Column:")
print(corr_table)
print(corr_table.tail())
print(df1['sport'].value_counts())


Correlation in Sport Column:
     Sport  Correlation
16      68     0.016251
17      68     0.016251
2     4033     0.001546
5     1930     0.000692
3     3612     0.000674
..     ...          ...
551     31          NaN
552     31          NaN
553     31          NaN
554     31          NaN
555     31          NaN

[556 rows x 2 columns]
     Sport  Correlation
551     31          NaN
552     31          NaN
553     31          NaN
554     31          NaN
555     31          NaN
sport
0        8910
47439    4689
1043     4033
0        3612
47439    2250
         ... 
49266       1
39428       1
3044        1
62086       1
27909       1
Name: count, Length: 100287, dtype: int64
