# Jupyter notebook sample

In [None]:
# Import libraries
import csv
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Silence every warning raised from here on. No category is given, so this
# also hides deprecation and runtime warnings emitted by the libraries above.
warnings.filterwarnings('ignore')

### Import data

Read the data from a CSV file.

In [None]:
# Load the BCCC-CIRA-CIC-DoHBrw-2020 dataset into the `df` DataFrame.
# Download it from https://www.kaggle.com/datasets/supplejade/bccc-cira-cic-dohbrw-2020-dns-over-http/download?datasetVersionNumber=2
# and unzip it into the data/ folder so that the relative path below resolves.

# The literal string 'NA' is parsed as a missing value.
df = pd.read_csv('data/BCCC-CIRA-CIC-DoHBrw-2020.csv', na_values=['NA'])

def add_space(s):
    """Insert a space before every uppercase character after the first.

    Each uppercase character is handled on its own, so
    ``'PacketLengthMean'`` becomes ``'Packet Length Mean'`` and
    ``'SourceIP'`` becomes ``'Source I P'``.

    Args:
        s: The string to split into words.

    Returns:
        The string with the extra spaces inserted. An empty input is
        returned unchanged.
    """
    # Guard: indexing s[0] below would raise IndexError on an empty string.
    if not s:
        return s
    # The first character is never prefixed, even when it is uppercase.
    return s[0] + ''.join(' ' + ch if ch.isupper() else ch for ch in s[1:])

# Normalise the column names: drop any spaces already present, then let
# add_space() re-insert one before each uppercase letter. Because the spaces
# are stripped first, re-running this cell yields the same names.
df.columns = [add_space(column.replace(' ', '')) for column in df.columns]

# Summary statistics, rendered as the cell output.
df.describe()

## Data visualization
### Density (KDE) plots of each feature, split by label


In [1]:
# Work on the first 1000 rows only.
# DataFrame.replace returns a new frame rather than modifying in place, so
# the result has to be assigned; otherwise the +/-inf values are still
# present when the densities are estimated.
df_subset = df.iloc[:1000, :].replace([np.inf, -np.inf], np.nan)

# One panel per column except the last; the 4 x 7 grid has room for 28.
# Assumes the last column is 'Label' (used as the hue) - TODO confirm.
fig = plt.figure(figsize=(30, 30))

# enumerate() yields the panel position directly, instead of searching the
# column list by name on every iteration (which is also ambiguous if two
# columns share a name).
for position, column in enumerate(df_subset.columns[:-1], start=1):
    plt.subplot(4, 7, position)
    sb.kdeplot(data=df_subset, x=column, hue='Label', fill=True)

plt.show()
    

NameError: name 'df' is not defined

### Create a pair plot for each feature
Each figure is saved as a PNG file in the `plots/` directory.

In [None]:
# Two colours taken from the "tab10" palette.
# Assumes 'Label' has two classes - TODO confirm.
palette = sb.color_palette("tab10", n_colors=2)  # Choose a suitable palette
colormap = sb.color_palette("tab10", as_cmap=True)  # Get the colormap

# Convert the colormap to a list of colors
colors = [colormap(i) for i in range(len(palette))]

# savefig() does not create missing directories, so make sure the output
# folder exists before the (slow) plotting loop starts.
os.makedirs('plots', exist_ok=True)

# Every column except the last is plotted; 'Label' is used as the hue.
feature_columns = list(df_subset.columns[:-1])

for i in feature_columns:
    fig = plt.figure(figsize=(16, 7*3+1))
    # Figure-level title. plt.title() at this point would implicitly create
    # a full-figure axes, because no subplot exists yet.
    fig.suptitle(i)

    # Panel `position` shows feature i against feature j; the panel where
    # both are the same column shows that feature's density instead.
    for position, j in enumerate(feature_columns, start=1):
        plt.subplot(7, 4, position)
        if i == j:
            sb.kdeplot(data=df_subset, x=i, hue='Label', fill=True, palette=colors)
        else:
            sb.scatterplot(data=df_subset, x=i, y=j, hue='Label', palette=colors)

    print(f'Done with {i}')
    plt.tight_layout()

    # Build one shared legend from the handles of the last panel drawn.
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.figlegend(handles, labels, loc='lower center', ncol=len(colors), title='Label', bbox_to_anchor=(0.5, -0.02))

    # The shared legend is anchored just below the figure area;
    # bbox_inches='tight' expands the saved image so it is not cut off.
    plt.savefig(f'plots/{i}.png', bbox_inches='tight')
    # Close explicitly so figures do not accumulate in memory over the loop.
    plt.close(fig)

In [None]:
# Load the PCA data from CSV. No visible cell writes data/data_pca.csv, so it
# is presumably produced elsewhere - TODO confirm where it comes from.
data_pca = pd.read_csv("data/data_pca.csv")

### Pair grid of the PCA data (first 1000 rows)

In [None]:
# Pair grid over the first 1000 rows of the PCA data, coloured by 'Label':
# histograms on the diagonal, scatter plots below it, KDE contours above it.
g = sb.PairGrid(data_pca.iloc[:1000, :], hue="Label")
g.map_diag(plt.hist)
g.map_lower(plt.scatter)
g.map_upper(sb.kdeplot)

g.add_legend()

# Re-apply each axis label with a new orientation: x labels vertical,
# y labels horizontal and right-aligned.
for axis in g.axes.flat:
    axis.set_xlabel(axis.get_xlabel(), rotation=90)
    axis.set_ylabel(axis.get_ylabel(), rotation=0)
    axis.yaxis.label.set_horizontalalignment('right')

plt.show()

### Correlation Matrix

In [None]:
# Positional split: the first 28 columns are taken as the features and the
# column at index 28 as the target.
# Assumes the label is the 29th column of the CSV - TODO confirm.
X = df.iloc[:, :28]
y = df.iloc[:, 28]

# Correlate the numeric features only. pandas >= 2.0 raises on non-numeric
# columns instead of silently dropping them, so select them explicitly.
correlation_matrix = X.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sb.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()