# Jupyter notebook sample

In [None]:
import csv

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Import data

# Read data from a csv file

In [None]:
# Load the BCCC-CIRA-CIC-DoHBrw-2020 dataset into a DataFrame.
# The CSV is not shipped with the notebook: download it from
# https://www.kaggle.com/datasets/supplejade/bccc-cira-cic-dohbrw-2020-dns-over-http/download?datasetVersionNumber=2
# and unzip it into the data folder before running this cell.

df = pd.read_csv(
    'data/BCCC-CIRA-CIC-DoHBrw-2020.csv',
    na_values=['NA'],  # cells holding the literal text 'NA' are read as missing
)

def add_space(s: str) -> str:
    """Insert a space before every uppercase character except the first.

    Turns a CamelCase identifier into space-separated words, e.g.
    ``"FlowBytesSent"`` -> ``"Flow Bytes Sent"``. Each capital is split off
    individually, so a run of capitals is spaced out letter by letter
    (``"SourceIP"`` -> ``"Source I P"``).

    Args:
        s: The string to split; may be empty.

    Returns:
        ``s`` with one space inserted before each uppercase character that
        is not at position 0. An empty string is returned unchanged.
    """
    # Building the pieces with join (instead of seeding the result with s[0])
    # makes the empty string a normal case rather than an IndexError.
    return ''.join(
        ' ' + ch if idx and ch.isupper() else ch
        for idx, ch in enumerate(s)
    )

# Normalise the column headers: strip any spaces they already contain, then
# let add_space() put one back in front of each capital letter.
df.columns = df.columns.str.replace(' ', '', regex=False).map(add_space)

# Summary statistics of the numeric columns (shown as the cell output).
df.describe()

### Data visualization



In [None]:
# Plot only the first 1000 rows. replace() returns a new frame rather than
# modifying in place, so its result has to be kept for the +/-inf -> NaN
# substitution to take effect.
df_subset = df.iloc[:1000, :].replace([np.inf, -np.inf], np.nan)

# Every column except the last gets its own density panel; the last column
# is presumably 'Label', which is used as the hue instead.
# The 4x7 grid has room for 28 feature columns.
feature_columns = df_subset.columns[:-1]

fig = plt.figure(figsize=(30, 30))

# enumerate gives the 1-based subplot slot directly, without searching the
# column list for each name.
for position, column in enumerate(feature_columns, start=1):
    plt.subplot(4, 7, position)
    sb.kdeplot(data=df_subset, x=column, hue='Label', fill=True)


plt.show()
    

In [None]:
import os

palette = sb.color_palette("tab10", n_colors=2)  # Choose a suitable palette
colormap = sb.color_palette("tab10", as_cmap=True)  # Get the colormap

# Convert the colormap to a list of colors, one per palette entry
colors = [colormap(i) for i in range(len(palette))]

# savefig() does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs('plots', exist_ok=True)

# All columns except the last one (the hue column 'Label' is expected there).
feature_columns = list(df_subset.columns[:-1])

# One figure per feature: that feature on the x axis against every feature
# in a 7x4 grid, with a density plot where a feature meets itself.
for i in feature_columns:
    fig = plt.figure(figsize=(16, 7*3+1))
    # Use a figure-level title: no axes exist yet at this point, so
    # plt.title() would implicitly create an extra full-figure axes
    # underneath the subplot grid.
    fig.suptitle(i)

    for position, j in enumerate(feature_columns, start=1):
        plt.subplot(7, 4, position)
        if i == j:
            sb.kdeplot(data=df_subset, x=i, hue='Label', fill=True, palette=colors)
        else:
            sb.scatterplot(data=df_subset, x=i, y=j, hue='Label', palette=colors)

    print(f'Done with {i}')
    plt.tight_layout()

    # Shared legend built from the handles of the last subplot drawn.
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.figlegend(handles, labels, loc='lower center', ncol=len(colors), title='Label', bbox_to_anchor=(0.5, -0.02))
    plt.tight_layout()

    plt.savefig(f'plots/{i}.png')
    # Close the figure so open figures do not accumulate across iterations.
    plt.close()

In [None]:
# Feature matrix: every column except the class label.
X = df.drop(columns='Label')

# Standardise the features before running PCA on them.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)

# Fit a 9-component PCA, then project the same standardised data onto it.
pca = PCA(n_components=9).fit(scaled_data)
pca_data = pca.transform(scaled_data)

# Name the projected columns PC1, PC2, ... after their position.
component_names = [f"PC{k}" for k in range(1, pca_data.shape[1] + 1)]
pca_df = pd.DataFrame(pca_data, columns=component_names)

# Put the label column back next to the components (aligned on the index).
data_pca = pd.concat([pca_df, df['Label']], axis=1)

# Persist the reduced dataset; the DataFrame index is written as well.
data_pca.to_csv('data/data_pca.csv')

data_pca

In [None]:
# Pairwise view of the first 1000 rows of the PCA data, coloured by label:
# histograms on the diagonal, scatter plots below it, density plots above it.
g = sb.PairGrid(data_pca.iloc[:1000], hue="Label")
g.map_diag(plt.hist).map_lower(plt.scatter).map_upper(sb.kdeplot)

g.add_legend()

for panel in g.axes.flat:
    # x labels are turned vertical, y labels horizontal
    panel.set_xlabel(panel.get_xlabel(), rotation=90)
    panel.set_ylabel(panel.get_ylabel(), rotation=0)
    # right-align the horizontal y label
    panel.yaxis.get_label().set_horizontalalignment('right')

plt.show()

In [None]:
# SVD
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# Separate features (X) and labels (y); the label is taken to be the last
# column of df.
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Labels

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SVD
n_components = 9  # Choose the number of components
# TruncatedSVD's default solver is randomized; pinning the seed makes the
# components (and the CSV written below) reproducible between runs.
svd = TruncatedSVD(n_components=n_components, random_state=1)
svd_components = svd.fit_transform(X_scaled)

# Wrap the component array in a DataFrame and re-attach the labels
# (aligned on the index).
data_svd = pd.DataFrame(svd_components, columns=[f"PC{i}" for i in range(1, n_components + 1)])
data_svd = pd.concat([data_svd, y], axis=1)

data_svd.to_csv('data/data_svd.csv', index=False)

data_svd

### Split data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs are the first nine columns of data_pca; the target is the
# 'Label' column.
all_inputs = data_pca.iloc[:, 0:9]
all_labels = data_pca['Label']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
training_inputs, testing_inputs, training_classes, testing_classes = train_test_split(
    all_inputs,
    all_labels,
    test_size=0.25,
    random_state=1,
)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Create the classifier
decision_tree_classifier = DecisionTreeClassifier()

# Train the classifier on the training set
decision_tree_classifier.fit(training_inputs, training_classes)

# Print the mean accuracy on the held-out test set
print(decision_tree_classifier.score(testing_inputs, testing_classes))

# Get the predictions
predictions = decision_tree_classifier.predict(testing_inputs)

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(testing_classes, predictions)

# Extract TN, FP, FN, TP. For a two-class problem ravel() yields the cells in
# exactly this order; unpacking also fails loudly if the matrix is not 2x2
# instead of silently reporting mislabelled cells.
TN, FP, FN, TP = conf_matrix.ravel()

# Print the values
print("True Negatives (TN):", TN)
print("True Positives (TP):", TP)
print("False Negatives (FN):", FN)
print("False Positives (FP):", FP)

# Visualize the tree
plt.figure(figsize=(30, 30))
# feature_names is passed as a plain list: scikit-learn releases that
# validate this parameter reject a pandas Index.
plot_tree(decision_tree_classifier, filled=True, feature_names=list(all_inputs.columns))
plt.show()