In [None]:
!pip install umap-learn
!pip install hdbscan

In [None]:
# !apt-get update

In [None]:
# LaTeX toolchain and pandoc; presumably needed by the
# `jupyter nbconvert --to pdf` export at the end of this notebook.
!apt-get install texlive texlive-xetex texlive-latex-extra pandoc

In [None]:
import sklearn
from sklearn.cluster import KMeans
import os
import sys
import matplotlib.pyplot as plt
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image as im
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
import umap.umap_ as umap
import hdbscan
from sklearn.cluster import AgglomerativeClustering

In [None]:
def print_metrics(predicts):
  """Print clustering-quality scores of `predicts` against the packet types.

  The reference labels are read from the notebook-level `packet['type']`.
  Printed, one per line: adjusted Rand score, adjusted mutual information,
  homogeneity, completeness, V-measure for several `beta` weights and the
  default V-measure, followed by blank lines as a separator.

  Args:
    predicts: cluster label per packet, aligned with `packet['type']`.
  """
  truth = packet['type']
  # Label spellings are kept exactly as before so the output stays comparable
  # with previously logged runs.
  print("Rand Score: ",adjusted_rand_score(truth, predicts))
  print("Mutial Info Score: ",adjusted_mutual_info_score(truth, predicts))
  print("Homogeneity: ",metrics.homogeneity_score(truth, predicts))
  print("Completenetss: ",metrics.completeness_score(truth, predicts))
  # The label is generated from the beta that is actually used. Previously the
  # third line was labelled beta=0.5 while the score was computed with beta=0.4.
  for beta in (0.1, 0.2, 0.4):
    print(f"V_Measure(beta={beta}): ",metrics.v_measure_score(truth, predicts,beta=beta))
  print("V_Measure: ",metrics.v_measure_score(truth, predicts))
  print("\n")

In [None]:
def draw_newconfmatrix(predicts, title="Test"):
  """Plot a heat map of packet types versus predicted cluster labels.

  Cross-tabulates the notebook-level `packet['type']` (rows, "Actual") against
  `predicts` (columns, "Predicted") and draws the counts with `plt.matshow`.

  Args:
    predicts: cluster label per packet, aligned with `packet['type']`.
    title: figure title. Defaults to "Test", the value that used to be
      hard-coded, so existing calls render exactly as before.
  """
  df_confusion = pd.crosstab(packet['type'], predicts, rownames=['Actual'], colnames=['Predicted'])
  plt.matshow(df_confusion, cmap='Blues')
  plt.title(title)
  plt.colorbar()
  # One tick per row/column of the cross-tab, labelled with the class / cluster id.
  plt.xticks(np.arange(len(df_confusion.columns)), df_confusion.columns)
  plt.yticks(np.arange(len(df_confusion.index)), df_confusion.index)
  plt.ylabel(df_confusion.index.name)
  plt.xlabel(df_confusion.columns.name)

In [None]:
def draw_confmatrix(predicts):
  """Draw the confusion matrix of packet types versus predicted labels.

  `predicts` is converted to strings so it can be compared with the string
  labels in the notebook-level `packet['type']`.

  Args:
    predicts: array of cluster labels, aligned with `packet['type']`.
  """
  # `from_predictions` computes the same matrix but also labels the axes with
  # the actual class / cluster names; building the display from a bare matrix
  # only showed positional indices 0..k-1.
  ConfusionMatrixDisplay.from_predictions(packet['type'], predicts.astype(str))

In [None]:
# Pick the number of clusters with the elbow method, then cluster with that count
def bestkm(featuresvectore):
  """Cluster `featuresvectore` with an elbow-selected number of clusters.

  A yellowbrick `KElbowVisualizer` fits `KMeans` over the candidate range
  `k=(1,11)`, shows the elbow plot and prints the selected value. The returned
  labels come from `AgglomerativeClustering` run with that many clusters
  (KMeans is only used to locate the elbow).

  Args:
    featuresvectore: feature matrix (one row per sample) accepted by the
      scikit-learn `fit` methods.

  Returns:
    The `labels_` array of the fitted agglomerative model.

  Raises:
    ValueError: if the visualizer could not locate an elbow.
  """
  fig, ax = plt.subplots()

  # Seeded so the elbow (and therefore the cluster count) is reproducible.
  visualizer = KElbowVisualizer(KMeans(random_state=0), k=(1,11),ax=ax)
  visualizer.fit(featuresvectore)

  ax.set_xticks(range(1,11))
  visualizer.show()
  plt.show()
  print(f'Elbow Value is {visualizer.elbow_value_}')

  n_clusters = visualizer.elbow_value_
  if n_clusters is None:
    # Without this check the failure surfaces inside AgglomerativeClustering
    # with a message unrelated to the real cause.
    raise ValueError('No elbow was found for k in 1..10; cannot choose the number of clusters.')

  # Clustering
  clusterer = AgglomerativeClustering(n_clusters = n_clusters)
  clusterer.fit(featuresvectore)
  return clusterer.labels_

In [None]:
n = 5000 # number of packets to use; update this number according to the dataset size.

In [None]:
data = pd.read_csv('GSM5000-hex.csv', dtype=str) # dataset name.

# Never ask for more packets than the file contains: later cells size their
# arrays and loops with `n`, so an oversized `n` would index past the data.
n = min(n, len(data))

# Keep only the first n packets of each column used downstream.
mtypes = data['type'].values[:n]
direction = data['direction'].values[:n]
# NOTE: `bin` shadows the Python builtin; the name is kept because later cells use it.
bin = data['Full'].values[:n]

In [None]:
data['type'].value_counts() # number of packets per class, counted over the whole loaded frame.

In [None]:
# Container for the per-packet arrays that the following cells fill in.
packet = dict(
    bin=[],
    extractedfeature=[],
    length=np.zeros(n),
    lengroup=[],
    direction=[],
    type=[],
)

In [None]:
# Store the length of each of the first n packets in the pre-allocated array.
packet['length'][:] = [len(bin[pos]) for pos in range(n)]

In [None]:
# Class label of each packet; the metric and plotting helpers compare predictions against it.
packet["type"]=mtypes

In [None]:
# Group the packets into 4 clusters using nothing but their length.
length_column = packet['length'].reshape(-1,1)
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(length_column)
packet['lengroup'] = kmeans.labels_
# To inspect the result:
# print(packet['length'])
# print(packet['lengroup'])

In [None]:
# One-hot encode the direction column and store it as a dense array.
direction_column = direction.reshape(-1, 1)
ohe = OneHotEncoder().fit(direction_column)
transformed = ohe.transform(direction_column)
packet['direction'] = transformed.toarray()

In [None]:
# One-hot encode the length groups.
# The encoder reads `kmeans.labels_` (the length clustering) instead of
# `packet['lengroup']`: this cell overwrites `packet['lengroup']` with the
# encoded matrix, so reading it back would silently re-encode the one-hot
# matrix whenever the cell is executed a second time.
ohe = OneHotEncoder()
transformed = ohe.fit_transform(kmeans.labels_.reshape(-1,1))
packet['lengroup']=transformed.toarray()

In [None]:
# Turn every packet string into a fixed-length row of integer symbols.
# Packets shorter than the longest one are right-padded with 0.
# Rows are built from fresh lists and `bin` is left untouched: it comes from
# the DataFrame column, and overwriting its elements made a re-run of this
# cell (or of the length cell) see already-padded data.
maxlen=int(max(packet['length']))
listofbins=[]
for raw in bin:
  symbols = list(raw)
  symbols += [0] * max(0, maxlen - len(symbols))
  listofbins.append(symbols)
packet['bin'] = np.array(listofbins,dtype=np.int32)

In [None]:
# Sanity check: print the shape of every per-packet array, one per line.
for field in ('direction', 'type', 'bin', 'length', 'lengroup'):
  print(packet[field].shape)

In [None]:
# Transfer learning: an ImageNet-pretrained ResNet50 without its classifier
# head and with global average pooling, wrapped in a Sequential model.
backbone = tf.keras.applications.ResNet50(
    weights='imagenet', include_top=False, pooling='avg',
)
MyModel = tf.keras.models.Sequential([backbone])

In [None]:
# Freeze layer 0 of MyModel, i.e. the whole ResNet50 backbone (the only layer added to it).
MyModel.layers[0].trainable = False

In [None]:
# Cast every packet row to uint8 and resize it to a 224 x 224 image.
img = [cv2.resize(row.astype('uint8'), (224, 224)) for row in packet['bin']]

In [None]:
# Expected: n, then (224, 224)
print(len(img), img[0].shape, sep='\n')

In [None]:
# Give every image an explicit single-channel axis, replacing the list contents in place.
img[:] = [frame.reshape(224,224,1) for frame in img]

In [None]:
# Verify image size; expected: (224, 224, 1), then n
print(img[0].shape, len(img), sep='\n')

In [None]:
# Expand every grayscale image to three channels.
color_img = [cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB) for gray in img]

In [None]:
len(color_img) # expected: n, one RGB image per packet

In [None]:
# Run the ResNet50 input preprocessing on every image, storing the result back in the list.
preprocess = tf.keras.applications.resnet50.preprocess_input
for pos, rgb in enumerate(color_img):
  color_img[pos] = preprocess(rgb)

In [None]:
# Prepend a batch axis of size 1 to every image.
color_img_expanded = [np.expand_dims(rgb, 0) for rgb in color_img]

In [None]:
# Verify count and shape; expected: n, then (1, 224, 224, 3)
print(len(color_img_expanded), color_img_expanded[0].shape, sep='\n')

In [None]:
# Extract one feature vector per image with MyModel.
# Images go through the network in mini-batches instead of one `predict`
# call per image, and the result is assigned rather than appended, so the cell
# can be re-run: previously `packet['extractedfeature']` was an ndarray after
# the first run and the next run failed on `.append`.
BATCH_SIZE = 32
feature_batches = []
for start in range(0, len(color_img_expanded), BATCH_SIZE):
  batch = np.concatenate(color_img_expanded[start:start + BATCH_SIZE], axis=0)
  batch_features = np.asarray(MyModel.predict_on_batch(batch))
  # One flat row per image, matching the former per-image `.flatten()`.
  feature_batches.append(batch_features.reshape(len(batch), -1))
if feature_batches:
  packet['extractedfeature'] = np.concatenate(feature_batches, axis=0).astype('float64')
else:
  # No images: keep the empty float64 array the per-image version produced.
  packet['extractedfeature'] = np.array([], dtype='float64')

In [None]:
# Shape of the feature matrix; presumably (n, 2048) for ResNet50 with average pooling (confirm on first run).
print(packet['extractedfeature'].shape)

In [None]:
# Project the CNN features onto 10 UMAP dimensions (fixed random seed).
reducer = umap.UMAP(n_components=10, n_neighbors=20, min_dist=0.0, random_state=42)
cnn_umap = reducer.fit_transform(packet['extractedfeature'])

In [None]:
# Build the feature variants by appending the one-hot direction (`_d`) and
# then the one-hot length group (`_dl`) as extra columns.
cnn_d = np.concatenate((packet['extractedfeature'], packet['direction']), axis=1)
cnn_dl = np.concatenate((cnn_d, packet['lengroup']), axis=1)
cnn_umap_d = np.concatenate((cnn_umap, packet['direction']), axis=1)
cnn_umap_dl = np.concatenate((cnn_umap_d, packet['lengroup']), axis=1)

# Print the shape of each variant, one per line.
for matrix in (packet['extractedfeature'], cnn_d, cnn_dl, cnn_umap, cnn_umap_d, cnn_umap_dl):
  print(matrix.shape)

In [None]:
# Cluster the UMAP + direction + length-group features with the elbow-selected cluster count.
km_cnn_umap_dl = bestkm(cnn_umap_dl)

In [None]:
# n/100: require every cluster to hold at least 1% of the samples, so that very
# small clusters are avoided. Floored at 2 because HDBSCAN rejects a minimum
# cluster size below that, which would otherwise happen for n < 200.
min_cluster_size = max(2, n // 100)
print(min_cluster_size)
# A point needs 10 neighbours to count as a core point; larger values make the
# clustering more conservative, i.e. more points end up labelled as noise.
min_samples = 10
hd_cnn_umap_dl = hdbscan.HDBSCAN(
    min_cluster_size=min_cluster_size, min_samples=min_samples
).fit_predict(cnn_umap_dl)

In [None]:
print("HDBSCAN CNN-UMAP-Direction-Length")
# Number of distinct labels in the prediction (a noise label, if present, is counted too).
distinct_labels = set(hd_cnn_umap_dl)
print(len(distinct_labels))
print_metrics(hd_cnn_umap_dl)

In [None]:
print("HDBSCAN CNN-UMAP-Direction-Length")
# Heat map of packet types versus the HDBSCAN cluster labels.
draw_newconfmatrix(hd_cnn_umap_dl)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

# Go to the directory where your file is
%cd gdrive/MyDrive/Colab\ Notebooks
# Verify that now you see your notebooks
!ls

In [None]:
# Convert this notebook to PDF; the file name must match the notebook stored in the
# current directory. Presumably relies on the LaTeX/pandoc packages installed earlier.
!jupyter nbconvert --to pdf newcnnpre.ipynb