# CDN Client Error dataset

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from google.colab import files

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Acquire data

The Python pandas package helps us work with our datasets. We start by acquiring the training and testing datasets into pandas DataFrames. We also combine these datasets so that certain operations can be run on both of them together.

In [None]:
# Upload the data file(s) from the local machine into the Colab session
# (google.colab.files.upload opens a file picker; only useful on Google Colab)

uploaded = files.upload()

Saving CDN Client Error.csv to CDN Client Error.csv


# Ensemble Anomaly Detector

In [None]:
# Work on a copy in which every missing value is replaced by the sentinel -1.
df_model = df.fillna(value=-1)

In [None]:
# Per-column count of remaining missing values (expected to be 0 everywhere after the fill).
df_model.isna().sum()

channel_id      0
timestamp       0
host_id         0
content_type    0
protocol        0
content_id      0
geo_location    0
user_id         0
dtype: int64

In [None]:
!pip install kmodes

Collecting kmodes
  Downloading kmodes-0.11.1-py2.py3-none-any.whl (19 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.11.1


In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import KMeans,DBSCAN
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score,davies_bouldin_score

# Columns fed to the clustering / anomaly-detection models.
features=[
  'channel_id',
  'host_id',
  'content_type',
  'protocol',
  'content_id',
  'geo_location',
  'user_id',
]


def compute_score(inertia,silhouette_euclidian,silhouette_manhattan,silhouette_cosine,davies_bouldin):
  """Fold several clustering-quality measures into one number.

  The result is the sum of the three silhouette values, the reciprocal of
  ``log(inertia)`` and the reciprocal of the Davies-Bouldin value.

  Args:
    inertia: clustering cost; its natural logarithm is taken.
    silhouette_euclidian: silhouette value computed with the Euclidean metric.
    silhouette_manhattan: silhouette value computed with the Manhattan metric.
    silhouette_cosine: silhouette value computed with the cosine metric.
    davies_bouldin: Davies-Bouldin value.

  Returns:
    The combined score.
  """
  silhouette_total=silhouette_euclidian+silhouette_manhattan+silhouette_cosine
  inertia_term=1/np.log(inertia)
  davies_bouldin_term=1/davies_bouldin
  return silhouette_total+inertia_term+davies_bouldin_term

# Model selection for KModes: fit one model per candidate cluster count and
# rank the candidates with compute_score.
n_clusters=list(range(2,12))

# The sample does not depend on the cluster count, so it is built once
# instead of on every loop iteration.
df_trunc=df_model[features].iloc[:5000].astype("category")

scores=[]
for n_cluster in n_clusters:
  # Fixed seed so that repeated runs pick the same cluster count.
  k_modes=KModes(n_clusters=n_cluster,random_state=0)
  pred=k_modes.fit_predict(df_trunc)

  inertia=k_modes.cost_
  print(inertia)
  # NOTE(review): these metrics compute numeric distances on the category
  # values themselves - confirm that is meaningful for these id columns.
  silhouette_euclidian=silhouette_score(df_trunc,pred)
  silhouette_manhattan=silhouette_score(df_trunc,pred,metric="manhattan")
  silhouette_cosine=silhouette_score(df_trunc,pred,metric="cosine")
  davies_bouldin=davies_bouldin_score(df_trunc,pred)

  scores.append(compute_score(inertia,silhouette_euclidian,silhouette_manhattan,silhouette_cosine,davies_bouldin))

print(f"Scores Computed {scores}")
# Look the winner up in the candidate list rather than assuming it starts at 2.
print(n_clusters[np.argmax(scores)])



# KMeans - KPrototype - Kmode  


# Fit three unsupervised detectors on the same leading slice of the data.
# The slice is built once and its size is named, instead of repeating the
# literal 50000 in every call.
n_rows_detectors=50000
df_detectors=df_model[features].iloc[:n_rows_detectors]

# Seed the forest so the flagged rows are the same on every run.
model_1=IsolationForest(random_state=0)
model_2=OneClassSVM()
model_dbscan=DBSCAN()


pred_1=model_1.fit_predict(df_detectors)
pred_2=model_2.fit_predict(df_detectors)
pred_dbscan=model_dbscan.fit_predict(df_detectors)



In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import KMeans,DBSCAN
from sklearn.metrics import silhouette_score,davies_bouldin_score

# Input columns for the KMeans model selection below.
features=[
  'channel_id', 'host_id', 'content_type', 'protocol',
  'content_id', 'geo_location', 'user_id',
]


def compute_score(inertia,silhouette_euclidian,silhouette_manhattan,silhouette_cosine,davies_bouldin):
  """Return one combined clustering score.

  Sums the Euclidean, Manhattan and cosine silhouette values, then adds
  ``1/log(inertia)`` and ``1/davies_bouldin``.
  """
  score=silhouette_euclidian+silhouette_manhattan
  score=score+silhouette_cosine
  score=score+1/np.log(inertia)
  score=score+1/davies_bouldin
  return score

# Model selection for KMeans: fit one model per candidate cluster count on a
# one-hot encoded sample and rank the candidates with compute_score.
n_clusters=list(range(2,12))

# The encoded sample is identical for every candidate, so it is built once
# instead of on every loop iteration.
df_trunc=pd.get_dummies(df_model[features].iloc[:20000].astype("category"))

scores=[]
for n_cluster in n_clusters:
  # Fixed seed so that repeated runs pick the same cluster count.
  k_mean=KMeans(n_clusters=n_cluster,random_state=0)
  pred=k_mean.fit_predict(df_trunc)

  inertia=k_mean.inertia_
  silhouette_euclidian=silhouette_score(df_trunc,pred)
  silhouette_manhattan=silhouette_score(df_trunc,pred,metric="manhattan")
  silhouette_cosine=silhouette_score(df_trunc,pred,metric="cosine")
  davies_bouldin=davies_bouldin_score(df_trunc,pred)
  scores.append(compute_score(inertia,silhouette_euclidian,silhouette_manhattan,silhouette_cosine,davies_bouldin))

print(f"Scores Computed {scores}")
# Look the winner up in the candidate list rather than assuming it starts at 2.
print(n_clusters[np.argmax(scores)])

# Score per candidate cluster count. Uses an explicit Axes instead of the
# implicit pyplot state and gives the figure a title so it stands alone.
fig,ax=plt.subplots()
ax.plot(n_clusters,scores,"-o")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Custom Evaluation Score")
ax.set_title("KMeans: evaluation score per number of clusters");  # trailing ';' hides the Text(...) repr



In [None]:
# Share of rows that the IsolationForest labelled -1.
np.count_nonzero(pred_1==-1)/len(pred_1)

In [None]:
# Share of rows that the OneClassSVM labelled -1.
np.count_nonzero(pred_2==-1)/len(pred_2)

In [None]:
# Share of rows that DBSCAN labelled -1.
np.count_nonzero(pred_dbscan==-1)/len(pred_dbscan)

In [None]:
def consus_anomaly(pred_1,pred_2,pred_3):
  """Flag the samples that all three detectors label as -1.

  Args:
    pred_1, pred_2, pred_3: equally shaped sequences/arrays of labels, one
      entry per sample, where -1 is the label being voted on.

  Returns:
    Integer numpy array with 1 where all three inputs are -1 and 0 elsewhere.

  Raises:
    ValueError: if the three inputs do not have the same shape (previously the
      extra entries were silently dropped).
  """
  votes=[np.asarray(pred) for pred in (pred_1,pred_2,pred_3)]
  if not (votes[0].shape==votes[1].shape==votes[2].shape):
    raise ValueError(
      "pred_1, pred_2 and pred_3 must have the same shape, got "
      f"{votes[0].shape}, {votes[1].shape} and {votes[2].shape}"
    )
  # Element-wise comparison replaces the per-sample Python loop.
  unanimous=(votes[0]==-1)&(votes[1]==-1)&(votes[2]==-1)
  return unanimous.astype(int)


In [None]:
# 1 where IsolationForest, OneClassSVM and DBSCAN all returned -1, else 0.
preds=consus_anomaly(
  pred_1=pred_1,
  pred_2=pred_2,
  pred_3=pred_dbscan,
)

In [None]:
# Number of rows flagged by all three detectors.
np.count_nonzero(preds==1)

In [None]:
# Share of rows flagged by all three detectors.
np.count_nonzero(preds==1)/len(preds)

In [None]:
# Export the feature columns as whitespace-separated integers.
features = ['channel_id', 'host_id', 'content_type', 'protocol', 'content_id', 'geo_location', 'user_id']
df = df.fillna(-1)
# Relative path: the former absolute Windows path (c:\data\np.txt) does not
# exist on Colab/Linux, where it would be taken as a literal file name.
export_path = 'np.txt'
np.savetxt(export_path, df[features].values, fmt='%d')

In [None]:
# Feature columns as a plain numpy array.
df[features].to_numpy()

# AutoEncoders

In [None]:
from keras.models import Model
from keras.layers import Input,Dense,Dropout
from keras.losses import CosineSimilarity



features=['channel_id','host_id', 'content_type', 'protocol','content_id', 'geo_location', 'user_id']

# pandas >= 2.0 returns boolean dummy columns by default; request floats so the
# frame can serve as both the network input and the regression target.
# NOTE(review): get_dummies only expands object/category columns - if these
# columns are numeric ids they pass through unchanged; confirm that is intended.
df_model_ohe=pd.get_dummies(df_model[features],dtype="float32")
n_inputs=df_model_ohe.shape[1]

# `shape` is given as a tuple (the documented form) instead of a bare int.
inp=Input(shape=(n_inputs,))

# Encoder.
# NOTE(review): none of the Dense layers has an activation, so the whole
# network is a composition of linear maps; consider non-linear activations.
enc=Dense(256)(inp)
enc=Dense(256)(enc)

# Bottleneck with 15 units.
latent_space=Dense(15)(enc)

# Decoder mirrors the encoder and reconstructs all input columns.
dec=Dense(256)(latent_space)
dec=Dense(256)(dec)
out=Dense(n_inputs)(dec)

model=Model(inp,out)

model.compile(loss="mse",optimizer="adam",metrics=["mse"])

# Autoencoder training: the input is also the target.
model.fit(df_model_ohe,df_model_ohe,batch_size=32,epochs=10,validation_split=0.2)