$\huge \text{Synthetic data set and KDD 99 data set}$

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from timeit import default_timer as timer

In [None]:
def compute_Q(A):
  n=len(A) # number of data points
  Q_matrix=np.matmul(A.T,A)/n
  return Q_matrix

In [None]:
iteration_num=0
tolerance=1
matrix_decomposition=2
power_method=3

In [None]:
def compute_principal_direc_power(A, tol=1e-6, max_iter=1000):
  n = A.shape[0]
  x = np.ones(n) / np.sqrt(n)
  delta = 1

  for i in range(max_iter):
      x_new = A @ x
      x_new = x_new / np.linalg.norm(x_new)
      delta = np.linalg.norm(x_new - x)
      x = x_new

      if delta < tol:
          break

  return x

In [None]:

def compute_cosine(a,b):
  assert type(a) is np.ndarray and type(b) is np.ndarray
  return (np.dot(a,b))/(np.linalg.norm(a)*np.linalg.norm(b))

In [None]:
def compute_thres(score,k=2):
  a=np.array(score)
  return  np.mean(a)-k*np.std(a), np.mean(a)+k*np.std(a)

In [None]:
# let's define a seed value
seed=17

In [None]:
def guass_2d(mu,std,size):
  data=[]
  np.random.seed(seed)
  for i in range(size):
    x=np.random.normal(mu,std)
    y=np.random.normal(mu,std)
    data.append(x)
    data.append(y)
  return pd.DataFrame(np.array(data).reshape(size,2))

In [None]:
def dchange(x):
  return np.array(list(x))

$\large \text{PCAOD: Principal Component Analysis Otlier Detection without Oversampling with LOO(leave one out) and with or without strategy}$

In [None]:
def PCAOD(data):
  start=timer()
  outlier_score=[]
  data_score={}
  Q_mat_init=compute_Q(np.array(data))
  init_dir=compute_principal_direc_power(Q_mat_init)
  for i in range(len(data)):

    data_rem=data.drop(i)

    Q_mat=compute_Q(np.array(data_rem))
    dir=compute_principal_direc_power(Q_mat)

    cosine_val=compute_cosine(init_dir,dir)

    outlier_scr=1-abs(cosine_val)

    outlier_score.append(outlier_scr)
    data_score[tuple(data.iloc[i])]=outlier_scr

  end=timer()

  return outlier_score,end-start






$\large \text{Over-sampling principal Component Analysis outlier detection(used oversampling technique)}$

In [None]:
def OPCAOD(dat,r=0.1):
  start=timer()
  n=len(dat) # number of data points
  dim=dat.shape[1]
  #data_score={}
  Q_mat_init=compute_Q(np.array(dat))
  mu_init=np.array(dat.mean())
  prin_dir_init=compute_principal_direc_power(Q_mat_init)
  #prin_dir_list=[]
  #cosine_sim_list=[]
  outlier_score_list=[]
  m=(1+r)
  for i in range(len(dat)):
    mu=((mu_init)+r*np.array(dat.iloc[i]))/m

    data_point=np.array(dat.iloc[i])
    cov_mat=(Q_mat_init/m)+((r/m)*np.outer(data_point,data_point))-np.outer(mu,mu)
    # finding principal direction
    prin_dir=compute_principal_direc_power(cov_mat)
    #prin_dir_list.append(prin_dir)
    # finding cosine similiraty (cos theta)
    cosine_sim=compute_cosine( prin_dir_init,prin_dir)
    #cosine_sim_list.append(cosine_sim)
    # h
    outlier_score=(1-abs(cosine_sim))
    outlier_score_list.append(outlier_score)
    #data_score[tuple(dat.iloc[i])]=outlier_score


  end=timer()
  #Sc=np.array(outlier_score_list)
  #mean=Sc.mean()
  #outlier_score_list=abs(Sc-mean)

  return Q_mat_init,mu_init,prin_dir_init,outlier_score_list,end-start

In [None]:
def online_anomaly(dat,Q_mat_init,mu_init,prin_dir_init,thres,r=0.1):
  m=1+r
  pred=[]
  for i in range(len(dat)):
    mu=((mu_init)+r*np.array(dat.iloc[i]))/m

    data_point=np.array(dat.iloc[i])
    cov_mat=(Q_mat_init/m)+((r/m)*np.outer(data_point,data_point))-np.outer(mu,mu)
    # finding principal direction
    prin_dir=compute_principal_direc_power(cov_mat)
    #prin_dir_list.append(prin_dir)
    # finding cosine similiraty (cos theta)
    cosine_sim=compute_cosine( prin_dir_init,prin_dir)
    #cosine_sim_list.append(cosine_sim)
    # h
    outlier_score=(1-abs(cosine_sim))

    if outlier_score<=thres[0] or outlier_score>=thres[1]:
      pred.append(1)
    else:
      pred.append(0)
    #data_score[tuple(dat.iloc[i])]=outlier_score
  return pred



In [None]:
data=pd.read_csv('kddcup.data_10_percent.gz',header=None)

$\huge \text{Preprocessing of the data:}$

In [None]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494018,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494019,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [None]:
dos_list=['back.','land.','neptune.','pod.','smurf.','teardrop.']
u2r_list=['buffer_overflow.','loadmodule.','perl.','rootkit.']
r2l_list=['ftp_write.','imap.','multihop.','phf.','guess_passwd.','spy.','warezmaster.','warezclient.']
probe_list=['ipsweep.','nmap.','portsweep.','satan.']

In [None]:
def attact(x):
  if x in dos_list:
    return 'dos'
  elif x in u2r_list:
    return 'u2r'
  elif x in r2l_list:
    return 'r2l'
  elif x in probe_list:
    return 'probe'
  else:
    return 'normal'

In [None]:
data['attact']=data[41]
data['attact']=data['attact'].apply(attact)

In [None]:
data=data.drop([1,2,3,41],axis=1)
data.columns=[i for i in range(38)]+['attack']

In [None]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,attack
0,0,181,5450,0,0,0,0,0,1,0,...,9,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal
1,0,239,486,0,0,0,0,0,1,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal
2,0,235,1337,0,0,0,0,0,1,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
3,0,219,1337,0,0,0,0,0,1,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
4,0,217,2032,0,0,0,0,0,1,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,310,1881,0,0,0,0,0,1,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal
494017,0,282,2286,0,0,0,0,0,1,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal
494018,0,203,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal
494019,0,291,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal


In [None]:
data_normal=data[data['attack']=='normal']
data_dos=data[data['attack']=='dos']
data_u2r=data[data['attack']=='u2r']
data_r2l=data[data['attack']=='r2l']
data_probe=data[data['attack']=='probe']

In [None]:
data_dos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,attack
7601,0,0,0,0,0,0,0,0,0,0,...,6,1.00,0.00,0.20,0.33,1.0,0.83,0.0,0.0,dos
7602,0,0,0,0,0,0,0,0,0,0,...,16,1.00,0.00,0.07,0.12,1.0,0.94,0.0,0.0,dos
7793,0,1032,0,0,0,0,0,0,0,0,...,3,0.02,0.02,0.02,0.00,0.0,0.00,0.0,0.0,dos
7794,0,1032,0,0,0,0,0,0,0,0,...,13,0.08,0.02,0.08,0.00,0.0,0.00,0.0,0.0,dos
7795,0,1032,0,0,0,0,0,0,0,0,...,23,0.14,0.02,0.14,0.00,0.0,0.00,0.0,0.0,dos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490960,0,28,0,0,3,0,0,0,0,0,...,96,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,dos
490961,0,28,0,0,3,0,0,0,0,0,...,97,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,dos
490962,0,28,0,0,3,0,0,0,0,0,...,98,0.38,0.01,0.38,0.00,0.0,0.00,0.0,0.0,dos
490963,0,28,0,0,3,0,0,0,0,0,...,99,0.39,0.01,0.39,0.00,0.0,0.00,0.0,0.0,dos


In [None]:
def att(x):
  if x=='normal':
    return 0
  else:
    return 1

In [None]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,attack
0,0,181,5450,0,0,0,0,0,1,0,...,9,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal
1,0,239,486,0,0,0,0,0,1,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal
2,0,235,1337,0,0,0,0,0,1,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
3,0,219,1337,0,0,0,0,0,1,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
4,0,217,2032,0,0,0,0,0,1,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,310,1881,0,0,0,0,0,1,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal
494017,0,282,2286,0,0,0,0,0,1,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal
494018,0,203,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal
494019,0,291,1200,0,0,0,0,0,1,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal


$ \large\text{Using training data set to compute initial direction and threshold value over score:}$

In [None]:
train_data=data_normal.sample(2000,random_state=34).reset_index(drop=True)

In [None]:
train_data['attack']=train_data['attack'].apply(att)

In [None]:
train_X=train_data[train_data.columns[:-1]]

In [None]:
Q_mat_init,mu_init,prin_dir_init,outlier_score_list,time=OPCAOD(train_X,r=0.1)

In [None]:
thr1,thr2=compute_thres(outlier_score_list)
thr=[thr1,thr2]

In [None]:
from sklearn.metrics import confusion_matrix

$\huge \text{Normal Vs Dos data samples}$

In [None]:
normal_sample=data_normal.sample(2000,random_state=34).reset_index(drop=True)
dos_sample=data_dos.sample(100,random_state=seed).reset_index(drop=True)
data=pd.concat([normal_sample,dos_sample]).reset_index(drop=True)
data['attack']=data['attack'].apply(att)
X=data[data.columns[:-1]]
y=data[data.columns[-1]]

$\huge \text{prediction:}$



In [None]:
pred=online_anomaly(X,Q_mat_init,mu_init,prin_dir_init,thr,r=0.1)

$\huge \text{Accuracy :}$

In [None]:
y_pred=np.array(pred)
y_actual=np.array(y)
acc=np.sum(y_pred==y_actual)/len(y_pred)
print('accuracy is :',acc)

accuracy is : 0.9514285714285714


In [None]:
A=confusion_matrix(y_actual,y_pred)

In [None]:

TPR=A[0,0]/(A[0,0]+A[1,0])
FPR=A[0,1]/(A[0,1]+A[1,0])
print('True positive rate (TPR):',TPR,'And False Positive rate :',FPR)

True positive rate (TPR): 0.9523355576739753 And False Positive rate : 0.0196078431372549


$\huge \text{Normal Vs Probe attack type data samples}$

In [None]:
normal_sample=data_normal.sample(2000,random_state=34).reset_index(drop=True)
data_test_probe=data_probe.sample(150,random_state=seed).reset_index(drop=True)
data=pd.concat([normal_sample,data_test_probe]).reset_index(drop=True)
data['attack']=data['attack'].apply(att)
X=data[data.columns[:-1]]
y=data[data.columns[-1]]

In [None]:
pred=online_anomaly(X,Q_mat_init,mu_init,prin_dir_init,thr,r=0.1)

In [None]:
y_pred=np.array(pred)
y_actual=np.array(y)
acc=np.sum(y_pred==y_actual)/len(y_pred)
print('accuracy is :',acc)

accuracy is : 0.9293023255813954


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
B=confusion_matrix(y_actual,y_pred)

In [None]:

TPR=B[0,0]/(B[0,0]+B[1,0])
FPR=B[0,1]/(B[0,1]+B[1,0])
print('True positive rate (TPR):',TPR,'and False Positive rate :',FPR)

True positive rate (TPR): 0.9301675977653632 and False Positive rate : 0.013157894736842105


$\huge \text{Normal Vs u2r attack type data samples}$

In [None]:
normal_sample=data_normal.sample(2000,random_state=34).reset_index(drop=True)
data_test_u2r=data_u2r.sample(49,random_state=seed).reset_index(drop=True)
data=pd.concat([normal_sample,data_test_u2r]).reset_index(drop=True)
data['attack']=data['attack'].apply(att)
X=data[data.columns[:-1]]
y=data[data.columns[-1]]

In [None]:
pred=online_anomaly(X,Q_mat_init,mu_init,prin_dir_init,thr,r=0.1)

In [None]:
y_pred=np.array(pred)
y_actual=np.array(y)
acc=np.sum(y_pred==y_actual)/len(y_pred)
print('accuracy is :',acc)

accuracy is : 0.9751098096632503


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
c_matrix=confusion_matrix(y_actual,y_pred)

In [None]:
A=c_matrix
TPR=A[0,0]/(A[0,0]+A[1,0])
FPR=A[0,1]/(A[0,1]+A[1,0])
print('True positive rate (TPR):',TPR,'And False Positive rate :',FPR)

True positive rate (TPR): 0.9760625305324866 And False Positive rate : 0.0392156862745098


$\huge \text{Normal Vs r2l attack type data samples}$

In [None]:
normal_sample=data_normal.sample(2000,random_state=34).reset_index(drop=True)
data_test_r2l=data_r2l.sample(85,random_state=seed).reset_index(drop=True)
data=pd.concat([normal_sample,data_test_r2l]).reset_index(drop=True)
data['attack']=data['attack'].apply(att)
X=data[data.columns[:-1]]
y=data[data.columns[-1]]

In [None]:
pred=online_anomaly(X,Q_mat_init,mu_init,prin_dir_init,thr,r=0.1)

In [None]:
y_pred=np.array(pred)
y_actual=np.array(y)
acc=np.sum(y_pred==y_actual)/len(y_pred)
print('accuracy is :',acc)

accuracy is : 0.9601918465227818


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
c_matrix=confusion_matrix(y_actual,y_pred)

In [None]:
c_matrix

array([[1998,    2],
       [  81,    4]])

In [None]:
A=c_matrix
TPR=A[0,0]/(A[0,0]+A[1,0])
FPR=A[0,1]/(A[0,1]+A[1,0])
print('True positive rate (TPR):',TPR,'And False Positive rate :',FPR)

True positive rate (TPR): 0.961038961038961 And False Positive rate : 0.024096385542168676
