In [3]:
#Important libraries
from zipfile import ZipFile
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import  OneHotEncoder

#Optimization tools
from cvxopt import solvers, matrix, spmatrix, sparse

In [None]:
#Synthetic dataset
synth_train = pd.read_csv('/content/drive/MyDrive/Degree Project/train3.zip', delimiter=',')
synth_test = pd.read_csv('/content/drive/MyDrive/Degree Project/test3.zip', delimiter=',')
def data(df):
  Y = df['Label']                                  #Labels
  Y = Y.to_numpy()
  X = df.to_numpy()
  X = np.delete(X, -1, axis=1)
  X1 = X[:,[0,1,3,5]]                                    #Categorical Features
  X2 = X[:, [2,4,6]]                                     #Numerical Features
  X1 = OneHotEncoder().fit_transform(X1).toarray() #Encode categorical features
  X = np.concatenate((X1,X2), axis=1)
  X = np.asarray(X).astype('float32')
  Y = np.asarray(Y).astype('float32')
  return X,Y

X_train, Y_train = data(synth_train)
X_test, Y_test = data(synth_test)

In [None]:
#Synthetic dataset
synth_train = pd.read_csv('/content/drive/MyDrive/Degree Project/train3.zip', delimiter=',')
synth_test = pd.read_csv('/content/drive/MyDrive/Degree Project/test3.zip', delimiter=',')
def data(df):
  Y = df['Label']                                  #Labels
  Y = Y.to_numpy()
  X = df.to_numpy()
  X = np.delete(X, -1, axis=1)
  X1 = X[:,0:2]                                    #Categorical Features
  X2 = X[:, 2:]                                    #Numerical Features
  X1 = OneHotEncoder().fit_transform(X1).toarray() #Encode categorical features
  X = np.concatenate((X1,X2), axis=1)
  X = np.asarray(X).astype('float32')
  Y = np.asarray(Y).astype('float32')
  return X,Y

X_train, Y_train = data(synth_train)
X_test, Y_test = data(synth_test)

In [6]:
#Read datasets
#KDD-Cup 99 dataset
KDD_train = pd.read_csv('/content/drive/MyDrive/Degree Project/kddcup.data_10_percent.gz', compression='gzip').dropna() 
KDD_test = pd.read_csv('/content/drive/MyDrive/Degree Project/corrected.gz', compression='gzip').dropna()
KDD_train.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

KDD_test.columns = KDD_train.columns
#Remove duplicates
KDD_train = KDD_train.drop_duplicates(keep=False)
KDD_test = KDD_test.drop_duplicates(keep=False)
#Change outcome to 0 if normal and 1 if anomalous
KDD_train['outcome'] = (KDD_train['outcome']!='normal.')*1
KDD_test['outcome'] = (KDD_test['outcome']!='normal.')*1
def data(df):
  X = df.to_numpy()
  Y = X[:,-1]
  X = np.delete(X, -1, axis=1)
  X1 = np.array([X[:,0]]).transpose()
  X2 = OneHotEncoder().fit_transform(X[:,1:2]).toarray()
  X3 = X[:,4:]
  X = np.concatenate((X1,X2,X3), axis=1)
  X = np.asarray(X).astype('float32')
  Y = np.asarray(Y).astype('float32')
  normal_indx = np.where(Y==0)                      #Index of normal observations
  anomaly_indx = np.where(Y==1)                     #Index of anomalies
  X_normal  = X[normal_indx]
  X_anomaly = X[anomaly_indx]
  Y_normal  = Y[normal_indx]
  Y_anomaly = Y[anomaly_indx]
  return X,X_normal,X_anomaly,Y,Y_normal,Y_anomaly

X_train,X_normal_train,X_anomaly_train,Y_train,Y_normal_train,Y_anomaly_train = data(KDD_train)
X_test,X_normal_test,X_anomaly_test,Y_test,Y_normal_test,Y_anomaly_test       = data(KDD_test)

In [7]:
train_observations = len(KDD_train)
test_observations = len(KDD_test)
train_normal = np.sum(KDD_train['outcome']==0)
test_normal = np.sum(KDD_test['outcome']==0)
train_anomalous = train_observations - train_normal
test_anomalous = test_observations - test_normal
train_imbalance = round(train_anomalous/train_observations*100,2)
test_imbalance = round(test_anomalous/test_observations*100,2)

data = {'Observations': [train_observations, test_observations], 'Normal': [train_normal, test_normal], 'Anomalous': [train_anomalous, test_anomalous], 'Imbalance': [train_imbalance,test_imbalance]}
KDD_train_info = pd.DataFrame(data, columns=['Observations', 'Normal', 'Anomalous', 'Imbalance'], index=['Train', 'Test'])
print(KDD_train_info)

       Observations  Normal  Anomalous  Imbalance
Train        126208   85595      40613      32.18
Test          66436   47225      19211      28.92


In [None]:
#Returns the optimal transport distance between two sets
def Optimal_Transport(X1,X2):
  c = np.empty(X1.shape[0]*X2.shape[0])
  indx = 0
  for i,x1 in enumerate(X1):
    for j,x2 in enumerate(X2):
      c[indx] = np.linalg.norm(x2-x1,2)**2
      indx += 1 
  c = matrix(c/np.max(c))
  A1 = np.kron(np.eye(X1.shape[0]),np.ones(X2.shape[0]))
  A2 = np.zeros((X2.shape[0], X1.shape[0]*X2.shape[0]))
  for i in range(0,X2.shape[0]):
    for j in range(0,X1.shape[0]*X2.shape[0],X2.shape[0]):
      A2[i,i+j] = 1
  A2[X2.shape[0]-1,X1.shape[0]*X2.shape[0]-1]=0 #Prevents A from being a singular matrix
  A = matrix(np.concatenate((A1, A2), axis=0))
  G = -spmatrix(1, range(X1.shape[0]*X2.shape[0]), range(X1.shape[0]*X2.shape[0]))
  b = matrix(np.concatenate(((1/X1.shape[0])*np.ones([X1.shape[0]]),(1/X2.shape[0])*np.ones([X2.shape[0]])), axis=0))
  h = matrix(np.zeros(X1.shape[0]*X2.shape[0]))

  solvers.options['reltol'] = 1e-4
  solvers.options['abstol'] = 1e-4
  solvers.options['feastol'] = 1e-4
  sol = solvers.lp(c,G,h,A,b)

  return sol

In [None]:
#Sample from KDD
indx1 = np.random.choice(train_normal, 684, replace=False)
indx2 = np.random.choice(train_anomalous, 218, replace=False)
X1 = X_normal_train[indx1]
X2 = X_anomaly_train[indx2]

In [None]:
#Sample from Synthetic
indx1 = np.random.choice(10**6, 1500, replace=False)
indx2 = np.random.choice(750, 75, replace=False)+10**6
X1 = X_train[indx1]
X2 = X_train[indx2]

In [None]:
sol = Optimal_Transport(X1,X2)
print(sol['primal objective'])

     pcost       dcost       gap    pres   dres   k/t
 0:  7.5946e-02  7.5946e-02  2e+05  3e+02  2e+01  1e+00
 1: -1.6933e+00 -1.7061e+00  5e+03  8e+00  3e-01  1e-02
 2: -8.3601e-01 -8.3693e-01  1e+02  4e-01  2e-02  4e-04
 3: -4.5535e-01 -4.5562e-01  4e+01  2e-01  1e-02  4e-04
 4: -1.4835e-01 -1.4835e-01  4e+00  5e-02  2e-03  2e-04
 5: -4.2462e-02 -4.2461e-02  7e-01  2e-02  8e-04  6e-05
 6:  4.9869e-03  4.9867e-03  2e-01  7e-03  3e-04  2e-05
 7:  7.9667e-03  7.9651e-03  1e-01  6e-03  3e-04  2e-05
 8:  1.9508e-02  1.9507e-02  5e-02  3e-03  1e-04  7e-06
 9:  2.0859e-02  2.0857e-02  4e-02  2e-03  1e-04  4e-06
10:  2.4520e-02  2.4519e-02  2e-02  9e-04  4e-05  1e-06
11:  2.4924e-02  2.4922e-02  1e-02  6e-04  3e-05  7e-07
12:  2.5810e-02  2.5810e-02  3e-03  2e-04  8e-06  1e-07
13:  2.5968e-02  2.5968e-02  2e-03  1e-04  5e-06  5e-08
14:  2.6053e-02  2.6053e-02  8e-04  5e-05  2e-06  2e-08
15:  2.6090e-02  2.6090e-02  3e-04  2e-05  9e-07  7e-09
16:  2.6104e-02  2.6104e-02  1e-04  8e-06  4e-07  