In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Compact array display: three decimals, and small values shown without
# scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [46]:
def _sample_rows(X, n_rows, rng):
    """Pick ``n_rows`` random rows of ``X``, without repeats whenever possible.

    Parameters
    ----------
    X : 2-D array
        Pool of rows to draw from (indexed as ``X[idx, :]``).
    n_rows : int
        Number of rows to draw.
    rng : object with a ``choice`` method
        Either the ``np.random`` module or a ``np.random.RandomState``.

    Returns
    -------
    2-D array with ``n_rows`` rows taken from ``X``.
    """
    import warnings

    population = X.shape[0]
    # Distinct rows keep the later train/val/test partitions disjoint. Drawing
    # with replacement is only used when the pool cannot supply enough rows.
    replace = population < n_rows
    if replace:
        warnings.warn(
            "requested %d rows but only %d are available; sampling with "
            "replacement, so the resulting splits may share rows"
            % (n_rows, population))
    picked = rng.choice(population, size=n_rows, replace=replace)
    return X[picked, :]


def _three_way_split(X, label_value, holdout_fraction, split_state):
    """Split ``X`` into train / validation / test parts with constant labels.

    ``holdout_fraction`` of the rows is held out and then halved into a test
    and a validation part. Returns ``(train, val, test)`` where each element
    is an ``(X_part, label_part)`` pair.
    """
    labels = np.repeat(int(label_value), X.shape[0])

    X_train, X_hold, L_train, L_hold = train_test_split(
        X, labels, test_size=holdout_fraction, shuffle=True,
        random_state=split_state)

    X_test, X_val, L_test, L_val = train_test_split(
        X_hold, L_hold, test_size=0.5, shuffle=True,
        random_state=split_state)

    return (X_train, L_train), (X_val, L_val), (X_test, L_test)


def train_val_test_splitter(Xg, Xb, n_repeats=5, settings=((0.98, 0.02),),
                            n_normal=20000, normal_per_abnormal=20,
                            random_state=None):
    """Build repeated train/validation/test splits from good and bad samples.

    For every ``setting`` and every repeat, ``n_normal`` rows are drawn from
    ``Xg`` and ``n_normal // normal_per_abnormal`` rows from ``Xb``. Each
    sample is split separately (held-out fraction ``setting[-1]``, then halved
    into test and validation) and the good and bad parts are concatenated,
    good rows first.

    Rows are drawn without replacement so that no row can appear in more than
    one of the train/validation/test parts. If a pool holds fewer rows than
    requested, sampling falls back to drawing with replacement and a warning
    is issued.

    Parameters
    ----------
    Xg : 2-D array
        Feature rows of the "good" class; they receive label 1.
    Xb : 2-D array
        Feature rows of the "bad" class; they receive label 0.
    n_repeats : int
        Number of independent resamplings per setting.
    settings : iterable of hashable sequences
        Only the last element of each setting is used, as the held-out
        fraction. The setting itself is the key in the returned dictionary.
    n_normal : int
        Number of rows drawn from ``Xg`` per repeat.
    normal_per_abnormal : int
        One row is drawn from ``Xb`` for every ``normal_per_abnormal`` rows
        drawn from ``Xg``.
    random_state : None, int or np.random.RandomState
        ``None`` uses NumPy's global random state (the previous behaviour);
        anything else makes the whole procedure reproducible.

    Returns
    -------
    dict
        ``DATA[setting][repeat]`` is a dict with the keys ``'X_tr'``,
        ``'X_vl'``, ``'X_ts'``, ``'y_tr'``, ``'y_vl'`` and ``'y_ts'``.
    """
    if random_state is None:
        rng = np.random      # module-level functions use the global state
        split_state = None   # train_test_split then uses the global state too
    elif isinstance(random_state, np.random.RandomState):
        rng = split_state = random_state
    else:
        rng = split_state = np.random.RandomState(random_state)

    DATA = {}

    for setting in settings:

        DATA[setting] = {}
        holdout_fraction = setting[-1]

        for repeat in range(n_repeats):

            Xg_ = _sample_rows(Xg, n_normal, rng)

            n_abnormalities = Xg_.shape[0] // normal_per_abnormal
            print("n_abnormalities:", n_abnormalities)

            Xb_ = _sample_rows(Xb, n_abnormalities, rng)

            (Xg_train, Lg_train), (Xg_val, Lg_val), (Xg_test, Lg_test) = \
                _three_way_split(Xg_, 1, holdout_fraction, split_state)  # Labels Good
            (Xb_train, Lb_train), (Xb_val, Lb_val), (Xb_test, Lb_test) = \
                _three_way_split(Xb_, 0, holdout_fraction, split_state)  # Labels Bad

            DATA[setting][repeat] = {
                'X_tr': np.concatenate((Xg_train, Xb_train), axis=0),
                'X_vl': np.concatenate((Xg_val, Xb_val), axis=0),
                'X_ts': np.concatenate((Xg_test, Xb_test), axis=0),
                'y_tr': np.concatenate((Lg_train, Lb_train), axis=0),
                'y_vl': np.concatenate((Lg_val, Lb_val), axis=0),
                'y_ts': np.concatenate((Lg_test, Lb_test), axis=0),
            }

    return DATA


    # with open (os.path.join('SANC_computation', name+features_type+str(size)+'.pickle'), 'wb') as fp:
    #     pickle.dump(data, fp)


In [3]:
# Read the raw data file into memory as a list with one string per line
# (line terminators are kept).
with open("kddcup.data", mode='r') as fp:
    KDD = list(fp)

In [4]:
# Sanity check: container type, number of lines read, and the character
# length of the first raw line.
print(type(KDD), len(KDD), len(KDD[0]))

<class 'list'> 4898431 147


In [5]:
# Peek at the first three and the last three characters of the first raw line.
tuple(KDD[0][pos] for pos in (0, 1, 2, -3, -2, -1))

('0', ',', 't', 'l', '.', '\n')

In [6]:
# Turn every raw line into a list of its comma-separated fields, after
# stripping the surrounding whitespace (including the trailing newline).
tmp_kdd = [raw_line.strip().split(",") for raw_line in KDD]

In [7]:
# After splitting: container type, number of records, and number of fields in
# the first record.
type(tmp_kdd), len(tmp_kdd), len(tmp_kdd[0])

(list, 4898431, 42)

In [8]:
# Count records by their last field: 'normal.' versus anything else.
normals = sum(record[-1] == 'normal.' for record in tmp_kdd)
abnormalities = sum(record[-1] != 'normal.' for record in tmp_kdd)

# The third value confirms that the two counts add up to the number of records.
print(abnormalities, normals, normals+abnormalities == len(tmp_kdd))

3925650 972781 True


In [9]:
# Empty vocabularies for the values of fields 1, 2, 3 and the last field of
# the records; they are filled by the loop in the next cell.
internet_protocols, transfer_protocols, s_types, labels = [], [], [], []
# Positions of the fields that are not parsed as plain numbers.
categoricals = [1, 2, 3, 41]

In [10]:
# Collect the distinct values of fields 1, 2, 3 and the last field, keeping
# the order in which each value is first seen (the position in these lists is
# later used as the numeric code of the value).
for record in tmp_kdd:
    for position, vocabulary in ((1, internet_protocols),
                                 (2, transfer_protocols),
                                 (3, s_types),
                                 (-1, labels)):
        value = record[position]
        if value not in vocabulary:
            vocabulary.append(value)

In [11]:
# Distinct values of the last field of the records, in order of first appearance.
labels

['normal.',
 'buffer_overflow.',
 'loadmodule.',
 'perl.',
 'neptune.',
 'smurf.',
 'guess_passwd.',
 'pod.',
 'teardrop.',
 'portsweep.',
 'ipsweep.',
 'land.',
 'ftp_write.',
 'back.',
 'imap.',
 'satan.',
 'phf.',
 'nmap.',
 'multihop.',
 'warezmaster.',
 'warezclient.',
 'spy.',
 'rootkit.']

In [12]:
# Allocate a zero-filled matrix with one row per record and one column per
# field of the first record; the cell then displays its shape.
kdd_90 = np.zeros(shape=(len(tmp_kdd), len(tmp_kdd[0])))
(kdd_90.shape,)

((4898431, 42),)

In [13]:
# list.index returns a plain Python int; such positions are what gets written
# into kdd_90 for the categorical columns.
type(internet_protocols.index('icmp'))

int

In [14]:
# Fill kdd_90 field by field.
# Columns 1-3 are replaced by the position of the value in its vocabulary
# list; column 41 becomes 0 for 'normal.' and 1 for anything else; every other
# column is parsed as a float.
categorical_encoders = {
    1: internet_protocols.index,
    2: transfer_protocols.index,
    3: s_types.index,
    41: lambda label: int(label != 'normal.'),
}

for r, row in enumerate(tmp_kdd):
    for c, i in enumerate(row):
        if c not in categoricals:
            kdd_90[r, c] = float(i)
            continue
        encode = categorical_encoders.get(c)
        if encode is not None:
            kdd_90[r, c] = encode(i)

In [15]:
# Confirm the filled matrix still has its expected shape and is a NumPy array.
kdd_90.shape, type(kdd_90)

((4898431, 42), numpy.ndarray)

In [16]:
# For each encoded column, compare its number of distinct codes with the size
# of the corresponding vocabulary. The last comparison differs because the
# last column was reduced to two values (0/1) while `labels` keeps every
# original label.
print(*(len(set(kdd_90[:, column].tolist())) == len(vocabulary)
        for column, vocabulary in ((1, internet_protocols),
                                   (2, transfer_protocols),
                                   (3, s_types),
                                   (-1, labels))))

True True True False


In [17]:
# The categorical codes are stored as floats, because kdd_90 was created with
# np.zeros and no explicit dtype.
type(kdd_90[0, 1])

numpy.float64

In [18]:
# enc = OneHotEncoder(sparse=False, categories='auto') 
# categorical_features = enc.fit_transform(kdd_90[:, 1:4])
# categorical_features.shape, kdd_90[:, 4:-1].shape


In [19]:
# kdd_final = np.concatenate([categorical_features, kdd_90[:, 4:]], axis=1)
# kdd_final.shape

In [20]:
# Xg = kdd_final[np.where(kdd_final[:,-1]==0)].astype('float32')
# Xb_ = kdd_final[np.where(kdd_final[:,-1]!=0)].astype('float32')

In [32]:
# Separate the rows by the value in the last column: 0 goes to Xg, anything
# else to Xb. Both subsets are converted to float32.
Xg = kdd_90[kdd_90[:, -1] == 0].astype(np.float32)
Xb = kdd_90[kdd_90[:, -1] != 0].astype(np.float32)

In [33]:
# Shapes of the two subsets: rows whose last column is non-zero (Xb) and rows
# whose last column is zero (Xg).
Xb.shape, Xg.shape

((3925650, 42), (972781, 42))

In [47]:
# Build the repeated splits; the last column (the label) is dropped from both
# feature matrices before splitting.
KDD_90 = train_val_test_splitter(Xg=Xg[:, :-1], Xb=Xb[:, :-1], settings=[(0.98, 0.02),])

# Make sure the output folder exists so the dump also works on a fresh
# checkout (open(..., 'wb') does not create missing directories).
os.makedirs('matrices', exist_ok=True)

with open(os.path.join('matrices', 'KDD_90.pickle'), 'wb') as fp:
    pickle.dump(KDD_90, fp)

# Read the file back to verify the round trip. The file was written just
# above, so unpickling it is safe; do not unpickle files from untrusted sources.
with open(os.path.join('matrices', 'KDD_90.pickle'), 'rb') as fp:
    tmp_ = pickle.load(fp)


# Report the train/validation shapes of every repeat; `l` is a running counter
# across all settings.
l = 0
for settings, repeats in tmp_.items():
    print("setting:", settings)
    for repeat, matrices in repeats.items():
        print(l, repeat, matrices['X_tr'].shape, matrices['X_vl'].shape)
        l += 1

n_abnormalities: 1000
n_abnormalities: 1000
n_abnormalities: 1000
n_abnormalities: 1000
n_abnormalities: 1000
setting: (0.98, 0.02)
0 0 (20580, 41) (210, 41)
1 1 (20580, 41) (210, 41)
2 2 (20580, 41) (210, 41)
3 3 (20580, 41) (210, 41)
4 4 (20580, 41) (210, 41)
