In [11]:
# IMPORTING REQUIRED PACKAGES
import pandas as pd
from scipy.io import arff
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from random import randint, random
import sys

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from imblearn.over_sampling import SMOTE

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [6]:
from sklearn.metrics import log_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

## Multi-class classification

In [7]:
# Column headers supplied to pd.read_csv for the header-less KDDTrain+/KDDTest+ files.
# The last two entries are the 'class' label and the 'difficulty' column, which read()
# drops right after loading; everything before them is a feature.
column_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class','difficulty']

In [33]:
# Maps each traffic category name to the integer that change_label() writes into the
# 'class' column; the integers are also used as list positions in calculate_class_miss_rate().
class_dict = {'DOS':0, 'R2L':1, 'PROBE':2, 'U2R':3, 'NORMAL':4}  # custom labels hard coded

# Reading and preprocessing
def read(train_path=r'../../Datasets/NSL_KDD/KDDTrain+.txt',
         test_path=r'../../Datasets/NSL_KDD/KDDTest+.txt'):
    """Load, relabel, scale and one-hot encode the train and test splits.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the comma-separated, header-less train and test files.
        The defaults are the paths this notebook has always used.

    Returns
    -------
    (train_x, train_y, test_x, test_y)
        Feature frames share the same alphabetically sorted columns; a dummy
        column that exists in only one split is added to the other as 0.0.
        Targets are the integer labels written by change_label().
    """
    def _load(path):
        # One split: read -> drop 'difficulty' -> integer labels -> features/target.
        frame = pd.read_csv(path, sep=',', header=None, names=column_names)
        frame = frame.drop(columns=['difficulty'])
        change_label(frame)
        # Explicit copy: normalization() writes into the frame it is given, and
        # writing into a slice of `frame` is what raised SettingWithCopyWarning.
        features = frame[frame.columns[:-1]].copy()
        # NOTE(review): normalization() fits a fresh StandardScaler on every call,
        # so the test split is scaled with its own statistics rather than the
        # training ones. Kept as-is here; confirm whether that is intended.
        normalization(features)
        return one_hot(features), frame[frame.columns[-1]]

    train_x, train_y = _load(train_path)
    test_x, test_y = _load(test_path)

    # Align both splits on the union of their columns, in sorted order.
    total_columns = sorted(set(train_x.columns) | set(test_x.columns))
    train_x = train_x.reindex(columns=total_columns, fill_value=0.0)
    test_x = test_x.reindex(columns=total_columns, fill_value=0.0)
    return train_x, train_y, test_x, test_y

def change_label(df): # 5-classes including normal
    """Replace the attack names in df['class'] with integer category labels, in place.

    Each attack name is first mapped to one of the five categories
    (DOS, R2L, PROBE, U2R, NORMAL) and then to that category's integer in
    ``class_dict``. A value that is already a category name is accepted as-is.

    Raises
    ------
    KeyError
        If the column contains a value that is neither a known attack name
        nor a key of ``class_dict``.
    """
    attack_groups = {
        'DOS': ['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod', 'processtable',
                'smurf', 'teardrop', 'udpstorm', 'worm'],
        'R2L': ['ftp_write', 'guess_passwd', 'httptunnel', 'imap', 'multihop', 'named',
                'phf', 'sendmail', 'snmpgetattack', 'snmpguess', 'spy', 'warezclient',
                'warezmaster', 'xlock', 'xsnoop'],
        'PROBE': ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan'],
        'U2R': ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack',
                'xterm'],
        'NORMAL': ['normal'],
    }
    category_of = {attack: category
                   for category, attacks in attack_groups.items()
                   for attack in attacks}

    # One pass over the column instead of five chained `replace(..., inplace=True)`
    # calls, which are deprecated and do nothing under pandas copy-on-write.
    categories = df['class'].map(lambda label: category_of.get(label, label))

    unknown = categories[~categories.isin(list(class_dict))]
    if not unknown.empty:
        raise KeyError("unknown class label(s): %s" % sorted(set(map(str, unknown))))

    df['class'] = categories.map(class_dict)
    
def one_hot(df): # 3 categorical variables
    """Return a copy of ``df`` with the three categorical columns one-hot encoded.

    'protocol_type', 'service' and 'flag' are removed and replaced by one
    indicator column per observed value, named ``<column>_<value>``. The input
    frame is not modified.

    The column order is deterministic: untouched columns keep their original
    order and the indicator columns follow. Building the column list from a
    ``set`` made the order vary from run to run.
    """
    category_columns = ['protocol_type','service','flag']
    return pd.get_dummies(df, columns=category_columns)
    
def normalization(df): #Normalization
    """Standardise the float columns of ``df`` in place.

    Only columns selected by ``select_dtypes(include='float')`` are touched;
    each call fits its own StandardScaler on the frame it receives. When the
    frame has no float columns or no rows it is left unchanged, because
    StandardScaler raises on an empty selection.
    """
    numeric_col = df.select_dtypes(include='float').columns
    if len(numeric_col) > 0 and len(df) > 0:
        df[numeric_col] = StandardScaler().fit_transform(df[numeric_col])
    print("finished")

In [47]:
def correlation_finder(): # Initialized optimal features using correlations
    """Build seed individuals for the GA from feature correlations.

    Reads the data set, finds feature pairs whose correlation exceeds 0.90 and,
    in each of 20 draws, keeps one randomly chosen member of every pair.
    Distinct draws become individuals. Every individual additionally switches
    on the features whose correlation with 'class' exceeds 0.50.

    Returns a list of 0/1 lists, one position per column of the training frame.
    """
    train_x, train_y, test_x, test_y = read()
    correlated_pairs = correlation(train_x,0.90)

    # A set of frozensets, so identical draws collapse into one candidate.
    candidate_sets = set()
    for _ in range(20):
        candidate_sets.add(frozenset(pair[randint(0, 1)] for pair in correlated_pairs))

    labelled = train_x.copy()
    labelled['class'] = train_y
    always_selected = correlation_test(labelled,0.50)-{'class'}

    column_list = list(train_x.columns)
    always_positions = [column_list.index(name) for name in always_selected]

    seeded = []
    for chosen in candidate_sets:
        bits = [0]*len(column_list)
        for position in always_positions:
            bits[position] = 1
        for name in chosen:
            bits[column_list.index(name)] = 1
        seeded.append(bits)
    return seeded

def correlation(df, threshold):
    """Return the pairs of columns of ``df`` that are strongly correlated.

    A pair is reported when the absolute value of its correlation coefficient
    is greater than ``threshold``, so strong negative correlations count as
    well. Each pair is a tuple ``(later_column, earlier_column)`` in column
    order, and appears once.
    """
    corr_matrix = df.corr()
    names = corr_matrix.columns
    coefficients = corr_matrix.to_numpy()  # read once instead of one .iloc lookup per cell
    col_corr = set()
    for i in range(len(names)):
        for j in range(i):  # strictly below the diagonal: each pair once, no self-pairs
            if abs(coefficients[i, j]) > threshold:
                col_corr.add((names[i], names[j]))
    return col_corr

def correlation_test(df, threshold):
    """Return the names of the columns whose correlation with 'class' exceeds ``threshold``.

    The comparison is on the signed coefficient, so strongly negative
    correlations are not selected. 'class' itself is part of the result
    whenever ``threshold`` is below 1; callers remove it.
    """
    corr_matrix = df.corr()
    return {name for name in corr_matrix.columns
            if corr_matrix[name]['class'] > threshold}

In [None]:
def get_selected_features_data(individual, x): # selects features based on individual
    """Return a copy of ``x`` restricted to the columns switched on in ``individual``.

    ``individual`` is a sequence of 0/1 flags; position ``k`` refers to the
    ``k``-th column of ``x``. Only positions equal to 1 are kept.
    """
    available = x.columns
    chosen = [available[position] for position, flag in enumerate(individual) if flag == 1]
    return x[chosen].copy()

In [None]:
def generate_population(population_size, len_x,col): # Population Generation
    """Create the initial GA population.

    ``population_size`` individuals of ``len_x`` random bits are generated
    first; the leading ones are then overwritten by the correlation-based
    seeds from correlation_finder(), as many as fit. ``col`` is accepted but
    not used.
    """
    population = [[randint(0, 1) for _ in range(len_x)]
                  for _ in range(population_size)]
    seeded = correlation_finder()
    for slot in range(min(len(seeded), population_size)):
        population[slot] = seeded[slot]
    return population

In [None]:
def predicted_y(class_prob): # Return predicted class labels
    """Return, for every row of ``class_prob``, the position of its largest entry."""
    return [row.argmax() for row in class_prob]

In [None]:
def fitness(individual, clf, data): # Fitness function evaluation
    """Cost of a feature subset: ``1 - Cohen's kappa`` on the test split (lower is better).

    Parameters
    ----------
    individual : sequence of 0/1 flags, one per feature column.
    clf : classifier exposing ``fit``, ``predict_proba`` and ``classes_``;
        it is (re)fitted on every call.
    data : ``[train_x, train_y, test_x, test_y]``.

    Returns
    -------
    float
        ``1 - kappa``. An individual that selects no feature is given 1.0,
        i.e. kappa 0 (chance-level agreement), instead of crashing in ``fit``.
    """
    if not any(flag == 1 for flag in individual):
        return 1.0
    selected_train_x = get_selected_features_data(individual, data[0])
    selected_test_x = get_selected_features_data(individual, data[2])
    clf = clf.fit(selected_train_x.to_numpy(), data[1].values.tolist())
    class_prob = clf.predict_proba(selected_test_x.to_numpy())
    # predict_proba columns follow clf.classes_, so the argmax is a position in
    # that array and has to be translated back into the actual class label.
    predict_y = np.asarray(clf.classes_)[np.argmax(class_prob, axis=1)].tolist()
    fitness_ans = 1-cohen_kappa_score(data[3],predict_y) # taking 1- because we are minimizing cost
    return fitness_ans

In [None]:
def calculate_class_miss_rate(test_y, predict_y):
    """Per-class percentage of misclassified samples.

    Parameters
    ----------
    test_y, predict_y : equally long iterables of integer class labels
        (the values of ``class_dict``). They are read positionally, so a
        pandas Series with any index is accepted.

    Returns
    -------
    list
        One entry per class, positioned by its integer label. Rates of 1% or
        less are reported as 1 so that one near-perfect class does not
        collapse a product-style fitness to 0. A class with no sample in
        ``test_y`` has no defined rate and is reported as NaN.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    KeyError
        If ``test_y`` contains a label that is not a value of ``class_dict``.
    """
    actual_labels = list(test_y)
    predicted_labels = list(predict_y)
    if len(actual_labels) != len(predicted_labels):
        raise ValueError("test_y and predict_y must have the same length")

    n_classes = len(class_dict)
    # Canonical integer for every valid label; lookup fails loudly on unknown labels.
    position_of = {label: label for label in class_dict.values()}
    miss = [0]*n_classes
    total = [0]*n_classes
    for actual, predicted in zip(actual_labels, predicted_labels):
        index = position_of[actual]
        total[index] += 1
        if actual != predicted:
            miss[index] += 1

    miss_percent = []
    for class_miss, class_total in zip(miss, total):
        if class_total == 0:
            miss_percent.append(float('nan'))
            continue
        rate = (100.00 * class_miss) / class_total
        # to not collapse fitness to 0 unnecessarily because of one good accuracy class
        miss_percent.append(1 if rate <= 1 else rate)
    return miss_percent

In [None]:
def average_fitness(population, clf, data): # Calculates average fitness for a population
    """Return ``(mean fitness, best individual)`` for a population.

    Every individual is evaluated exactly once. Each evaluation refits the
    classifier, so evaluating twice doubled the cost and, with a
    non-deterministic classifier, could give two different values for the
    same individual. The best individual is the first one with the lowest
    fitness (fitness is a cost).

    Raises
    ------
    ValueError
        If ``population`` is empty.
    """
    if len(population) == 0:
        raise ValueError("population must contain at least one individual")
    scored = [(fitness(individual, clf, data), individual) for individual in population]
    total_fitness = sum(score for score, _ in scored)
    best_individual = min(scored, key=lambda pair: pair[0])[1]
    return total_fitness / len(population), best_individual

In [40]:
# Mutation, Crossover , Selection
def evolve(pop, clf, data, retain_percentage = 0.50, random_select = 0.05, mutate_prob = 0.01):
    """Produce the next generation: selection, mutation, then crossover.

    Parameters
    ----------
    pop : list of individuals (lists of 0/1 flags).
    clf, data : forwarded to ``fitness``.
    retain_percentage : fraction of the population, best first, kept as parents.
    random_select : probability that each remaining individual is also kept.
    mutate_prob : probability that a parent has one random position re-drawn.

    Returns
    -------
    list
        Parents followed by children, ``len(pop)`` individuals in total.
        Parents are the same list objects as in ``pop`` and may have been
        mutated in place; children are new lists.

    Notes
    -----
    Crossover needs two distinct parents. With fewer than two, the missing
    individuals are copies of the single parent, instead of looping forever
    (one parent) or raising from ``randint(0, -1)`` (no parent).
    """
    if len(pop) == 0:
        return []

    f_values = [(fitness(i, clf, data), i) for i in pop] #i = [0,1,1,0,...]
    individuals = [i[1] for i in sorted(f_values)]
    retain_length = int(len(pop) * retain_percentage)
    parents = individuals[:retain_length]

    # randomly add other individuals to increase diversity
    for i in individuals[retain_length:]:
        if random_select > random():
            parents.append(i)

    # Tiny populations can end up with nobody selected; keep the best one.
    if not parents:
        parents.append(individuals[0])

    # mutate
    for i in parents:
        if mutate_prob > random() and len(i) > 0:
            index_to_mutate = randint(0, len(i) - 1)
            i[index_to_mutate] = randint(0, 1)

    # crossover
    no_of_parents = len(parents)
    remaining_no_of_ind = len(pop) - no_of_parents
    children = []

    if no_of_parents < 2:
        # Copies, so a later in-place mutation of one does not alter the others.
        children = [list(parents[0]) for _ in range(remaining_no_of_ind)]
    else:
        while len(children) < remaining_no_of_ind:
            male_index = randint(0, no_of_parents - 1)
            female_index = randint(0, no_of_parents - 1)

            if male_index != female_index:
                male = parents[male_index]
                female = parents[female_index]
                half = int(len(male) / 2)
                child = male[:half] + female[half:]
                children.append(child)

    parents.extend(children)
    return parents

In [38]:
# Genetic Algorithm
def ga(population_size=10, generations=250, target_fitness=0.1, seed=None):
    """Run the genetic-algorithm feature search with a decision tree as evaluator.

    Parameters
    ----------
    population_size : int
        Number of individuals per generation.
    generations : int
        Maximum number of generations.
    target_fitness : float
        Stop early once the mean population fitness drops below this value.
    seed : int or None
        When given, seeds Python's ``random`` module, SMOTE and the decision
        tree so that a run can be reproduced. ``None`` leaves everything
        unseeded.

    Returns
    -------
    (best_individual, fitness_list)
        The best individual of the last evaluated generation (``None`` if no
        generation ran) and the mean fitness of every generation. The same
        information is also printed while the search runs.
    """
    if seed is not None:
        import random as _random
        _random.seed(seed)

    train_x, train_y, test_x, test_y = read()
    smote = SMOTE(random_state=seed)
    train_x, train_y = smote.fit_resample(train_x, train_y)
    data = [train_x, train_y, test_x, test_y]
    clf = tree.DecisionTreeClassifier(random_state=seed)
    pop = generate_population(population_size, train_x.shape[1], list(train_x.columns))  # Random

    fitness_list = []
    best_individual = None
    for i in range(generations): # give sufficient iterations
        pop = evolve(pop, clf, data)
        population_fitness,best_individual = average_fitness(pop, clf, data) # average fitness for all individuals in a population
        fitness_list.append(population_fitness)
        print(i,"_pop_fitness : ",population_fitness)
        print(best_individual)
        if population_fitness < target_fitness:
            break
    print (fitness_list)
    return best_individual, fitness_list

In [39]:
# Run the genetic-algorithm feature search. This is long-running: the output recorded
# below stops at generation 2 with a KeyboardInterrupt.
ga()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished
0 _pop_fitness :  0.6826256777074563
[0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]
1 _pop_fitness :  0.6674907857569753
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]
2 _pop_fitness :  0.6608894184444416
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0

KeyboardInterrupt: 

### Classifiers with all features selected

In [41]:
def util(train_x, train_y, test_x, test_y,clf):
    """Fit ``clf`` on the training split and print its evaluation on the test split.

    Printed, in order: one "class no accuracy" line per class (100 minus the
    miss rate from calculate_class_miss_rate), the cost ``1 - Cohen's kappa``,
    and scikit-learn's classification report.

    Parameters
    ----------
    train_x, test_x : feature DataFrames with identical columns.
    train_y : pandas Series of integer class labels.
    test_y : integer class labels of the test split.
    clf : classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
    """
    clf = clf.fit(train_x.to_numpy(), train_y.values.tolist())
    class_prob = clf.predict_proba(test_x.to_numpy())
    # predict_proba columns follow clf.classes_; translate the argmax position
    # back into the class label instead of assuming the two coincide.
    predict_y = np.asarray(clf.classes_)[np.argmax(class_prob, axis=1)].tolist()
    miss_percent = calculate_class_miss_rate(test_y, predict_y)
    for count, i in enumerate(miss_percent, start=1):
        print(count,"class no accuracy : ",100-i)
    fitness_ans = 1-cohen_kappa_score(test_y,predict_y) # taking 1- because we are minimizing cost
    # Names ordered by their integer label, taken from class_dict so the two cannot drift apart.
    target_names = sorted(class_dict, key=class_dict.get)
    report_labels = [class_dict[name] for name in target_names]
    print(fitness_ans)
    # Passing `labels` keeps names and rows aligned even if a class never occurs.
    print(classification_report(test_y,predict_y,labels=report_labels,target_names=target_names))

# Results of the classifiers with all features selected
def traditional():
    """Baseline: evaluate four classifiers on the full feature set.

    The training split is oversampled with SMOTE; each classifier is then
    created, fitted and reported through util(), in the order decision tree,
    k-nearest neighbours, random forest, multi-layer perceptron.
    """
    train_x, train_y, test_x, test_y = read()
    train_x, train_y = SMOTE().fit_resample(train_x, train_y)

    # (banner printed before the run, factory creating a fresh classifier)
    experiments = (
        ("DecisionTreeClassifier : ", lambda: tree.DecisionTreeClassifier()),
        ("KNeighborsClassifier : ", lambda: KNeighborsClassifier(n_neighbors=5)),
        ("RandomForestClassifier : ", lambda: RandomForestClassifier()),
        ("MLPClassifier : ", lambda: MLPClassifier(hidden_layer_sizes = [100]*5)),
    )
    for banner, make_classifier in experiments:
        print(banner)
        util(train_x, train_y, test_x, test_y, make_classifier())

In [42]:
# Baseline run on all features. The output recorded below ends partway through the
# KNeighborsClassifier results.
traditional()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished
DecisionTreeClassifier : 
1 class no accuracy :  60.83109919571046
2 class no accuracy :  7.5909878682842304
3 class no accuracy :  46.551011978521274
4 class no accuracy :  2.985074626865668
5 class no accuracy :  96.11780455153949
0.5098458972846518
              precision    recall  f1-score   support

         DOS       0.88      0.61      0.72      7460
         R2L       0.41      0.08      0.13      2885
       PROBE       0.43      0.47      0.45      2421
         U2R       0.06      0.03      0.04        67
      NORMAL       0.66      0.96      0.78      9711

    accuracy                           0.68     22544
   macro avg       0.49      0.43      0.42     22544
weighted avg       0.67      0.68      0.64     22544

KNeighborsClassifier : 
1 class no accuracy :  75.36193029490616
2 class no accuracy :  21.90641247833622
3 class no accuracy :  64.97315159025196
4 class no accuracy :  44.776119402985074
5 class no accuracy :  96.21048295747092
0.37166746353088675


### Applying results of genetic algorithm

In [43]:
# Evaluation on the fittest individual obtained
def fittest(individual):
    """Evaluate four classifiers on the feature subset encoded by ``individual``.

    The training split is oversampled with SMOTE on all features first; both
    splits are then reduced to the columns switched on in ``individual`` and
    each classifier is reported through util().
    """
    train_x, train_y, test_x, test_y = read()
    # Over sampling using smote
    train_x, train_y = SMOTE().fit_resample(train_x, train_y)
    train_x = get_selected_features_data(individual, train_x)
    test_x = get_selected_features_data(individual,test_x)

    # (banner printed before the run, factory creating a fresh classifier)
    experiments = (
        ("DecisionTreeClassifier : ", lambda: tree.DecisionTreeClassifier()),
        ("KNeighborsClassifier : ", lambda: KNeighborsClassifier(n_neighbors=5)),
        ("RandomForestClassifier : ", lambda: RandomForestClassifier()),
        ("MLPClassifier : ", lambda: MLPClassifier(hidden_layer_sizes = [100]*5)),
    )
    for banner, make_classifier in experiments:
        print(banner)
        util(train_x, train_y, test_x, test_y, make_classifier())

In [45]:
# Best result obtained from genetic algorithm.
# The list is a hard-coded 0/1 mask with one flag per feature column returned by read();
# a 1 keeps the column at that position (see get_selected_features_data).
fittest([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


finished
DecisionTreeClassifier : 
1 class no accuracy :  85.33512064343164
2 class no accuracy :  15.875216637781634
3 class no accuracy :  91.86286658405618
4 class no accuracy :  17.910447761194035
5 class no accuracy :  95.9221501390176
0.28261348824954824
              precision    recall  f1-score   support

         DOS       0.98      0.85      0.91      7460
         R2L       0.90      0.16      0.27      2885
       PROBE       0.77      0.92      0.84      2421
         U2R       0.03      0.18      0.05        67
      NORMAL       0.76      0.96      0.85      9711

    accuracy                           0.82     22544
   macro avg       0.69      0.61      0.58     22544
weighted avg       0.85      0.82      0.79     22544

KNeighborsClassifier : 
1 class no accuracy :  79.6916890080429
2 class no accuracy :  12.097053726169847
3 class no accuracy :  87.81495249896737
4 class no accuracy :  35.82089552238806
5 class no accuracy :  90.20698177324684
0.3584108887446472
  