In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [14]:
df = pd.read_csv('./training4tuplabeled.csv')
df.head()

Unnamed: 0,flowStartMilliseconds,sourceIPAddress,destinationIPAddress,sourceTransportPort,destinationTransportPort,packetTotalCount,label,sublabel
0,1639285200132,156.191.30.93,203.216.92.39,33642,6379,1,0,0
1,1639285200132,167.129.252.112,202.26.199.60,48534,62001,1,0,0
2,1639285200132,167.129.252.112,202.26.199.139,57560,62001,1,0,0
3,1639285200132,156.191.30.93,203.216.92.34,38050,6379,1,0,0
4,1639285200132,156.191.30.93,203.216.92.134,58038,6379,1,0,0


In [15]:
# count value in a dataframe column
def count_value(df, col):
    return df[col].value_counts()

print(count_value(df, 'sublabel'))
print(df.columns)
print(len(df))

0                                        821683
Potentially Bad Traffic                    3749
Generic Protocol Command Decode            1906
Attempted Information Leak                 1590
Potential Corporate Privacy Violation       112
Misc activity                                32
Misc Attack                                   5
Name: sublabel, dtype: int64
Index(['flowStartMilliseconds', 'sourceIPAddress', 'destinationIPAddress',
       'sourceTransportPort', 'destinationTransportPort', 'packetTotalCount',
       'label', 'sublabel'],
      dtype='object')
829077


In [16]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

le = LabelEncoder()
df['sublabel'] = df['sublabel'].astype(str)
# encoded = le.fit_transform(df['sublabel'])
le.fit(df['sublabel'])
tf = le.transform(df['sublabel'])
print(le.classes_)
# change column sublabel to encoded
np.save("classes.npy", le.classes_)
# df['sublabel'] = encoded
print(le.inverse_transform([0, 1, 2, 3, 4, 5, 6]))
print(df['sublabel'].value_counts())

# count each element in tf
print(np.bincount(tf))

['0' 'Attempted Information Leak' 'Generic Protocol Command Decode'
 'Misc Attack' 'Misc activity' 'Potential Corporate Privacy Violation'
 'Potentially Bad Traffic']
['0' 'Attempted Information Leak' 'Generic Protocol Command Decode'
 'Misc Attack' 'Misc activity' 'Potential Corporate Privacy Violation'
 'Potentially Bad Traffic']
0                                        821683
Potentially Bad Traffic                    3749
Generic Protocol Command Decode            1906
Attempted Information Leak                 1590
Potential Corporate Privacy Violation       112
Misc activity                                32
Misc Attack                                   5
Name: sublabel, dtype: int64
[821683   1590   1906      5     32    112   3749]


In [17]:
X = df[['sourceTransportPort', 'destinationTransportPort', 'packetTotalCount']]
y = df['sublabel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

clf.predict(X_test)

KeyboardInterrupt: 

In [None]:
# Print accuracy
print(metrics.accuracy_score(y_test, clf.predict(X_test)))

0.9935687750277415


In [None]:
result = clf.predict(X_test).tolist()
expected_result = y_test.tolist()

# print when the value of the two list are different
def print_diff(result, expected_result):
    for i in range(len(result)):
        if result[i] != expected_result[i]:
            print(result[i], expected_result[i])
# print_diff(result, expected_result)
print(len(result))

207270


In [None]:
#print matrix of confusion
print(metrics.confusion_matrix(y_test, clf.predict(X_test)))

# sum value which are not on the diagonal
def sum_diagonal(matrix):
    sum = 0
    for i in range(len(matrix)):
        for j in range(len(matrix[i])):
            if i == j:
                sum += matrix[i][j]
    return sum

print(len(y_test) - sum_diagonal(metrics.confusion_matrix(y_test, clf.predict(X_test))) )

[[204879    237    197      0      4     22    107]
 [   326     51      4      0      0      0      0]
 [   281      0    171      0      0      0      0]
 [     1      0      0      0      0      0      0]
 [     8      0      0      0      0      0      0]
 [    31      0      0      0      0      0      0]
 [   115      0      0      0      0      0    836]]
1333


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# create a random grid of parameters
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy']
}
rd_clf = RandomizedSearchCV(clf, param_distributions=param_grid, n_iter=10, cv=3, random_state=42)
# fit the model
rd_clf.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [4, 5, 6, 7, 8, 9, 10],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   random_state=42)

In [None]:
# test the model
rd_clf.predict(X_test)

# save the model
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(rd_clf, f)

# Print accuracy
print(metrics.accuracy_score(y_test, rd_clf.predict(X_test)))

# matrix of confusion
print(metrics.confusion_matrix(y_test, rd_clf.predict(X_test)))

0.9945288753799392
[[205351      0     12      0      0      0     83]
 [   381      0      0      0      0      0      0]
 [   413      0     39      0      0      0      0]
 [     1      0      0      0      0      0      0]
 [     8      0      0      0      0      0      0]
 [    31      0      0      0      0      0      0]
 [   205      0      0      0      0      0    746]]


In [None]:
# from keras.models import Sequential
# from keras.layers import Dense

# # build a 3 layer neural network
# def build_classifier():
#     classifier = Sequential()
#     classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=3))
#     classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
#     classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
#     classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return classifier

# # train the model
# classifier = build_classifier()
# classifier.fit(X_train, y_train, batch_size=32, epochs=30)

In [None]:
# # save the model
# classifier.save('model.h5')

# # predict the test set result
# y_pred = classifier.predict(X_train)
# y_pred

In [None]:
# # count the number of time where list a is different from list b
# def count_diff(a, b):
#     count = 0
#     for i in range(len(a)):
#         if a[i] != b[i]:
#             count += 1
#     return count

In [None]:
encoder = LabelEncoder()
encoder.classes = np.load("classes.npy", allow_pickle=True)

print(encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6]))

NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [13]:
import joblib
import sys
import pandas as pd

def preprocess(): 
    csv = pd.read_csv("stuff/output.csv")
    del csv['sublabel']
    csv.to_csv("stuff/output.csv", index=False)

# open pkl model
def open_model(model_path):
    model = joblib.load(model_path)
    return model

# predict
def predict(model, data):
    prediction = model.predict(data)
    return prediction

def loadRowData(): 
    rawData = pd.read_csv("stuff/output.csv")
    return rawData

def loadData(rawData): 
    data = rawData[['sourceTransportPort', 'destinationTransportPort', 'packetTotalCount']]
    return data

def writeSubmitColumn(rawData, prediction):
    print(type(prediction))
    rawData['sublabel'] = prediction
    for index, row in rawData.iterrows():
        if row['sublabel'] == 1:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Attempted Information Leak"
        elif row['sublabel'] == 2:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Generic Protocol Command Decode"
        elif row['sublabel'] == 3:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Misc Attack"
        elif row['sublabel'] == 4:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Command Decode"
        elif row['sublabel'] == 5:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Potential Corporate Privacy Violation"
        elif row['sublabel'] == 6:
            print(row['sublabel'])
            rawData.loc[index, 'sublabel'] = "Potentially Bad Traffic"
    rawData.to_csv("stuff/output.csv", index=False)


# preprocess()
rawData = loadRowData()
data = loadData(rawData)
model = open_model('stuff/model.pkl')
prediction = predict(model, data)
writeSubmitColumn(rawData, prediction)

<class 'numpy.ndarray'>
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
2
6
6
6
6
6
2
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
