<a href="https://colab.research.google.com/github/Rajesh231/Ml/blob/master/BotnetDetection_CNN_HighAccNdLowLoss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Botnet Detection using CTU-13 dataset


In [0]:
# h5py is needed for the HDF5 checkpoint ('best_model.h5') that later cells save and reload.
# NOTE(review): pyyaml is not referenced by any visible cell — confirm it is still required.
!pip install pyyaml h5py

In [0]:
import tensorflow as tf 
from tensorflow.keras import Sequential 
from tensorflow.keras.callbacks import EarlyStopping 
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
from tensorflow.keras.layers import Conv1D, MaxPool1D,Flatten,Dense,Dropout,BatchNormalization 
# Show the TensorFlow version this run uses.
print(tf.__version__) 

In [0]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Import the selector class itself. The lowercase `variance_threshold` name was only a
# legacy submodule path that current scikit-learn no longer exposes, and nothing in the
# notebook ever used it.
from sklearn.feature_selection import VarianceThreshold

**Reading the CSV file from Mounted Google Drive**

In [0]:
# Path of the preprocessed CSV on the mounted Google Drive.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/finalpreprocessed2.csv"

df = pd.read_csv(DATA_PATH)
df.head()

In [0]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

**Preprocessing steps that remain after the initial preprocessing done in MS Excel**

In [0]:
# Feature matrix: every column except the target ('Label') and 'State'.
X = df.drop(columns=['Label', 'State'])
X.shape

In [0]:
# Target vector: the 'Label' column.
y = df.loc[:, 'Label']

In [0]:
# Stratified shuffle split: stratifying on the label keeps the class ratio
# equal in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [0]:
# Size of the training partition (the split above uses test_size=0.2).
X_train.shape

In [0]:
# Size of the held-out test partition (test_size=0.2).
X_test.shape 

In [0]:
from sklearn.feature_selection import VarianceThreshold

# Drop near-constant features (variance <= 0.01). The selector is fitted on the
# training split only and then applied unchanged to the test split.
# It is named `variance_filter` rather than `filter` so the Python builtin
# `filter` is not shadowed for the rest of the kernel session.
variance_filter = VarianceThreshold(threshold=0.01)
X_train = variance_filter.fit_transform(X_train)
X_test = variance_filter.transform(X_test)
X_train.shape, X_test.shape

In [0]:
# Transpose both splits so that each row holds one feature; this lets
# duplicated features be detected as duplicated rows.
X_train_T, X_test_T = X_train.T, X_test.T

In [0]:
# Wrap the transposed arrays in DataFrames to get access to `.duplicated()`.
X_train_T, X_test_T = (pd.DataFrame(block) for block in (X_train_T, X_test_T))
X_test_T.shape

In [0]:
# Number of feature rows (of the transposed training frame) that exactly repeat an earlier row.
X_train_T.duplicated().sum()

In [0]:
# Boolean mask over features: True where a feature's training values repeat
# an earlier feature exactly (the first occurrence is kept).
duplicated_features = X_train_T.duplicated(keep='first')
duplicated_features

In [0]:
# Invert the duplicate mask: True marks a feature that should be kept.
features_to_keep = (~duplicated_features).tolist()
features_to_keep

In [0]:
# Keep only the non-duplicated feature rows, then flip back to (samples, features).
X_train = X_train_T.loc[features_to_keep].transpose()
X_train.shape

In [0]:
# Apply the mask computed on the training data to the test features as well,
# so both splits end up with the same columns.
X_test = X_test_T.loc[features_to_keep].transpose()
X_test.shape

In [0]:
# Standardize features. The scaler learns its statistics from the training
# split only; the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

In [0]:
# Test features after scaling with the scaler fitted on the training split.
X_test 

In [0]:
# Row/feature counts of both splits before the channel axis is added for Conv1D.
X_train.shape,X_test.shape 

In [0]:
# Conv1D consumes 3-D input, so append a trailing axis of size 1 to each split.
# The sizes are read from the arrays themselves instead of being hard-coded
# (830608 / 207653 / 7), so the cell keeps working when the dataset size or the
# number of surviving features changes, and it is safe to re-run.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [0]:
# Both arrays are now 3-D, with a trailing axis of size 1 added by the reshape above.
X_test.shape,X_train.shape 

In [0]:
# y is a pandas Series, so y_train and y_test are Series as well; convert them
# to NumPy arrays before training.
# np.asarray is used instead of Series.to_numpy() so that re-running this cell
# (when the labels are already arrays) does not raise AttributeError.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

**The model-building process starts here. The model is built using a Convolutional Neural Network (CNN).**

In [0]:
from tensorflow.keras.layers import MaxPool1D
from keras.models import load_model

# NOTE: the previous version of this cell called `model = Sequential()` four times.
# Each call replaced the model built so far, so the 64/128/128-filter blocks were
# discarded and only the 256-filter block below was ever compiled and trained.
# The discarded blocks are removed here instead of being stacked, because
#   * stacking cannot build as written: with 7 input steps, Conv1D(kernel=3, 'valid')
#     leaves 5 steps and MaxPool1D(2) leaves 2, too few for another kernel-3 conv;
#   * the trained network therefore stays exactly the one that was effectively used.
# The input shape is taken from the already reshaped training data
# (steps, channels) instead of the hard-coded (7, 1).
model = Sequential([
    # Convolutional feature extractor
    Conv1D(256, 3, activation='relu', input_shape=X_train.shape[1:]),
    BatchNormalization(),
    MaxPool1D(2),
    Dropout(0.1),
    # Classifier head
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: binary classification output
    Dense(1, activation='sigmoid'),
])


In [0]:
# Print the layer-by-layer overview of the current model.
model.summary()

In [0]:
# compile the model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ReduceLROnPlateau

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras releases reject.
model.compile(optimizer=SGD(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping: end training once val_loss has not improved for 10 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# Checkpoint: write the model to 'best_model.h5' only when val_loss reaches a new minimum.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
# Learning-rate schedule: multiply the rate by 0.1 after 4 epochs without a
# val_loss improvement of at least 1e-4.
lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=4,
    verbose=0,
    mode="auto",
    min_delta=1e-04,
    cooldown=0,
    min_lr=0,
)

In [0]:
# Train for at most 225 epochs, holding out 25% of the training data for
# validation; the callbacks take care of early stopping, checkpointing and
# learning-rate reduction.
history = model.fit(
    X_train,
    y_train,
    epochs=225,
    validation_split=0.25,
    verbose=1,
    callbacks=[es, mc, lr],
)

In [0]:
# Restore the checkpointed model from disk; the saved file carries the
# architecture, the weights and the optimizer.
best_model_path = 'best_model.h5'
model = tf.keras.models.load_model(best_model_path)

# Display the architecture of the restored model.
model.summary()

**Evaluating the reloaded (saved) model to get a quick view of its training accuracy and loss and its testing accuracy and loss**

In [0]:
# Evaluate the restored model on both splits; `evaluate` returns (loss, accuracy).
train_scores = model.evaluate(X_train, y_train, verbose=0)
test_scores = model.evaluate(X_test, y_test, verbose=0)
trainloss, trainacc = train_scores
testloss, testacc = test_scores

# Printed order is accuracy first, then loss, for each split.
report_values = (trainacc, trainloss, testacc, testloss)
print('train: %.3f, %.3f, Test: % .3f, %.3f' % report_values)

In [0]:
 y_predict = model.predict(X_test)

In [0]:
# Raw model outputs for the test set (not yet rounded to class labels).
y_predict

In [0]:
# Per-epoch metric lists recorded by model.fit.
history.history

In [0]:
# Training accuracy recorded for each completed epoch.
print(history.history['accuracy'])

In [0]:
def plot_learningCurve(history, epoch=None):
  """Plot training vs. validation accuracy and loss per epoch.

  Two figures are shown one after the other: accuracy, then loss.

  Parameters
  ----------
  history : object
      Anything exposing a ``history`` dict with the per-epoch lists
      ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``.
  epoch : int, optional
      Number of leading epochs to plot. When omitted, every recorded epoch
      is plotted. Values larger than the number of recorded epochs are
      clamped, so a run shortened by early stopping no longer makes
      matplotlib fail on mismatched x/y lengths.
  """
  logs = history.history
  recorded = len(logs['accuracy'])
  n_epochs = recorded if epoch is None else max(0, min(epoch, recorded))
  epoch_range = range(1, n_epochs + 1)

  # plotting training & validation accuracy values
  plt.plot(epoch_range, logs['accuracy'][:n_epochs])
  plt.plot(epoch_range, logs['val_accuracy'][:n_epochs])
  plt.title('Model accuracy')
  plt.ylabel('Accuracy')
  plt.xlabel('Epoch')
  # The second curve is the validation split, not the test set.
  plt.legend(['Train', 'Validation'], loc='right')
  plt.show()

  # plotting training & validation loss values
  plt.plot(epoch_range, logs['loss'][:n_epochs])
  plt.plot(epoch_range, logs['val_loss'][:n_epochs])
  plt.title('Model loss')
  plt.ylabel('loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Validation'], loc='right')
  plt.show()

In [0]:
# Early stopping decides how many epochs actually ran, so pass the recorded
# count instead of a hard-coded 49 that only matches one particular run.
plot_learningCurve(history, len(history.history['loss']))

**Looking at the learning curves above, the model appears to be neither underfitting nor overfitting, because the training and validation curves stay close to each other. If the model were underfitting, both curves would level off at poor values (low accuracy, high loss). If it were overfitting, the two curves would diverge: the training loss would keep falling while the validation loss stalled or started to rise.**

In [0]:
# Ensure the predictions are a NumPy array so `.round()` can be used below.
# NOTE(review): model.predict presumably already returns an ndarray, making this a no-op — confirm.
y_predict = np.asarray(y_predict)

In [0]:
from sklearn.metrics import confusion_matrix

# Round the predicted scores to class labels, then tabulate them against the
# true test labels.
y_pred_labels = y_predict.round()
conff = confusion_matrix(y_test, y_pred_labels)
conff

In [0]:

#Plotting Confusion matrix
import itertools
import matplotlib.pyplot as plt
classes = [0,1]
plt.imshow(conff, interpolation='nearest', cmap=plt.cm.Greens)
plt.title("Confusion Matrix")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)

# Accuracy of the matrix being drawn: correct predictions (the diagonal) over
# all test samples. The caption previously showed the best *training* accuracy
# from `history`, which does not describe this test-set matrix.
accuracy = np.trace(conff) / conff.sum()
misclass = 1 - accuracy

fmt = 'd'
# Cells darker than half the maximum count get white text for contrast.
thresh = conff.max() / 2
for i, j in itertools.product(range(conff.shape[0]), range(conff.shape[1])):
  plt.text(j, i, format(conff[i, j], fmt), horizontalalignment='center', color="white" if conff[i, j] > thresh else "black")

# Axis labels and layout are set once, after all cells are annotated
# (they used to be re-applied on every loop iteration).
plt.ylabel('True label')
plt.xlabel('Predicted label ( 1-True,0-False )\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
plt.tight_layout()

In [0]:
# Read the four counts from the confusion matrix computed above instead of
# hard-coding the numbers of one particular run (4341 / 201790 / 1051 / 471),
# which go stale as soon as the model is retrained.
# For binary labels, sklearn lays the matrix out as [[TN, FP], [FN, TP]],
# so ravel() yields the counts in exactly that order.
TN, FP, FN, TP = (int(count) for count in conff.ravel())
true_positives, true_negatives = TP, TN
false_positives, false_negatives = FP, FN

In [0]:
# Accuracy: share of all test samples that were classified correctly.
metric = "Accuracy"
total_samples = TP + TN + FP + FN
results = {metric: (TP + TN) / total_samples}
print(f"{metric} is {results[metric]: .4f}")

In [0]:
# Recall: share of the actual positives that the model found.
metric = "Recall"
actual_positives = TP + FN
results.update({metric: TP / actual_positives})
print(f"{metric} is {results[metric]: .4f}")

In [0]:
# Precision: share of the predicted positives that are truly positive.
metric = "Precision"
predicted_positives = TP + FP
results.update({metric: TP / predicted_positives})
print(f"{metric} is {results[metric]: .4f}")

In [0]:
# F1-score: harmonic mean of precision and recall.
metric = "F1-Score"
precision_value, recall_value = results["Precision"], results["Recall"]
results[metric] = 2 / (1 / precision_value + 1 / recall_value)
print(f"{metric} is {results[metric]: .4f}")

In [0]:
# Cross-check the hand-computed metrics against scikit-learn's built-in functions.
from sklearn import metrics

# Round the predicted scores to class labels once and reuse them for every metric.
rounded_predictions = y_predict.round()
print(f"Actual accuracy_score : {metrics.accuracy_score(y_test, rounded_predictions): .6f}")
print(f"Actual recall_score : {metrics.recall_score(y_test, rounded_predictions): .4f}")
print(f"Actual precision_score : {metrics.precision_score(y_test, rounded_predictions): .4f}")
print(f"Actual f1_score : {metrics.f1_score(y_test, rounded_predictions): .4f}")