In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled dataset into `df` (used as a pandas DataFrame by the cells below).
# NOTE(review): this absolute Google Drive path only resolves where that drive is
# mounted; consider a configurable data directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only
# load this file if it comes from a trusted source.
file_path = '/content/drive/MyDrive/nad_dataset.pkl'
with open(file_path, 'rb') as f:
    df = pickle.load(f)

In [None]:
def target(x):
  """Map an attack-type label to a binary class.

  Returns 0 when ``x`` equals the string 'Normal' and 1 for any other value.
  """
  return 0 if x == 'Normal' else 1

# Binary label column: 0 where attack_type is 'Normal', 1 for every other value.
df['target'] = df['attack_type'].apply(target)

In [None]:
# Build the working frame without 'num_outbound_cmds'; drop() returns a new
# object, so the raw frame `df` is left untouched.
df2 = df.drop(columns=['num_outbound_cmds'])

# Columns treated as categorical/label data; every remaining column is numerical.
categorical_cols = ['protocol_type', 'service', 'flag', 'attack', 'attack_type', 'service_type']
numerical_cols = df2.columns[~df2.columns.isin(categorical_cols)]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column and keep every fitted encoder.
# Re-fitting a single shared encoder leaves only the last column's mapping
# available, so the integer codes of the other columns could not be mapped
# back to labels (inverse_transform) or applied to new records.
label_encoders = {}
for i in categorical_cols:
  le = LabelEncoder()
  df2[i] = le.fit_transform(df2[i])
  label_encoders[i] = le

In [None]:
# Min-max scale every column except the label to the [0, 1] range.
# The label is excluded by name instead of by position, and the scaler is
# fitted once on all of those columns, so the fitted object holds the min/max
# of every column (a per-column re-fit keeps only the last column's statistics).
# MinMaxScaler works column by column, so the scaled values are unchanged.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima leak into training; consider fitting on the
# training split only.
scaler = MinMaxScaler()
feature_cols = df2.columns.drop('target')
df2[feature_cols] = scaler.fit_transform(df2[feature_cols])

In [None]:
# Preview the first rows of the encoded and scaled frame.
df2.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,attack_type,service_type,target
0,0.0,0.5,0.289855,0.9,3.558064e-07,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.5,0.952381,0.25,0.0,0
1,0.0,1.0,0.637681,0.9,1.057999e-07,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.714286,0.25,0.5,0
2,0.0,0.5,0.710145,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.409091,0.904762,0.0,0.5,1
3,0.0,0.5,0.347826,0.9,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,...,0.04,0.03,0.01,0.0,0.01,0.5,1.0,0.25,1.0,0
4,0.0,0.5,0.347826,0.9,1.442067e-07,3.20626e-07,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.25,1.0,0


In [None]:
# Flag one column out of every highly correlated pair of features.
# The label columns are kept out of the correlation matrix: they are never
# model inputs, and leaving them in could put 'target' itself into `to_drop`
# whenever some feature correlates strongly with it.
label_cols = ['attack', 'attack_type', 'target']
corr_matrix = df2.drop(columns=label_cols).corr().abs()

# Keep only the strict upper triangle so that each pair is inspected once and
# a column is only compared against the columns that precede it.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
threshold=0.8
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

to_drop

['num_root',
 'is_guest_login',
 'srv_serror_rate',
 'srv_rerror_rate',
 'dst_host_same_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [None]:
# Remove the highly correlated columns collected in `to_drop`.
df2 = df2.drop(columns=to_drop)

In [None]:
# Check the frame's dimensions after the column drop.
df2.shape

(125973, 36)

In [None]:
# Features are all columns except the label columns; the label is the binary target.
y = df2['target']
X = df2.drop(columns=['attack', 'attack_type', 'target'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic Regression

# Fit on the training split and predict the held-out split.
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
y_pred2 = lr.predict(X_test)

# Hold-out accuracy plus 5-fold cross-validation on the training split.
lr_accuracy2 = accuracy_score(y_test, y_pred2)
cv_scores = cross_val_score(lr, X_train, y_train, cv=5)

# The squared error is computed once and reused for the RMSE line.
lr_mse = mean_squared_error(y_test, y_pred2)
separator = '-'*70

print("Logistic Regression Accuracy:", lr_accuracy2)
print(separator)
print("MSE:", lr_mse)
print("RMSE:", np.sqrt(lr_mse))
print("R2 Score:", r2_score(y_test, y_pred2))
print(separator)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred2))
print(separator)
print("Mean cross-validation score:", cv_scores.mean())

Logistic Regression Accuracy: 0.9761460607263346
----------------------------------------------------------------------
MSE: 0.02385393927366541
RMSE: 0.1544472054576107
R2 Score: 0.9041737581412495
----------------------------------------------------------------------
Confusion Matrix:
[[13066   356]
 [  245 11528]]
----------------------------------------------------------------------
Mean cross-validation score: 0.9756395200541703


In [None]:
# SVM

# Fit a support-vector classifier (default settings) and evaluate it on the
# held-out split, plus 5-fold cross-validation on the training split.
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
cv_scores = cross_val_score(svm, X_train, y_train, cv=5)
svm_accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", svm_accuracy)
print('-'*70)
print(classification_report(y_test, y_pred))
print('-'*70)
# Score this model's own predictions (y_pred). y_pred2 holds the logistic
# regression predictions and would repeat that model's scores here.
print("Precision score:", precision_score(y_test, y_pred))
print("Recall score:", recall_score(y_test, y_pred))
print("F1 score:",f1_score(y_test, y_pred))
print('-'*70)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('-'*70)
print("Mean cross-validation score:", cv_scores.mean())


SVM Accuracy: 0.9946417940067473
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     13422
           1       0.99      1.00      0.99     11773

    accuracy                           0.99     25195
   macro avg       0.99      0.99      0.99     25195
weighted avg       0.99      0.99      0.99     25195

----------------------------------------------------------------------
Precision score: 0.9700437563110064
Recall score: 0.9791896712817464
F1 score: 0.9745952572177369
----------------------------------------------------------------------
Confusion Matrix:
[[13296   126]
 [    9 11764]]
----------------------------------------------------------------------
Mean cross-validation score: 0.9952668264687509


In [None]:
from sklearn.model_selection import GridSearchCV

# Seeded so that repeated runs of the search pick the same best parameters.
rf = RandomForestClassifier(random_state=42)

# None is only a valid value for max_depth. min_samples_split and
# min_samples_leaf must be an int or a float, so a None entry makes every
# candidate that contains it fail to fit; those failures are silenced by the
# global warnings filter and only waste search time.
param_grid = {
    'n_estimators': [5, 10, 15],
    'max_depth': [None, 2, 4, 8],
    'min_samples_split': [2, 5, 8],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold search over the grid, scored by accuracy, using all cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# best_estimator_ is the best candidate refitted on the whole training split.
best_rf = grid_search.best_estimator_

test_accuracy = best_rf.score(X_test, y_test)
print("Test set accuracy: ", test_accuracy)


Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 15}
Test set accuracy:  0.998809287557055


In [None]:
# Random Forest

# Random forest with explicitly chosen hyper-parameters; oob_score=True also
# produces an out-of-bag accuracy estimate from the training data.
rf=RandomForestClassifier(n_estimators=15, max_depth=None, min_samples_leaf= 1, min_samples_split= 2, oob_score=True, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print('-'*70)
print(classification_report(y_test, y_pred))
print('-'*70)
print('-'*70)
# Score this model's own predictions (y_pred). y_pred2 holds the logistic
# regression predictions and would repeat that model's scores here.
print("Precision score:", precision_score(y_test, y_pred))
print("Recall score:", recall_score(y_test, y_pred))
print("F1 score:",f1_score(y_test, y_pred))
print('-'*70)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('-'*70)
oob_score = rf.oob_score_
print("OOB Score:", oob_score)
print('-'*70)
oob_error = 1-oob_score
print("OOB Error:", oob_error)

Random Forest Accuracy: 0.9991268108751736
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

----------------------------------------------------------------------
----------------------------------------------------------------------
Precision score: 0.9700437563110064
Recall score: 0.9791896712817464
F1 score: 0.9745952572177369
----------------------------------------------------------------------
Confusion Matrix:
[[13413     9]
 [   13 11760]]
----------------------------------------------------------------------
OOB Score: 0.9981146678838635
----------------------------------------------------------------------
OOB Error: 0.00188533211613

In [None]:
#KNN

# 3-nearest-neighbours classifier, evaluated on the held-out split and with
# 5-fold cross-validation on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(knn, X_train, y_train, cv=5)
print("KNN Accuracy:", knn_accuracy)
print('-'*70)
print(classification_report(y_test, y_pred))
print('-'*70)
# Score this model's own predictions (y_pred). y_pred2 holds the logistic
# regression predictions and would repeat that model's scores here.
print("Precision score:", precision_score(y_test, y_pred))
print("Recall score:", recall_score(y_test, y_pred))
print("F1 score:",f1_score(y_test, y_pred))
print('-'*70)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print('-'*70)
print("Mean cross-validation score:", cv_scores.mean())


KNN Accuracy: 0.9981345505060528
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

----------------------------------------------------------------------
Precision score: 0.9700437563110064
Recall score: 0.9791896712817464
F1 score: 0.9745952572177369
----------------------------------------------------------------------
Confusion Matrix:
[[13402    20]
 [   27 11746]]
----------------------------------------------------------------------
Mean cross-validation score: 0.9983825840901893


In [None]:
# Stacking Classifier

# Base learners are the estimators built in the earlier cells; a logistic
# regression combines their outputs as the final estimator.
base_models = [('lr', lr), ('knn', knn), ('svc', svm), ('rf', rf)]

meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Accuracy:", accuracy)
print('-'*70)
print(classification_report(y_test, y_pred))
# Score the ensemble's own predictions (y_pred). y_pred2 holds the logistic
# regression predictions and would repeat that model's scores here.
print("Precision score:", precision_score(y_test, y_pred))
print("Recall score:", recall_score(y_test, y_pred))
print("F1 score:",f1_score(y_test, y_pred))
print('-'*70)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Ensemble Accuracy: 0.9991665012899384
----------------------------------------------------------------------
Test Set Score: 0.9991665012899384
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

----------------------------------------------------------------------
Confusion Matrix:
[[13413     9]
 [   12 11761]]


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Separate feature/label split (different seed) used by the feature-selection cells.
X2 = df2.drop(['attack_type','attack','target'], axis=1)
y2 = df2['target']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=10)

# Take the names from the frame the model is actually fitted on, so names and
# importances stay aligned even if X2 is later built with different columns
# than X.
feature_names = X_train2.columns

model = RandomForestRegressor(n_estimators=100, random_state=10)
model.fit(X_train2, y_train2)

# Impurity-based importances, one per fitted column; show the ten largest.
importances = model.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False).head(10)
print(feature_imp_df)

                   Feature  Gini Importance
3                     flag         0.587544
31               last_flag         0.136009
1            protocol_type         0.088974
28  dst_host_diff_srv_rate         0.081171
20               srv_count         0.021918
2                  service         0.014736
19                   count         0.013784
23           same_srv_rate         0.011285
26          dst_host_count         0.008473
27      dst_host_srv_count         0.006106


In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Score every feature by mutual information with the label and keep the 10 best.
selectkbest = SelectKBest(score_func=mutual_info_classif, k=10)
sfit = selectkbest.fit(X_train2,y_train2)

# get_support() yields a boolean mask over the columns; use it to pick the names.
selected_features1 = X_train2.columns[sfit.get_support()].tolist()
print("Selected Features:", selected_features1)

Selected Features: ['service', 'flag', 'src_bytes', 'dst_bytes', 'logged_in', 'serror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate']


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination driven by a random forest, keeping 10 features.
n_features_to_select = 10
rf_model = RandomForestClassifier(random_state=42)
rfe = RFE(estimator=rf_model, n_features_to_select=n_features_to_select)
fit = rfe.fit(X_train2, y_train2)

# support_ marks the retained columns; pair it with the column names.
selected_features3 = [name for name, keep in zip(X_train2.columns, rfe.support_) if keep]
print("Selected Features:", selected_features3)

Selected Features: ['protocol_type', 'flag', 'logged_in', 'count', 'same_srv_rate', 'diff_srv_rate', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'last_flag', 'service_type']


In [None]:
# Feature subset used to train the final model.
final_features = [
    'protocol_type',
    'flag',
    'src_bytes',
    'dst_bytes',
    'count',
    'same_srv_rate',
    'serror_rate',
    'dst_host_srv_count',
    'logged_in',
    'last_flag',
]

In [None]:
# Distinct flag categories in the raw (un-encoded) frame `df`.
list(df.flag.unique())

['SF', 'S0', 'REJ', 'RSTR', 'SH', 'RSTO', 'S1', 'RSTOS0', 'S3', 'S2', 'OTH']

In [None]:
# Frequency of each encoded/scaled flag value in the training split.
# Read directly from X_train: X_train_features is only created further down,
# so referencing it here fails on a fresh top-to-bottom run.
X_train['flag'].value_counts()

Unnamed: 0_level_0,count
flag,Unnamed: 1_level_1
0.9,59960
0.5,27882
0.1,8978
0.4,1941
0.2,1270
0.6,284
1.0,202
0.7,102
0.3,83
0.0,40


In [None]:
# Frequency of each flag category in the raw (un-encoded) frame `df`.
df.flag.value_counts()

Unnamed: 0_level_0,count
flag,Unnamed: 1_level_1
SF,74945
S0,34851
REJ,11233
RSTR,2421
RSTO,1562
S1,365
SH,271
S2,127
RSTOS0,103
S3,49


In [None]:
# First two training rows restricted to the selected features.
# Built from X_train and final_features: X_train_features is only created
# further down, so referencing it here fails on a fresh top-to-bottom run.
X_train[final_features][:2]

Unnamed: 0,protocol_type,flag,src_bytes,dst_bytes,count,same_srv_rate,serror_rate,dst_host_srv_count,logged_in,last_flag
95141,0.5,0.9,1.550765e-07,1.1e-05,0.031311,1.0,0.0,1.0,1.0,1.0
37486,0.5,0.5,0.0,0.0,0.277886,0.01,1.0,0.007843,0.0,1.0


In [None]:
# Sample record over the selected features. The numeric values equal the first
# training row in the preview above; protocol_type and flag are written as
# category names, presumably the raw labels behind that row's encoded
# 0.5 / 0.9 values (TODO confirm the mapping).
{'protocol_type':'tcp',
 'flag':'SF',
 'src_bytes':1.550765e-07,
 'dst_bytes':0.000011,
 'count':0.031311,
 'same_srv_rate':1.00,
 'serror_rate':0.0	,
 'dst_host_srv_count':1.000000,
 'logged_in':1.0,
 'last_flag':1.0
 }

In [None]:
# Restrict both splits to the selected feature subset.
X_train_features = X_train.loc[:, final_features]
X_test_features = X_test.loc[:, final_features]

In [None]:
# Confirm that features and labels line up row-for-row in each split.
for part in (X_train_features, y_train, X_test_features, y_test):
    print(part.shape)



(100778, 10)
(100778,)
(25195, 10)
(25195,)


In [None]:
# Final model: a 200-tree random forest trained on the selected features only.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_features, y_train)

# Evaluate on the held-out split restricted to the same features.
y_pred = rf_model.predict(X_test_features)
rf_accuracy = accuracy_score(y_test, y_pred)

divider = '-'*70
print("Random Forest Accuracy:", rf_accuracy)
print(divider)
print(classification_report(y_test, y_pred))
print(divider)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Random Forest Accuracy: 0.9977376463584045
----------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13422
           1       1.00      1.00      1.00     11773

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195

----------------------------------------------------------------------
Confusion Matrix:
[[13386    36]
 [   21 11752]]


In [None]:
# Persist the trained estimator to disk. The `with` block closes the file on
# exit, so no explicit close() is needed, and the success message is printed
# only after the file has been written and closed.
# NOTE(review): only the estimator is saved; inputs at prediction time must be
# encoded and scaled the same way as the training data.
with open('final_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
print("Model saved successfully!")



Model saved successfully!
