In [483]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [484]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [485]:
# Load the CSV file into a DataFrame
df = pd.read_csv('preprocessed_data.csv')
df.head()

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,target,Attack Type
0,0,1,0,181,5450,0,0,0,0,0,...,1.0,0.0,0.0,9,9,0.0,0.11,0.0,normal.,normal
1,0,1,0,239,486,0,0,0,0,0,...,1.0,0.0,0.0,19,19,0.0,0.05,0.0,normal.,normal
2,0,1,0,235,1337,0,0,0,0,0,...,1.0,0.0,0.0,29,29,0.0,0.03,0.0,normal.,normal
3,0,1,0,219,1337,0,0,0,0,0,...,1.0,0.0,0.0,39,39,0.0,0.03,0.0,normal.,normal
4,0,1,0,217,2032,0,0,0,0,0,...,1.0,0.0,0.0,49,49,0.0,0.02,0.0,normal.,normal


In [486]:
df['target'].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [487]:
corr_matrix = df.iloc[:, :-2].corr()

# Filter the correlation matrix to show only values greater than 0.8
high_corr = corr_matrix[corr_matrix > 0.8]

# Remove self-correlation (correlation of a column with itself) by setting diagonal values to NaN
for i in range(len(high_corr)):
    high_corr.iat[i, i] = None

# Drop rows and columns where there are no values > 0.8
high_corr_filtered = high_corr.dropna(how='all').dropna(axis=1, how='all')

# Display the columns with correlation > 0.8
high_corr_filtered

Unnamed: 0,hot,is_guest_login,count,srv_count,same_srv_rate,dst_host_srv_count,dst_host_same_src_port_rate
hot,,0.843572,,,,,
is_guest_login,0.843572,,,,,,
count,,,,0.943667,,,0.860579
srv_count,,,0.943667,,,,0.944926
same_srv_rate,,,,,,0.898955,
dst_host_srv_count,,,,,0.898955,,
dst_host_same_src_port_rate,,,0.860579,0.944926,,,


In [488]:
df.drop(['is_guest_login','srv_count','dst_host_srv_count'],axis=1,inplace=True)
df.reset_index(drop=True,inplace=True)

In [489]:
df['Attack Type'].value_counts()

Attack Type
dos       391458
normal     97278
probe       4107
r2l         1126
u2r           52
Name: count, dtype: int64

In [490]:
df.isnull().sum().sum()

0

In [491]:
 #Checking for data imbalance
from sklearn.utils import resample

# Separate majority and minority classes
data_majority = df[df['Attack Type'] == 'dos']
data_minority_5 = df[df['Attack Type'] == 'normal']
data_minority_2 = df[df['Attack Type'] == 'probe']
data_minority_3 = df[df['Attack Type'] == 'r2l']
data_minority_4 = df[df['Attack Type'] == 'u2r']

data_minority_5_upsampled = resample(data_minority_5,replace=True,n_samples=len(data_majority),random_state=42)
data_minority_2_upsampled = resample(data_minority_2,replace=True,n_samples=len(data_majority), random_state=42)
data_minority_3_upsampled = resample(data_minority_3,replace=True,n_samples=len(data_majority), random_state=42)
data_minority_4_upsampled = resample(data_minority_4,replace=True,n_samples=len(data_majority), random_state=42)


data_balanced = pd.concat([data_majority,data_minority_5_upsampled,data_minority_2_upsampled,data_minority_3_upsampled,data_minority_4_upsampled])
#print(data_balanced)

print(data_balanced['Attack Type'].value_counts())

Attack Type
dos       391458
normal    391458
probe     391458
r2l       391458
u2r       391458
Name: count, dtype: int64


In [492]:
df['Attack Type'].isnull().sum()

0

In [493]:
df['Combined'] = df['target'].astype(str) + "_" + df['Attack Type'].astype(str)

# Initialize the Label Encoder
label = LabelEncoder()

# Encode the combined labels
df['Encoded'] = label.fit_transform(df['Combined'])

# Check the encoded labels
print("Encoded labels:", df['Encoded'].unique())

# Prepare features (X) and target variable (y)
X = df.drop(['target', 'Attack Type', 'Combined', 'Encoded'], axis=1)  # Drop the original and combined columns
y = df['Encoded']  # Use the encoded column as the target variable

# Scale features
sc = MinMaxScaler()
X = sc.fit_transform(X)

# Split test and train data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("Training data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Encoded labels: [11  1  7 12  9 18  3 14 20 15  5  6  2  0  4 17 13 10  8 22 21 19 16]
Training data shape: (330994, 29) (330994,)
Test data shape: (163027, 29) (163027,)


In [494]:
df['Attack Type'].unique()

array(['normal', 'u2r', 'dos', 'r2l', 'probe'], dtype=object)

In [495]:
# Get all mappings
attack_mapping = {index: name for index, name in enumerate(label.classes_)}
print("Mapping of encoded values to original values:", attack_mapping)

Mapping of encoded values to original values: {0: 'back._dos', 1: 'buffer_overflow._u2r', 2: 'ftp_write._r2l', 3: 'guess_passwd._r2l', 4: 'imap._r2l', 5: 'ipsweep._probe', 6: 'land._dos', 7: 'loadmodule._u2r', 8: 'multihop._r2l', 9: 'neptune._dos', 10: 'nmap._probe', 11: 'normal._normal', 12: 'perl._u2r', 13: 'phf._r2l', 14: 'pod._dos', 15: 'portsweep._probe', 16: 'rootkit._u2r', 17: 'satan._probe', 18: 'smurf._dos', 19: 'spy._r2l', 20: 'teardrop._dos', 21: 'warezclient._r2l', 22: 'warezmaster._r2l'}


In [496]:
df

Unnamed: 0,duration,protocol_type,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,target,Attack Type,Combined,Encoded
0,0,1,0,181,5450,0,0,0,0,0,...,0.0,0.00,9,0.0,0.11,0.00,normal.,normal,normal._normal,11
1,0,1,0,239,486,0,0,0,0,0,...,0.0,0.00,19,0.0,0.05,0.00,normal.,normal,normal._normal,11
2,0,1,0,235,1337,0,0,0,0,0,...,0.0,0.00,29,0.0,0.03,0.00,normal.,normal,normal._normal,11
3,0,1,0,219,1337,0,0,0,0,0,...,0.0,0.00,39,0.0,0.03,0.00,normal.,normal,normal._normal,11
4,0,1,0,217,2032,0,0,0,0,0,...,0.0,0.00,49,0.0,0.02,0.00,normal.,normal,normal._normal,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,1,0,310,1881,0,0,0,0,0,...,0.0,0.40,86,0.0,0.01,0.05,normal.,normal,normal._normal,11
494017,0,1,0,282,2286,0,0,0,0,0,...,0.0,0.00,6,0.0,0.17,0.05,normal.,normal,normal._normal,11
494018,0,1,0,203,1200,0,0,0,0,0,...,0.0,0.17,16,0.0,0.06,0.05,normal.,normal,normal._normal,11
494019,0,1,0,291,1200,0,0,0,0,0,...,0.0,0.17,26,0.0,0.04,0.05,normal.,normal,normal._normal,11


In [497]:
def calculate_tp_tn_fp_fn(confusion_matrix):
    """
    Function to calculate True Positives (TP), True Negatives (TN),
    False Positives (FP), and False Negatives (FN) from a given confusion matrix.

    Args:
    confusion_matrix (numpy.ndarray): The confusion matrix (multiclass)

    Returns:
    dict: A dictionary containing TP, TN, FP, and FN values
    """
    # True Negatives (TN): Normal correctly classified as Normal (Class 0 classified as Class 0)
    TN = confusion_matrix[0, 0]

    # False Positives (FP): Normal misclassified as any Attack (Class 0 misclassified as Class 1-4)
    FP = confusion_matrix[0, 1:].sum()

    # True Positives (TP): Attack correctly classified as Attack (Class 1-4 classified as Class 1-4)
    TP = confusion_matrix[1:, 1:].sum()

    # False Negatives (FN): Attack misclassified as Normal (Class 1-4 misclassified as Class 0)
    FN = confusion_matrix[1:, 0].sum()

    return {
        'True Positives (TP)': TP,
        'True Negatives (TN)': TN,
        'False Positives (FP)': FP,
        'False Negatives (FN)': FN
    }

In [498]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score

In [499]:
clfb = GaussianNB()
start_time_train= time.time()

In [500]:
clfb.fit(X_train, y_train)
end_time_train= time.time()

In [501]:
print(f"Training time: {end_time_train - start_time_train:.4f} seconds")

Training time: 0.6181 seconds


In [502]:
start_time_pred_train= time.time()
y_train_pred=clfb.predict(X_train)
end_time_pred_train= time.time()

In [503]:
start_time_pred_test = time.time()
y_test_pred = clfb.predict(X_test)
end_time_pred_test = time.time()

In [504]:
print(f"Prediction time on test data: {end_time_pred_test - start_time_pred_test:.4f} seconds")


Prediction time on test data: 2.1244 seconds


In [505]:
print("Train score is:", clfb.score(X_train, y_train))
print("Test score is:",clfb.score(X_test,y_test))

Train score is: 0.8569460473603752
Test score is: 0.8567660571561766


In [506]:


# Calculate metrics
precision_gaus = precision_score(y_test, y_test_pred, average='weighted')
accuracy_gaus= accuracy_score(y_test, y_test_pred)
recall_gaus= recall_score(y_test, y_test_pred, average='weighted')
f1_gaus= f1_score(y_test, y_test_pred, average='weighted')
cm_gaus= confusion_matrix(y_test, y_test_pred)
r=calculate_tp_tn_fp_fn(cm_gaus)

# Display results
print(f"Precision: {precision_gaus}")
print(f"Accuracy: {accuracy_gaus}")
print(f"Recall: {recall_gaus}")
print(f"F1 Score: {f1_gaus}")
#print(f"Confusion Matrix:\n{cm_gaus}")
print(r)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.991411704410934
Accuracy: 0.8567660571561766
Recall: 0.8567660571561766
F1 Score: 0.8849105459621383
{'True Positives (TP)': 162261, 'True Negatives (TN)': 716, 'False Positives (FP)': 6, 'False Negatives (FN)': 44}


In [507]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
clfd = DecisionTreeClassifier(criterion="entropy", max_depth=4)

# Train the model
start_time = time.time()
clfd.fit(X_train, y_train)
end_time = time.time()
print("Training time: ", end_time - start_time)

# Predict using the training data
y_train_pred = clfd.predict(X_train)

# Predict using the test data
y_test_pred = clfd.predict(X_test)

#Calculate and display metrics for training and testing
train_accuracy = clfd.score(X_train, y_train)
test_accuracy = clfd.score(X_test, y_test)
print("Train score is:", train_accuracy)
print("Test score is:", test_accuracy)

# Calculate metrics
precision_dec = precision_score(y_test, y_test_pred, average='weighted')
accuracy_dec = accuracy_score(y_test, y_test_pred)
recall_dec = recall_score(y_test, y_test_pred, average='weighted')
f1_dec = f1_score(y_test, y_test_pred, average='weighted')
cm_dec = confusion_matrix(y_test, y_test_pred)
r = calculate_tp_tn_fp_fn(cm_dec)

# Display results
print(f"Precision: {precision_dec}")
print(f"Accuracy: {accuracy_dec}")
print(f"Recall: {recall_dec}")
print(f"F1 Score: {f1_dec}")
#print(f"Confusion Matrix:\n{cm_dec}")
print(r)


Training time:  1.435743808746338
Train score is: 0.9870209127657903
Test score is: 0.9864378293166163
Precision: 0.9792226975685764
Accuracy: 0.9864378293166163
Recall: 0.9864378293166163
F1 Score: 0.9815385800393719
{'True Positives (TP)': 162288, 'True Negatives (TN)': 720, 'False Positives (FP)': 2, 'False Negatives (FN)': 17}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [508]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494021 entries, 0 to 494020
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494021 non-null  int64  
 1   protocol_type                494021 non-null  int64  
 2   flag                         494021 non-null  int64  
 3   src_bytes                    494021 non-null  int64  
 4   dst_bytes                    494021 non-null  int64  
 5   land                         494021 non-null  int64  
 6   wrong_fragment               494021 non-null  int64  
 7   urgent                       494021 non-null  int64  
 8   hot                          494021 non-null  int64  
 9   num_failed_logins            494021 non-null  int64  
 10  logged_in                    494021 non-null  int64  
 11  num_compromised              494021 non-null  int64  
 12  root_shell                   494021 non-null  int64  
 13 

In [509]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier

In [510]:
clfr = RandomForestClassifier(n_estimators=100,max_depth=10,random_state=42)

In [511]:
start_time_train = time.time()
clfr.fit(X_train, y_train)
end_time_train = time.time()
print(f"Training time: {end_time_train - start_time_train:.4f} seconds")

Training time: 27.9190 seconds


In [512]:
start_time_pred_train = time.time()
y_train_pred = clfr.predict(X_train)
end_time_pred_train = time.time()
print(f"Prediction time on training data: {end_time_pred_train - start_time_pred_train:.4f} seconds")

Prediction time on training data: 7.3904 seconds


In [513]:
start_time_pred_test = time.time()
y_test_pred = clfr.predict(X_test)
end_time_pred_test = time.time()
print(f"Prediction time on test data: {end_time_pred_test - start_time_pred_test:.4f} seconds")

Prediction time on test data: 3.6318 seconds


In [514]:
# Train and Test scores
print("Train score is:", clfr.score(X_train, y_train))
print("Test score is:", clfr.score(X_test, y_test))

Train score is: 0.9994803531181834
Test score is: 0.9992455237476001


In [515]:
# Calculate metrics
precision_rand = precision_score(y_test, y_test_pred, average='weighted')
accuracy_rand = accuracy_score(y_test, y_test_pred)
recall_rand = recall_score(y_test, y_test_pred, average='weighted')
f1_rand = f1_score(y_test, y_test_pred, average='weighted')
cm_rand = confusion_matrix(y_test, y_test_pred)
r = calculate_tp_tn_fp_fn(cm_rand)

# Display results
print(f"Precision: {precision_rand:.4f}")
print(f"Accuracy: {accuracy_rand:.4f}")
print(f"Recall: {recall_rand:.4f}")
print(f"F1 Score: {f1_rand:.4f}")
#print(f"Confusion Matrix:\n{cm_rand}")
print(r)

Precision: 0.9992
Accuracy: 0.9992
Recall: 0.9992
F1 Score: 0.9992
{'True Positives (TP)': 162305, 'True Negatives (TN)': 721, 'False Positives (FP)': 1, 'False Negatives (FN)': 0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [516]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\sreev\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [517]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Initialize XGBoost Classifier (no need for use_label_encoder, it's deprecated)
clfg = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='mlogloss', random_state=42)

# Encode target labels (y_train) from string to numeric
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test) 

In [518]:
import time

# Record the start time for training
start_time = time.time()
# Train the model
clfg.fit(X_train, y_train_encoded)
end_time = time.time()
print(f"Training time: {end_time - start_time:.4f} seconds")


Training time: 57.8549 seconds


In [519]:
# Time the prediction process
start_time = time.time()
y_test_pred_xgb_encoded = clfg.predict(X_test)
end_time = time.time()
print(f"Testing time: {end_time - start_time:.4f} seconds")

Testing time: 1.2120 seconds


In [520]:
y_test_pred_xgb = label_encoder.inverse_transform(y_test_pred_xgb_encoded)

In [521]:
# Calculate performance metrics
precision_xgb = precision_score(y_test, y_test_pred_xgb, average='weighted')
accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
recall_xgb = recall_score(y_test, y_test_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_test_pred_xgb, average='weighted')
cm_xgb = confusion_matrix(y_test, y_test_pred_xgb)

# Assuming calculate_tp_tn_fp_fn function is defined to take confusion matrix as input
r = calculate_tp_tn_fp_fn(cm_xgb)

# Display results
print(f"Precision (XGBoost): {precision_xgb:.4f}")
print(f"Accuracy (XGBoost): {accuracy_xgb:.4f}")
print(f"Recall (XGBoost): {recall_xgb:.4f}")
print(f"F1 Score (XGBoost): {f1_xgb:.4f}")
#print(f"Confusion Matrix (XGBoost):\n{cm_xgb}")
print(r)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision (XGBoost): 0.9997
Accuracy (XGBoost): 0.9997
Recall (XGBoost): 0.9997
F1 Score (XGBoost): 0.9997
{'True Positives (TP)': 162305, 'True Negatives (TN)': 722, 'False Positives (FP)': 0, 'False Negatives (FN)': 0}


In [522]:
from tabulate import tabulate

# Data for all the models with precomputed values
data = [
    ["Random Forest",accuracy_rand,precision_rand,recall_rand,f1_rand],
    ["Decision Tree",accuracy_dec,precision_rand,recall_dec,f1_dec],
    ["Naive Bayes",accuracy_gaus,precision_gaus,recall_gaus,f1_gaus],
    ["Gradient Boosting",precision_xgb,accuracy_xgb,recall_xgb,f1_xgb]
]

# Define column names
col_names = ["Model", "Accuracy", "Precision", "Recall", "F1 Score"]

# Print the table
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))


╒═══════════════════╤════════════╤═════════════╤══════════╤════════════╕
│ Model             │   Accuracy │   Precision │   Recall │   F1 Score │
╞═══════════════════╪════════════╪═════════════╪══════════╪════════════╡
│ Random Forest     │   0.999246 │    0.999177 │ 0.999246 │   0.999188 │
├───────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ Decision Tree     │   0.986438 │    0.999177 │ 0.986438 │   0.981539 │
├───────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ Naive Bayes       │   0.856766 │    0.991412 │ 0.856766 │   0.884911 │
├───────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ Gradient Boosting │   0.999733 │    0.999736 │ 0.999736 │   0.999729 │
╘═══════════════════╧════════════╧═════════════╧══════════╧════════════╛


In [523]:
print(type(clfr), clfr)
print(type(clfg), clfg)
print(type(clfd), clfd)
print(type(clfb), clfb)


<class 'sklearn.ensemble._forest.RandomForestClassifier'> RandomForestClassifier(max_depth=10, random_state=42)
<class 'xgboost.sklearn.XGBClassifier'> XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, objective='multi:softprob', ...)
<class 'sklearn.tree._classes.DecisionTreeClassifier'> DecisionTreeClassifier(cr

In [524]:
import joblib

try:
    # Save the trained models
    joblib.dump(clfr, 'random_forest_model.pkl')
    joblib.dump(clfg, 'gradient_boosting_model.pkl')
    joblib.dump(clfd, 'decision_tree_model.pkl')
    joblib.dump(clfb, 'naive_bayes_model.pkl')
    print("Models saved successfully!")
except Exception as e:
    print(f"Error saving models: {e}")


Models saved successfully!
