## 1. Read the article and reproduce the results (Accuracy, Precision, Recall, F-Measure) for NSL-KDD dataset using following classification methods:
## ● SVM Linear
## ● SVM Quadratic
## ● SVM Cubic
## ● KNN Fine
## ● KNN Medium
## ● KNN Cubic
## ● TREE Fine
## ● TREE Medium
## These results can be found in Table 4 of the manuscript and should be used for comparison purposes, if required. Write a report summarising the dataset, used ML methods, experiment protocol and results including variations, if any. During reproducing the results:
## i) you should use the same set of features used by the authors.
## ii) you should use the same classifier with exact parameter values.
## iii) you should use the same training/test splitting approach as used by the authors.
## iv) you should use the same pre/post processing, if any, used by the authors.

In [1]:
# **Dataset Preprocessing:** Used the "FieldNames.pdf" document to preprocess the "Intrusion_detection_NSL_KDD.csv" dataset. 
# This involves cleaning the data, normalizing features, and encoding categorical variables.

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
dataset = pd.read_csv("Intrusion_detection_NSL_KDD.csv")
dataset.dropna(inplace=True)

In [4]:
continuous_features = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised',
                       'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
                       'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
                       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                       'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
scaler = StandardScaler()
dataset[continuous_features] = scaler.fit_transform(dataset[continuous_features])

In [5]:
categorical_features = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
label_encoder = LabelEncoder()
for feature in categorical_features:
    dataset[feature] = label_encoder.fit_transform(dataset[feature])

In [6]:
attack_mapping = {
    'back': 'dos', 'land': 'dos', 'neptune': 'dos', 'pod': 'dos', 'smurf': 'dos', 'teardrop': 'dos', 'mailbomb': 'dos', 'apache2': 'dos',
    'processtable': 'dos', 'udpstorm': 'dos', 'worm': 'dos', 'ipsweep': 'probe', 'nmap': 'probe', 'portsweep': 'probe', 'satan': 'probe',
    'mscan': 'probe', 'saint': 'probe', 'ftp_write': 'r2l', 'guess_passwd': 'r2l', 'imap': 'r2l', 'multihop': 'r2l', 'phf': 'r2l',
    'spy': 'r2l', 'warezclient': 'r2l', 'warezmaster': 'r2l', 'sendmail': 'r2l', 'named': 'r2l', 'snmpgetattack': 'r2l',
    'snmpguess': 'r2l', 'xlock': 'r2l', 'xsnoop': 'r2l', 'httptunnel': 'r2l', 'buffer_overflow': 'u2r', 'loadmodule': 'u2r',
    'perl': 'u2r', 'rootkit': 'u2r', 'ps': 'u2r', 'sqlattack': 'u2r', 'xterm': 'u2r', 'attack': 'u2r'
}
dataset['attack_type'] = dataset['attack_type'].map(attack_mapping)

In [7]:
dataset.to_csv("preprocessed_dataset.csv", index=False)

In [8]:
## **Reproduce Results:** For each of the following classification methods mentioned in the article, 
## reproduced the reported results (Accuracy, Precision, Recall, F-Measure):
##    - SVM Linear
##    - SVM Quadratic
##    - SVM Cubic
##    - KNN Fine
##    - KNN Medium
##    - KNN Cubic
##    - TREE Fine
##    - TREE Medium


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('preprocessed_dataset.csv')

In [10]:
data = data.dropna(subset=['attack_type'])
X = data.drop(columns=['attack_type'])
y = data['attack_type']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
classifiers = {
    'SVM Linear': SVC(kernel='linear', random_state=42),
    'SVM Quadratic': SVC(kernel='poly', degree=2, random_state=42),
    'SVM Cubic': SVC(kernel='poly', degree=3, random_state=42),
    'KNN Fine': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'KNN Medium': KNeighborsClassifier(n_neighbors=10, weights='uniform'),
    'KNN Cubic': KNeighborsClassifier(n_neighbors=20, weights='distance'),
    'TREE Fine': DecisionTreeClassifier(max_depth=10, random_state=42),
    'TREE Medium': DecisionTreeClassifier(max_depth=20, random_state=42),
}

In [14]:
results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

In [15]:
for name, scores in results.items():
    print(f'{name}:')
    print(f'Accuracy: {scores["Accuracy"]:.4f}')
    print(f'Precision: {scores["Precision"]:.4f}')
    print(f'Recall: {scores["Recall"]:.4f}')
    print(f'F1: {scores["F1"]:.4f}')
    print('-' * 20)

SVM Linear:
Accuracy: 0.9933
Precision: 0.9932
Recall: 0.9933
F1: 0.9932
--------------------
SVM Quadratic:
Accuracy: 0.9948
Precision: 0.9947
Recall: 0.9948
F1: 0.9947
--------------------
SVM Cubic:
Accuracy: 0.9943
Precision: 0.9942
Recall: 0.9943
F1: 0.9942
--------------------
KNN Fine:
Accuracy: 0.9969
Precision: 0.9969
Recall: 0.9969
F1: 0.9969
--------------------
KNN Medium:
Accuracy: 0.9962
Precision: 0.9962
Recall: 0.9962
F1: 0.9962
--------------------
KNN Cubic:
Accuracy: 0.9967
Precision: 0.9967
Recall: 0.9967
F1: 0.9967
--------------------
TREE Fine:
Accuracy: 0.9966
Precision: 0.9967
Recall: 0.9966
F1: 0.9966
--------------------
TREE Medium:
Accuracy: 0.9969
Precision: 0.9969
Recall: 0.9969
F1: 0.9969
--------------------


In [16]:
import pandas as pd

data = pd.read_csv('preprocessed_dataset.csv')

In [17]:
features_used = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
                 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
                 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
                 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
                 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

X = data[features_used]
y = data['attack_type']

In [18]:
clean_data = pd.concat([X, y], axis=1).dropna()
X = clean_data[features_used]
y = clean_data['attack_type']

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

classifier = SVC(kernel='linear', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [22]:
print('Accuracy:', accuracy)

Accuracy: 0.993283425453019


In [23]:
## **Comparison and Analysis:** Once we've reproduced the results, compare them with the reported results in Table 4 of the 
## article.

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
data = pd.read_csv('preprocessed_dataset.csv')
data = data.dropna(subset=['attack_type'])
X = data.drop(columns=['attack_type'])
y = data['attack_type']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
classifiers = {
    'SVM Linear': SVC(kernel='linear', random_state=42),
    'SVM Quadratic': SVC(kernel='poly', degree=2, random_state=42),
    'SVM Cubic': SVC(kernel='poly', degree=3, random_state=42),
    'KNN Fine': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'KNN Medium': KNeighborsClassifier(n_neighbors=10, weights='uniform'),
    'KNN Cubic': KNeighborsClassifier(n_neighbors=20, weights='distance'),
    'TREE Fine': DecisionTreeClassifier(max_depth=10, random_state=42),
    'TREE Medium': DecisionTreeClassifier(max_depth=20, random_state=42),
}

In [29]:
results = {}
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

In [30]:
print("Classification Results:")
print("=" * 30)
for name, scores in results.items():
    print(f'{name}:')
    print(f'Accuracy: {scores["Accuracy"]:.4f}')
    print(f'Precision: {scores["Precision"]:.4f}')
    print(f'Recall: {scores["Recall"]:.4f}')
    print(f'F1: {scores["F1"]:.4f}')
    print('-' * 20)

Classification Results:
SVM Linear:
Accuracy: 0.9933
Precision: 0.9932
Recall: 0.9933
F1: 0.9932
--------------------
SVM Quadratic:
Accuracy: 0.9948
Precision: 0.9947
Recall: 0.9948
F1: 0.9947
--------------------
SVM Cubic:
Accuracy: 0.9943
Precision: 0.9942
Recall: 0.9943
F1: 0.9942
--------------------
KNN Fine:
Accuracy: 0.9969
Precision: 0.9969
Recall: 0.9969
F1: 0.9969
--------------------
KNN Medium:
Accuracy: 0.9962
Precision: 0.9962
Recall: 0.9962
F1: 0.9962
--------------------
KNN Cubic:
Accuracy: 0.9967
Precision: 0.9967
Recall: 0.9967
F1: 0.9967
--------------------
TREE Fine:
Accuracy: 0.9966
Precision: 0.9967
Recall: 0.9966
F1: 0.9966
--------------------
TREE Medium:
Accuracy: 0.9969
Precision: 0.9969
Recall: 0.9969
F1: 0.9969
--------------------


In [31]:
reported_results = {
    'SVM Linear': {
        'Accuracy': 0.9937,
        'Precision': 0.8521,
        'Recall': 0.8521,
        'F1': 0.8534,
    },
    'SVM Quadratic': {
        'Accuracy': 0.9962,
        'Precision': 0.9668,
        'Recall': 0.9668,
        'F1': 0.9346,
    },
    'SVM Cubic': {
        'Accuracy': 0.9955,
        'Precision': 0.9382,
        'Recall': 0.9382,
        'F1': 0.922,
    },
    'KNN Fine': {
        'Accuracy': 0.9546,
        'Precision': 0.8239,
        'Recall': 0.8239,
        'F1': 0.8182,
    },
    'KNN Medium': {
        'Accuracy': 0.9479,
        'Precision': 0.8138,
        'Recall': 0.8138,
        'F1': 0.7942,
    },
    'KNN Cubic': {
        'Accuracy': 0.9448,
        'Precision': 0.8103,
        'Recall': 0.8103,
        'F1': 0.7905,
    },
    'TREE Fine': {
        'Accuracy': 0.9984,
        'Precision': 0.979,
        'Recall': 0.979,
        'F1': 0.968,
    },
    'TREE Medium': {
        'Accuracy': 0.9984,
        'Precision': 0.9942,
        'Recall': 0.9942,
        'F1': 0.9754,
    },
}

print("\nComparison with Reported Results:")
print("=" * 30)
for name, reported_scores in reported_results.items():
    print(f'{name}:')
    for metric, reported_score in reported_scores.items():
        difference = abs(results[name][metric] - reported_score)
        print(f'{metric}:')
        print(f'   Reproduced: {results[name][metric]:.4f}')
        print(f'   Reported:   {reported_score:.4f}')
        print(f'   Difference:  {difference:.4f}')
        print('-' * 20)



Comparison with Reported Results:
SVM Linear:
Accuracy:
   Reproduced: 0.9933
   Reported:   0.9937
   Difference:  0.0004
--------------------
Precision:
   Reproduced: 0.9932
   Reported:   0.8521
   Difference:  0.1411
--------------------
Recall:
   Reproduced: 0.9933
   Reported:   0.8521
   Difference:  0.1412
--------------------
F1:
   Reproduced: 0.9932
   Reported:   0.8534
   Difference:  0.1398
--------------------
SVM Quadratic:
Accuracy:
   Reproduced: 0.9948
   Reported:   0.9962
   Difference:  0.0014
--------------------
Precision:
   Reproduced: 0.9947
   Reported:   0.9668
   Difference:  0.0279
--------------------
Recall:
   Reproduced: 0.9948
   Reported:   0.9668
   Difference:  0.0280
--------------------
F1:
   Reproduced: 0.9947
   Reported:   0.9346
   Difference:  0.0601
--------------------
SVM Cubic:
Accuracy:
   Reproduced: 0.9943
   Reported:   0.9955
   Difference:  0.0012
--------------------
Precision:
   Reproduced: 0.9942
   Reported:   0.9382
   D

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [33]:
data = pd.read_csv('preprocessed_dataset.csv')
features_used = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
                 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
                 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
                 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
                 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

X = data[features_used]
y = data['attack_type']
clean_data = pd.concat([X, y], axis=1).dropna()
X = clean_data[features_used]
y = clean_data['attack_type']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
classifier = SVC(kernel='linear', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.993283425453019


In [37]:
reported_results = {
    'SVM Linear': {
        'Accuracy': 0.9964,
        'Precision': 0.9864,
        'Recall': 0.9864,
        'F1': 0.9803,
    },
    'SVM Quadratic': {
        'Accuracy': 0.9981,
        'Precision': 0.9941,
        'Recall': 0.9941,
        'F1': 0.9935,
    },
    'SVM Cubic': {
        'Accuracy': 0.9913,
        'Precision': 0.9876,
        'Recall': 0.9876,
        'F1': 0.9858,
    },
}

print("\nComparison with Reported Results:")
print("=" * 30)
for name, reported_scores in reported_results.items():
    print(f'{name}:')
    for metric, reported_score in reported_scores.items():
        difference = abs(accuracy - reported_score)
        print(f'{metric}:')
        print(f'   Reproduced: {accuracy:.4f}')
        print(f'   Reported:   {reported_score:.4f}')
        print(f'   Difference:  {difference:.4f}')
        print('-' * 20)


Comparison with Reported Results:
SVM Linear:
Accuracy:
   Reproduced: 0.9933
   Reported:   0.9964
   Difference:  0.0031
--------------------
Precision:
   Reproduced: 0.9933
   Reported:   0.9864
   Difference:  0.0069
--------------------
Recall:
   Reproduced: 0.9933
   Reported:   0.9864
   Difference:  0.0069
--------------------
F1:
   Reproduced: 0.9933
   Reported:   0.9803
   Difference:  0.0130
--------------------
SVM Quadratic:
Accuracy:
   Reproduced: 0.9933
   Reported:   0.9981
   Difference:  0.0048
--------------------
Precision:
   Reproduced: 0.9933
   Reported:   0.9941
   Difference:  0.0008
--------------------
Recall:
   Reproduced: 0.9933
   Reported:   0.9941
   Difference:  0.0008
--------------------
F1:
   Reproduced: 0.9933
   Reported:   0.9935
   Difference:  0.0002
--------------------
SVM Cubic:
Accuracy:
   Reproduced: 0.9933
   Reported:   0.9913
   Difference:  0.0020
--------------------
Precision:
   Reproduced: 0.9933
   Reported:   0.9876
   D

## **Solution Design:** Devise a novel ML solution for the intrusion detection problem using the NSL-KDD dataset. This solution should substantially differ from the methods presented in the article. You can choose to modify feature selection approaches, parameter optimization processes, or even select different ML algorithms.


In [38]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dataset = pd.read_csv("Intrusion_detection_NSL_KDD.csv")
dataset.dropna(inplace=True)

continuous_features = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised',
                       'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
                       'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
                       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                       'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
scaler = StandardScaler()
dataset[continuous_features] = scaler.fit_transform(dataset[continuous_features])

categorical_features = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
label_encoder = LabelEncoder()
for feature in categorical_features:
    dataset[feature] = label_encoder.fit_transform(dataset[feature])

attack_mapping = {
        'back': 'dos', 'land': 'dos', 'neptune': 'dos', 'pod': 'dos', 'smurf': 'dos', 'teardrop': 'dos', 'mailbomb': 'dos', 'apache2': 'dos',
        'processtable': 'dos', 'udpstorm': 'dos', 'worm': 'dos', 'ipsweep': 'probe', 'nmap': 'probe', 'portsweep': 'probe', 'satan': 'probe',
        'mscan': 'probe', 'saint': 'probe', 'ftp_write': 'r2l', 'guess_passwd': 'r2l', 'imap': 'r2l', 'multihop': 'r2l', 'phf': 'r2l',
        'spy': 'r2l', 'warezclient': 'r2l', 'warezmaster': 'r2l', 'sendmail': 'r2l', 'named': 'r2l', 'snmpgetattack': 'r2l',
        'snmpguess': 'r2l', 'xlock': 'r2l', 'xsnoop': 'r2l', 'httptunnel': 'r2l', 'buffer_overflow': 'u2r', 'loadmodule': 'u2r',
        'perl': 'u2r', 'rootkit': 'u2r', 'ps': 'u2r', 'sqlattack': 'u2r', 'xterm': 'u2r', 'attack': 'u2r'
    }
dataset['attack_type'] = dataset['attack_type'].map(attack_mapping)


dataset.to_csv("preprocessed_dataset.csv", index=False)

In [39]:
data = pd.read_csv('preprocessed_dataset.csv')
data = data.dropna(subset=['attack_type'])

X = data.drop(columns=['attack_type'])
y = data['attack_type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Random Forest Classifier Results:")
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1: {f1:.4f}')

Random Forest Classifier Results:
Accuracy: 0.9989
Precision: 0.9989
Recall: 0.9989
F1: 0.9989
