In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [2]:
train_url = 'NSL_KDD_Train.csv'
test_url = 'NSL_KDD_Test.csv'

In [3]:
col_names = ["duration", "protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(train_url,header=None, names = col_names, low_memory=False)

df_test = pd.read_csv(test_url, header=None, names = col_names, low_memory=False)

print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


In [4]:
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64

Label distribution Test set:
label
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattac

In [6]:
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
service
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: count, dtype: int64


In [7]:
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


In [8]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head()



Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [9]:
# Protocol Type
unique_protocol_test = sorted(df_test.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2_test = [string1 + x for x in unique_protocol_test]

# Service
unique_service_test = sorted(df_test.service.unique())
string2 = 'service_'
unique_service2_test = [string2 + x for x in unique_service_test]

# Flag
unique_flag_test = sorted(df_test.flag.unique())
string3 = 'flag_'
unique_flag2_test = [string3 + str(x) for x in unique_flag_test]

# Put together
testdumcols = unique_protocol2_test + unique_service2_test + unique_flag2_test

In [10]:
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)

print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

  protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [11]:
# Convert all values in categorical columns to strings
df_categorical_values = df_categorical_values.astype(str)
testdf_categorical_values = testdf_categorical_values.astype(str)

# Combine categorical values from training and test datasets
combined_categorical_values = pd.concat([df_categorical_values, testdf_categorical_values], axis=0)

# Fit the OneHotEncoder on the combined data
enc = OneHotEncoder(categories='auto')
enc.fit(combined_categorical_values)


# Manually create the feature names
feature_names = []
for i, col in enumerate(categorical_columns):
    for category in enc.categories_[i]:
        feature_names.append(f"{col}_{category}")

# Transform the training and test sets
df_categorical_values_encenc = enc.transform(df_categorical_values)
testdf_categorical_values_encenc = enc.transform(testdf_categorical_values)

# Convert the transformed data into DataFrames
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(), columns=feature_names)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(), columns=feature_names)



In [12]:
df_cat_data.head()

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Identify unique 'service' values in training and test sets
trainservice = set(df)
testservice = set(df_test)

# Find the differences
difference = list(trainservice - testservice)

# Prepend 'service_' to the differing values
# string = 'service_'
# difference = [string + x for x in difference]

# Add missing columns to the test data and initialize them with 0
for col in difference:
    df_test[col] = 0

df.drop(['flag', 'protocol_type', 'service'], axis=1, inplace=True)
df_test.drop(['flag', 'protocol_type', 'service'], axis=1, inplace=True)

print(df.shape)
print(df_test.shape)
print(df.columns)
print(df_test.columns)


(125973, 39)
(22544, 39)
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell'

In [14]:


labeldf=df['label']
labeldf_test=df_test['label']


# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})



# put the new label column back
df['label'] = newlabeldf
df_test['label'] = newlabeldf_test



In [16]:
# Define labels to be dropped
to_drop_labels = [0, 1, 2, 3, 4]

# Filter out rows with labels to be dropped
filtered_df = df[df['label'].isin(to_drop_labels)]
filtered_df_test = df_test[df_test['label'].isin(to_drop_labels)]

print('Train:')
print('Dimensions of filtered data:', filtered_df.shape)
print()
print('Test:')
print('Dimensions of filtered test data:', filtered_df_test.shape)


Train:
Dimensions of filtered data: (125973, 39)

Test:
Dimensions of filtered test data: (22544, 39)


In [17]:
X = filtered_df.drop('label', axis=1)
Y = filtered_df.label

X_test = filtered_df_test.drop('label', axis=1)
Y_test = filtered_df_test.label

In [18]:
colNames=list(X)
colNames_test=list(X_test)

In [19]:
# Reset the indices
X_test = X_test.reset_index(drop=True)
testdf_cat_data = testdf_cat_data.reset_index(drop=True)

# Concatenate X_test and testdf_cat_data
X_test = pd.concat([X_test, testdf_cat_data], axis=1)


In [20]:
# Now you can concatenate without considering indices
X = pd.concat([X.reset_index(drop=True), df_cat_data.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), testdf_cat_data.reset_index(drop=True)], axis=1)


# Reset the index of df_cat_data
df_cat_data = df_cat_data.reset_index(drop=True)

# Do the same for the test set
testdf_cat_data = testdf_cat_data.reset_index(drop=True)

# Now you can concatenate without overlapping indices
X = pd.concat([X, df_cat_data], axis=1)
X_test = pd.concat([X_test, testdf_cat_data], axis=1)


In [24]:
print(X.std(axis=0))

transformed_duration          NaN
transformed_src_bytes         NaN
transformed_dst_bytes         NaN
transformed_land              NaN
transformed_wrong_fragment    NaN
                             ... 
cat_flag_S1                   0.0
cat_flag_S2                   0.0
cat_flag_S3                   0.0
cat_flag_SF                   0.0
cat_flag_SH                   0.0
Length: 2654, dtype: float64


In [25]:
# Get the column names from the original DataFrame
original_columns = filtered_df.drop('label', axis=1).columns

# Create new column names for the transformed data
transformed_columns = [f"transformed_{col}" for col in original_columns]

# Combine the original and one-hot encoded feature names
all_feature_names = transformed_columns + list(df_cat_data.columns)

# Get the names of selected features
selected_feature_names = [all_feature_names[i] for i in selected_feature_indices]

# Print the names of selected features
print("Selected feature names:", selected_feature_names)

NameError: name 'selected_feature_indices' is not defined

In [73]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

# Scale the input features to ensure they are non-negative
scaler = MinMaxScaler()
X_new = scaler.fit_transform(X)
selector_chi2 = SelectKBest(chi2, k=20)

X_new_chi2 = selector_chi2.fit_transform(X_new, Y)
colindex3 = selector_chi2.get_support(indices=True)
colname3 = [colNames[i] for i in colindex3]
colname3

IndexError: list index out of range

In [1]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Create a SelectKBest instance with mutual information
selector_mi = SelectKBest(mutual_info_regression, k=20)  # Select the top 20 features based on mutual information

# Fit and transform the data for feature selection based on mutual information
X_new_mi = selector_mi.fit_transform(X, Y)
colindex5 = selector_mi.get_support(indices=True)
colname5 = [colNames[i] for i in colindex5]
colname5


NameError: name 'X' is not defined

In [2]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Create a Lasso model for feature selection
lasso = Lasso(alpha=0.001)  # You can adjust the alpha parameter for regularization strength

# Use SelectFromModel to select features based on Lasso coefficients
selector_lasso = SelectFromModel(lasso, max_features=20)  # Select only 20 features

# Fit and transform the data for DoS
X_new = selector_lasso.fit_transform(X, Y)
colindex4 = selector_lasso.get_support(indices=True)
colname4 = [colNames[i] for i in colindex4]
colname4

NameError: name 'X' is not defined

In [3]:
from collections import Counter

def common_in_three_or_more(lists):
    # Count occurrences of each number across all lists
    count = Counter()
    for lst in lists:
        count.update(set(lst))

    # Select numbers that are common in at least three lists
    common_numbers = [num for num, freq in count.items() if freq >= 3]

    return common_numbers

# Example usage:
list1 = colname1
list2 = colname2
list3 = colname3
list4 = colname4
list5 = colname5


lists = [list1, list2, list3, list4, list5]

result = common_in_three_or_more(lists)
print("Numbers common in at least three lists:",result)
print(len(result))



NameError: name 'colname1' is not defined

In [4]:
l=[]
for i in result:
   l.append(colNames.index(i))

print(l)

NameError: name 'result' is not defined

In [5]:
# from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


# clf = RandomForestClassifier(n_estimators=10,n_jobs=2)
# rfe = RFE(estimator=clf, n_features_to_select=12, step=1)
# rfe.fit(X, Y.astype(int))
# X_rfe=rfe.transform(X)

colindex=l
colname= result

selected_features = [X[:, index] for index in colindex]
X_rfe = np.column_stack(selected_features)

print(colindex)

NameError: name 'result' is not defined

In [6]:
print("FEATURES SELECTED BY ENSEMBLED METHOD \n\n")
print('Features selected for DoS:',colname)

FEATURES SELECTED BY ENSEMBLED METHOD 




NameError: name 'colname' is not defined

In [7]:
print(X_rfe.shape)


NameError: name 'X_rfe' is not defined

In [8]:
clf_rf=RandomForestClassifier(n_estimators=10,n_jobs=1)
clf_rf.fit(X_rfe, Y.astype(int))


NameError: name 'X_rfe' is not defined

In [9]:
Y_pred2=clf_rf.predict(X_test2)
# Create confusion matrix
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])


NameError: name 'X_test2' is not defined

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute metrics with appropriate averaging
accuracy = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f1 = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f1.mean(), f1.std() * 2))


NameError: name 'X_test2' is not defined

In [12]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Calculate ROC curve for each class
Y_probs = clf_rf.predict_proba(X_test2)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(Y_probs[0])):
    fpr[i], tpr[i], _ = roc_curve((Y_test == i).astype(int), Y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(len(Y_probs[0])):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


NameError: name 'X_test2' is not defined

In [13]:
from sklearn.neighbors import KNeighborsClassifier
clf_KNN=KNeighborsClassifier()
clf_KNN.fit(X_rfe, Y.astype(int))


NameError: name 'X_rfe' is not defined

In [14]:
Y_pred2=clf_KNN.predict(X_test2)
# Create confusion matrix
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])


NameError: name 'X_test2' is not defined

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute metrics with appropriate averaging
accuracy = cross_val_score(clf_KNN, X_test2, Y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_KNN, X_test2, Y_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_KNN, X_test2, Y_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f1 = cross_val_score(clf_KNN, X_test2, Y_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f1.mean(), f1.std() * 2))


NameError: name 'X_test2' is not defined

In [16]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Calculate ROC curve for each class
Y_probs = clf_KNN.predict_proba(X_test2)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(Y_probs[0])):
    fpr[i], tpr[i], _ = roc_curve((Y_test == i).astype(int), Y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(len(Y_probs[0])):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


NameError: name 'X_test2' is not defined

In [17]:
from sklearn.svm import SVC

clf_SVM=SVC(kernel='linear', C=1.0, random_state=0)

In [18]:
clf_SVM.fit(X_rfe, Y.astype(int))

NameError: name 'X_rfe' is not defined

In [19]:
Y_pred2=clf_SVM.predict(X_test2)

# Create confusion matrix
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

NameError: name 'X_test2' is not defined

In [20]:
Y_pred2

NameError: name 'Y_pred2' is not defined

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred2)

print("Accuracy:", accuracy)
# Calculate precision
precision = precision_score(Y_test, Y_pred2, average='weighted')
print("Precision:", precision)
recall = recall_score(Y_test, Y_pred2, average='weighted')

print("Recall:", recall)
f1 = f1_score(Y_test, Y_pred2, average='weighted')

print("F1-score:", f1)


NameError: name 'Y_test' is not defined

In [22]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred2))

NameError: name 'Y_test' is not defined

In [23]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Fit the SVM classifier

# Predict probabilities for each class
Y_probs = clf_SVM.decision_function(X_test2)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(Y_probs[0])):
    fpr[i], tpr[i], _ = roc_curve((Y_test == i).astype(int), Y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(len(Y_probs[0])):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


NameError: name 'X_test2' is not defined

In [24]:
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.naive_bayes import GaussianNB

# Create Gaussian Naive Bayes classifiers
clf_NB = GaussianNB()

In [25]:
clf_NB.fit(X_rfe, Y.astype(int))


NameError: name 'X_rfe' is not defined

In [26]:
Y_pred2 = clf_NB.predict(X_test2)
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])


NameError: name 'X_test2' is not defined

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred2)

print("Accuracy:", accuracy)
# Calculate precision
precision = precision_score(Y_test, Y_pred2, average='weighted')
print("Precision:", precision)
recall = recall_score(Y_test, Y_pred2, average='weighted')

print("Recall:", recall)
f1 = f1_score(Y_test, Y_pred2, average='weighted')

print("F1-score:", f1)


NameError: name 'Y_test' is not defined

In [28]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred2))


NameError: name 'Y_test' is not defined

In [29]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Instantiate and fit the Naive Bayes classifier


# Predict probabilities for each class
Y_probs = clf_NB.predict_proba(X_test2)

# Binarize the labels
y_test_binarized = label_binarize(Y_test, classes=np.unique(Y_test))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(np.unique(Y_test))
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], Y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()



NameError: name 'X_test2' is not defined

In [30]:
from sklearn.linear_model import LogisticRegression

# Create Logistic Regression classifiers
clf_LR = LogisticRegression(random_state=0, max_iter=1000)

In [31]:
clf_LR.fit(X_rfe, Y.astype(int))


NameError: name 'X_rfe' is not defined

In [32]:
Y_pred2 = clf_LR.predict(X_test2)
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])


NameError: name 'X_test2' is not defined

In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute metrics with appropriate averaging
accuracy = cross_val_score(clf_LR, X_test2, Y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_LR, X_test2, Y_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_LR, X_test2, Y_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f1 = cross_val_score(clf_LR, X_test2, Y_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f1.mean(), f1.std() * 2))


NameError: name 'X_test2' is not defined

In [34]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Fit the Logistic Regression classifier

# Predict probabilities for each class
Y_probs = clf_LR.predict_proba(X_test2)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(Y_probs[0])):
    fpr[i], tpr[i], _ = roc_curve((Y_test == i).astype(int), Y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(len(Y_probs[0])):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


NameError: name 'X_test2' is not defined

In [35]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Initialize classifiers with probability=True
clf_SVM = SVC(probability=True)
clf_NB = GaussianNB()
clf_LR = LogisticRegression(random_state=0, max_iter=1000)

In [36]:
clf_voting = VotingClassifier(estimators=[
    ('rf', clf_rf),
    ('knn', clf_KNN),
    ('svm', clf_SVM),
    ('NB', clf_NB),
    ('LR', clf_LR)
], voting='soft')

# Fit the VotingClassifier
clf_voting.fit(X_rfe, Y.astype(int))

NameError: name 'X_rfe' is not defined

In [37]:
import pickle as pkl
pkl.dump(clf_voting,open("finalpro.p","wb"))

In [38]:
l=pkl.load(open("finalpro.p","rb"))
# [8, 27, 30, 36, 19, 31, 32, 26, 28, 33, 21, 35, 23, 24, 29, 37, 6, 22, 34, 25]
input=(0,0,0.04,1,229,0.06,0,0.06,255,0,0,0,1,1,10,1,0,0,0,0.04)
i=np.asarray(input)
i1=i.reshape(1,-1)
prediction=l.predict(i1)

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [39]:
prediction

NameError: name 'prediction' is not defined

In [40]:
# Predict using the VotingClassifier
Y_pred_voting = clf_voting.predict(X_test2)

# Create confusion matrix
confusion_matrix_voting = pd.crosstab(Y_test, Y_pred_voting, rownames=['Actual attacks'], colnames=['Predicted attacks'])

# Print the confusion matrix
print(confusion_matrix_voting)

NameError: name 'X_test2' is not defined

In [41]:
import numpy as np

# Assuming y_true contains the true labels and y_pred contains the predicted labels
# Example:
# y_true = [0, 1, 1, 0, 1]
# y_pred = [0, 1, 0, 0, 1]

# Convert lists to numpy arrays for easier manipulation
y_true = np.array(Y_test)
y_pred = np.array(Y_pred2)

# Find the indices where true positive predictions occur
tp_indices = np.where((y_true == 1) & (y_pred == 1))[0]

print("Indices of True Positive Predictions:", tp_indices)

# If you want to extract the corresponding rows from your dataset, assuming your dataset is a numpy array named 'data':
tp_rows = df[tp_indices]

print("Rows corresponding to True Positive Predictions:")
print(tp_rows)


NameError: name 'Y_test' is not defined

In [42]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute metrics with appropriate averaging
accuracy = cross_val_score(clf_voting, X_test2, Y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_voting, X_test2, Y_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_voting, X_test2, Y_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f1 = cross_val_score(clf_voting, X_test2, Y_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f1.mean(), f1.std() * 2))


NameError: name 'X_test2' is not defined

In [43]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred2))


NameError: name 'Y_test' is not defined

In [44]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# Predict probabilities for each class
Y_probs_voting = clf_voting.predict_proba(X_test2)

# Binarize the labels
y_test_binarized = label_binarize(Y_test, classes=np.unique(Y_test))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(np.unique(Y_test))
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], Y_probs_voting[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(10, 6))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve (class {}) (area = {:0.2f})'.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Plot diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


NameError: name 'X_test2' is not defined