In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [2]:
train_url = 'test.csv'
test_url = 'train.csv'

In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(train_url,header=None, names = col_names)

df_test = pd.read_csv(test_url, header=None, names = col_names)

print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

  df = pd.read_csv(train_url,header=None, names = col_names)


Dimensions of the Training set: (22544, 42)
Dimensions of the Test set: (125973, 42)


In [4]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
label
21    10694
18     2967
20     1343
15     1176
17     1168
19      890
14      736
16      681
13      519
12      486
11      461
7       249
10      195
6       157
8       131
0       123
3       116
9       106
5       103
4       101
1        87
2        55
Name: count, dtype: int64

Label distribution Test set:
label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64


In [5]:
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'duration' has 3 categories
Feature 'protocol_type' has 64 categories
Feature 'service' has 11 categories
Feature 'dst_host_serror_rate' has 190 categories
Feature 'dst_host_srv_rerror_rate' has 38 categories

Distribution of categories in service:
service
SF      14875
REJ      3850
S0       2013
RSTO      773
RSTR      669
Name: count, dtype: int64


In [6]:
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories


In [7]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,private,REJ,0
0,private,REJ,0
0,ftp_data,SF,12983
0,eco_i,SF,20
0,telnet,RSTO,0


In [8]:
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
print(unique_protocol2)

# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
print(unique_service2)

['Protocol_type_IRC', 'Protocol_type_X11', 'Protocol_type_Z39_50', 'Protocol_type_auth', 'Protocol_type_bgp', 'Protocol_type_courier', 'Protocol_type_csnet_ns', 'Protocol_type_ctf', 'Protocol_type_daytime', 'Protocol_type_discard', 'Protocol_type_domain', 'Protocol_type_domain_u', 'Protocol_type_echo', 'Protocol_type_eco_i', 'Protocol_type_ecr_i', 'Protocol_type_efs', 'Protocol_type_exec', 'Protocol_type_finger', 'Protocol_type_ftp', 'Protocol_type_ftp_data', 'Protocol_type_gopher', 'Protocol_type_hostnames', 'Protocol_type_http', 'Protocol_type_http_443', 'Protocol_type_imap4', 'Protocol_type_iso_tsap', 'Protocol_type_klogin', 'Protocol_type_kshell', 'Protocol_type_ldap', 'Protocol_type_link', 'Protocol_type_login', 'Protocol_type_mtp', 'Protocol_type_name', 'Protocol_type_netbios_dgm', 'Protocol_type_netbios_ns', 'Protocol_type_netbios_ssn', 'Protocol_type_netstat', 'Protocol_type_nnsp', 'Protocol_type_nntp', 'Protocol_type_ntp_u', 'Protocol_type_other', 'Protocol_type_pm_dump', 'Pro

In [10]:
# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2 = [string3 + str(x) for x in unique_flag]
print(unique_flag2)

['flag_0', 'flag_1', 'flag_2', 'flag_5', 'flag_6', 'flag_7', 'flag_8', 'flag_9', 'flag_10', 'flag_11', 'flag_12', 'flag_13', 'flag_14', 'flag_15', 'flag_16', 'flag_17', 'flag_18', 'flag_19', 'flag_20', 'flag_21', 'flag_22', 'flag_23', 'flag_24', 'flag_25', 'flag_26', 'flag_27', 'flag_28', 'flag_29', 'flag_30', 'flag_31', 'flag_32', 'flag_33', 'flag_34', 'flag_35', 'flag_36', 'flag_37', 'flag_38', 'flag_40', 'flag_41', 'flag_42', 'flag_43', 'flag_44', 'flag_45', 'flag_46', 'flag_47', 'flag_48', 'flag_49', 'flag_50', 'flag_51', 'flag_52', 'flag_53', 'flag_54', 'flag_55', 'flag_56', 'flag_59', 'flag_61', 'flag_63', 'flag_64', 'flag_66', 'flag_67', 'flag_69', 'flag_70', 'flag_72', 'flag_74', 'flag_75', 'flag_76', 'flag_77', 'flag_78', 'flag_79', 'flag_80', 'flag_81', 'flag_85', 'flag_86', 'flag_89', 'flag_90', 'flag_91', 'flag_93', 'flag_94', 'flag_95', 'flag_96', 'flag_97', 'flag_99', 'flag_101', 'flag_102', 'flag_103', 'flag_105', 'flag_107', 'flag_109', 'flag_110', 'flag_111', 'flag_112

In [11]:
# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2

In [12]:
#do it for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2

In [13]:
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)

print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

  protocol_type service   flag
0       private     REJ      0
0       private     REJ      0
0      ftp_data      SF  12983
0         eco_i      SF     20
0        telnet    RSTO      0
--------------------
   protocol_type  service  flag
0             45        1     0
0             45        1     0
0             19        9   986
0             13        9    18
0             55        2     0


In [14]:
# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

In [15]:
enc = OneHotEncoder(categories='auto')
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)

In [17]:
# Fit and transform the training data
enc = OneHotEncoder(categories='auto')
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)

# Generate column names from the encoder's categories_
column_names = [f"{col}_{val}" for col, vals in zip(categorical_columns, enc.categories_) for val in vals]

# Create the training dataframe
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(), columns=column_names)

# Fit and transform the test data
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)

# Generate column names from the encoder's categories_
test_column_names = [f"{col}_{val}" for col, vals in zip(categorical_columns, enc.categories_) for val in vals]

# Create the test dataframe
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(), columns=test_column_names)

In [18]:
#Identify unique 'service' values in training and test sets
trainservice = set(df)
testservice = set(df_test)

# Find the differences
difference = list(trainservice - testservice)

# Prepend 'service_' to the differing values
# string = 'service_'
# difference = [string + x for x in difference]

# Add missing columns to the test data and initialize them with 0
for col in difference:
    df_test[col] = 0

df.drop(['flag', 'protocol_type', 'service'], axis=1, inplace=True)
df_test.drop(['flag', 'protocol_type', 'service'], axis=1, inplace=True)

print(df.shape)
print(df_test.shape)
print(df.columns)
print(df_test.columns)

(22544, 39)
(125973, 39)
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell'

In [22]:
columns_to_drop = ['flag', 'protocol_type', 'service']

# Check if the columns exist in the dataframe before dropping
for col in columns_to_drop:
    if col in df.columns:
        df = df.drop(col, axis=1)
    if col in df_test.columns:
        df_test = df_test.drop(col, axis=1)


In [25]:
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,tcp,0,0,0,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0,1.0,1.0,neptune,21
0,tcp,0,0,0,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0,1.0,1.0,neptune,21
0,tcp,0,0,0,0,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0,0.0,0.0,normal,21
0,icmp,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0,0.0,0.0,saint,15
0,tcp,15,0,0,0,0,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0,0.83,0.71,mscan,11


In [44]:
# Identify unique 'service' values in training and test sets
trainservice = set(df)
testservice = set(df_test)

# Find the differences
difference = list(trainservice - testservice)

# Prepend 'service_' to the differing values
# string = 'service_'
# difference = [string + x for x in difference]

# Add missing columns to the test data and initialize them with 0
for col in difference:
    df_test[col] = 0

print(df.shape)
print(df_test.shape)
print(df.columns)
print(df_test.columns)

(22544, 39)
(125973, 39)
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell'

In [45]:
# Identify unique 'service' values in training and test sets
trainservice = set(df)
testservice = set(df_test)

# Find the differences
difference = list(trainservice - testservice)

# Prepend 'service_' to the differing values
# string = 'service_'
# difference = [string + x for x in difference]

# Add missing columns to the test data and initialize them with 0
for col in difference:
    df_test[col] = 0


print(df.shape)
print(df_test.shape)
print(df.columns)
print(df_test.columns)

(22544, 39)
(125973, 39)
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell'

In [47]:
labeldf=df['label']
labeldf_test=df_test['label']

In [48]:
newlabeldf = labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test = labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})


In [49]:
# put the new label column back
df['label'] = newlabeldf
df_test['label'] = newlabeldf_test

In [50]:
# Define labels to be dropped
to_drop_labels = [0, 1, 2, 3, 4]

# Filter out rows with labels to be dropped
filtered_df = df[df['label'].isin(to_drop_labels)]
filtered_df_test = df_test[df_test['label'].isin(to_drop_labels)]

print('Train:')
print('Dimensions of filtered data:', filtered_df.shape)
print()
print('Test:')
print('Dimensions of filtered test data:', filtered_df_test.shape)

Train:
Dimensions of filtered data: (482, 39)

Test:
Dimensions of filtered test data: (125973, 39)


In [52]:
X = filtered_df.drop('label', axis=1)
Y = filtered_df.label

X_test = filtered_df_test.drop('label', axis=1)
Y_test = filtered_df_test.label


In [53]:
colNames=list(X)
colNames_test=list(X_test)

In [57]:
# Convert 'duration', 'dst_host_serror_rate', and 'dst_host_srv_rerror_rate' to numeric
X['duration'] = pd.to_numeric(X['duration'], errors='coerce')
X['dst_host_serror_rate'] = pd.to_numeric(X['dst_host_serror_rate'], errors='coerce')
X['dst_host_srv_rerror_rate'] = pd.to_numeric(X['dst_host_srv_rerror_rate'], errors='coerce')

# Similarly, apply the same conversion to the test data
X_test['duration'] = pd.to_numeric(X_test['duration'], errors='coerce')
X_test['dst_host_serror_rate'] = pd.to_numeric(X_test['dst_host_serror_rate'], errors='coerce')
X_test['dst_host_srv_rerror_rate'] = pd.to_numeric(X_test['dst_host_srv_rerror_rate'], errors='coerce')

# Now, proceed with the scaling step
scaler1 = StandardScaler().fit(X)
X_scaled = scaler1.transform(X)
X_test_scaled = scaler1.transform(X_test)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [56]:
print(X.dtypes)

duration                        object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                      float64
serror_rate                    float64
srv_serror_rate                float64
rerror_rate                    float64
srv_rerror_rate                float64
same_srv_rate            

In [58]:
from sklearn.impute import SimpleImputer

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Fit the imputer to the training data and transform it
X_imputed = imputer.fit_transform(X)

# Apply the same imputer to the test data
X_test_imputed = imputer.transform(X_test)

# Now, proceed with the scaling step
scaler1 = StandardScaler().fit(X_imputed)
X_scaled = scaler1.transform(X_imputed)
X_test_scaled = scaler1.transform(X_test_imputed)




In [59]:
print(X.std(axis=0))

duration                                NaN
src_bytes                      47553.753961
dst_bytes                          0.000000
land                               0.436394
wrong_fragment                     0.243558
urgent                             1.275460
hot                                0.213218
num_failed_logins                  0.402886
logged_in                          8.534452
num_compromised                    0.249221
root_shell                         0.045549
su_attempted                      10.710468
num_root                           0.661119
num_file_creations                 0.210572
num_shells                         0.277335
num_access_files                   0.000000
num_outbound_cmds                  0.090813
is_host_login                      0.064349
is_guest_login                    33.381658
count                             11.311300
srv_count                          0.045549
serror_rate                        0.045549
srv_serror_rate                 

In [61]:
from sklearn.impute import SimpleImputer

# Create an imputer object that replaces NaN values with the mean value of the column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the imputer on your data
imputer.fit(X)

# Transform the data
X = imputer.transform(X)

# Do the same for the test set
imputer.fit(X_test)
X_test = imputer.transform(X_test)




In [62]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=10,n_jobs=2)
rfe = RFE(estimator=clf, n_features_to_select=20, step=1)

rfe.fit(X, Y.astype(int))
X_new=rfe.transform(X)
true=rfe.support_
colindex2=[i for i, x in enumerate(true) if x]
colname2=list(colNames[i] for i in colindex2)
colname2

['duration',
 'wrong_fragment',
 'hot',
 'num_failed_logins',
 'logged_in',
 'root_shell',
 'is_host_login',
 'is_guest_login',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate']

In [63]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Create a SelectKBest instance with mutual information
selector_mi = SelectKBest(mutual_info_regression, k=20)  # Select the top 20 features based on mutual information

# Fit and transform the data for feature selection based on mutual information
X_new_mi = selector_mi.fit_transform(X, Y)
colindex5 = selector_mi.get_support(indices=True)
colname5 = [colNames[i] for i in colindex5]
colname5

['duration',
 'logged_in',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'is_host_login',
 'is_guest_login',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate']

In [65]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Create a Lasso model for feature selection
lasso = Lasso(alpha=0.001)  # You can adjust the alpha parameter for regularization strength

# Use SelectFromModel to select features based on Lasso coefficients
selector_lasso = SelectFromModel(lasso, max_features=20)  # Select only 20 features

# Fit and transform the data for DoS
X_new = selector_lasso.fit_transform(X, Y)
colindex4 = selector_lasso.get_support(indices=True)
colname4 = [colNames[i] for i in colindex4]

# Print the selected column names
print(colname4)


['land', 'urgent', 'logged_in', 'num_compromised', 'su_attempted', 'num_root', 'num_file_creations', 'num_access_files', 'num_outbound_cmds', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']


In [67]:
import pandas as pd

# Create some example DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'X': ['a', 'b', 'c'], 'Y': ['d', 'e', 'f']})

# Get a dictionary of all global variables
global_vars = globals()

# Filter out only the variables that are DataFrames
dataframes_names = [name for name, var in global_vars.items() if isinstance(var, pd.DataFrame)]

# Print the names of the DataFrames
print("DataFrames names:", dataframes_names)


DataFrames names: ['___', 'df', 'df_test', 'df_categorical_values', 'testdf_categorical_values', '_7', 'df_categorical_values_enc', 'testdf_categorical_values_enc', 'df_cat_data', 'testdf_cat_data', '_25', 'filtered_df', 'filtered_df_test', 'df1', 'df2']


In [77]:
from collections import Counter

def common_in_three_or_more(lists):
    # Count occurrences of each number across all lists
    count = Counter()
    for lst in lists:
        count.update(set(lst))

    # Select numbers that are common in at least three lists
    common_numbers = [num for num, freq in count.items() if freq >= 3]

    return common_numbers



In [78]:
l=[]
for i in result:
   l.append(colNames.index(i))

print(l)

[]


In [79]:
# from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


# clf = RandomForestClassifier(n_estimators=10,n_jobs=2)
# rfe = RFE(estimator=clf, n_features_to_select=12, step=1)
# rfe.fit(X, Y.astype(int))
# X_rfe=rfe.transform(X)

colindex=l
colname= result

selected_features = [X[:, index] for index in colindex]
X_rfe = np.column_stack(selected_features)

print(colindex)

ValueError: need at least one array to concatenate

In [86]:
# Ensure colindex is a list of integers
colindex = [31, 24, 28, 32, 23, 25, 21, 19, 30, 34, 33, 35, 22, 35, 27, 8, 29, 36, 26, 20]

# Check if colindex is not empty and all indices are valid
if colindex and all(i >= 0 and i < X.shape[1] for i in colindex):
    selected_features = [X[:, index] for index in colindex]
    X_rfe = np.column_stack(selected_features)
    print(colindex)
else:
    print("colindex is empty or contains invalid indices.")


colindex is empty or contains invalid indices.


In [87]:
print("Shape of X:", X.shape)
print("Max index in colindex:", max(colindex))


Shape of X: (482, 36)
Max index in colindex: 36


In [89]:
X_rfe = np.column_stack(selected_features)
int(X_rfe.shape)


clf_rf=RandomForestClassifier(n_estimators=10,n_jobs=1)
clf_rf.fit(X_rfe, Y.astype(int))



X_test2=X_test[:,colindex]

ValueError: need at least one array to concatenate

In [90]:
Y_pred2=clf_rf.predict(X_test2)
# Create confusion matrix
pd.crosstab(Y_test, Y_pred2, rownames=['Actual attacks'], colnames=['Predicted attacks'])

NameError: name 'clf_rf' is not defined

In [91]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics




from sklearn.metrics import precision_score, recall_score, f1_score

# Compute metrics with appropriate averaging
accuracy = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

precision = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

f1 = cross_val_score(clf_rf, X_test2, Y_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f1.mean(), f1.std() * 2))


NameError: name 'clf_rf' is not defined

In [92]:
clf_rf=RandomForestClassifier(n_estimators=10,n_jobs=1)
clf_rf.fit(X_rfe, Y.astype(int))

NameError: name 'X_rfe' is not defined