In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

plt.rcParams["figure.figsize"] = (20,12)
plt.rcParams['axes.grid'] = True
plt.style.use('fivethirtyeight')
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['lines.linewidth'] = 3


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator
from sklearn.feature_selection import VarianceThreshold, SelectKBest

In [2]:
path = os.path.join('dataset', 'train_students.csv')

In [3]:
data = pd.read_csv(path)

In [4]:
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,1,tcp,smtp,SF,2599,293,0,0,0,0,...,74,0.29,0.14,0.00,0.00,0.0,0.0,0.69,0.0,Dos
1,0,tcp,login,S0,0,0,0,0,0,0,...,18,0.07,0.07,0.00,0.00,1.0,1.0,0.00,0.0,Dos
2,0,tcp,http,SF,280,13254,0,0,0,0,...,254,1.00,0.00,0.08,0.04,0.0,0.0,0.00,0.0,normal
3,0,tcp,http,SF,230,1582,0,0,0,0,...,255,1.00,0.00,0.05,0.02,0.0,0.0,0.00,0.0,normal
4,0,tcp,http,SF,218,483,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103956,0,tcp,echo,RSTO,0,0,0,0,0,0,...,4,0.02,0.09,0.00,0.00,0.0,0.0,1.00,1.0,Dos
103957,0,tcp,telnet,S0,0,0,0,0,0,0,...,4,0.02,0.07,0.01,0.00,1.0,1.0,0.00,0.0,Dos
103958,0,tcp,smtp,SF,0,83,0,0,0,0,...,124,0.49,0.03,0.00,0.00,0.0,0.0,0.00,0.0,normal
103959,50,tcp,telnet,SF,226,2615,0,0,0,1,...,2,0.01,0.02,0.00,0.00,0.0,0.0,0.00,0.0,R2L


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103961 entries, 0 to 103960
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     103961 non-null  int64  
 1   protocol_type                103961 non-null  object 
 2   service                      103961 non-null  object 
 3   flag                         103961 non-null  object 
 4   src_bytes                    103961 non-null  int64  
 5   dst_bytes                    103961 non-null  int64  
 6   land                         103961 non-null  int64  
 7   wrong_fragment               103961 non-null  int64  
 8   urgent                       103961 non-null  int64  
 9   hot                          103961 non-null  int64  
 10  num_failed_logins            103961 non-null  int64  
 11  logged_in                    103961 non-null  int64  
 12  num_compromised              103961 non-null  int64  
 13 

In [6]:
# dict with classes map
classes_map = {
    'normal' : 0,
    'Dos' : 1,
    'R2L' : 2,
    'U2R' : 3,
    'Probe' : 4
} 

In [7]:
data.attack_type = data.attack_type.map(classes_map)

In [8]:
def RemovingDuplicates(dataset):
    """ Function to remove duplicated rows given a dataset """
    try:
        dataset.drop_duplicates(inplace=True)
    except:
        print("Erro no Removing Duplicates!")

In [9]:
"""Get the columns name that are an object type"""
categorical_columns = data.select_dtypes(include='object').columns.tolist()
categorical_columns

['protocol_type', 'service', 'flag']

In [10]:
data.select_dtypes(include='object').columns.tolist()

['protocol_type', 'service', 'flag']

In [11]:
list(data.dtypes[data.dtypes == object].reset_index().index)

[0, 1, 2]

In [12]:
for i in categorical_columns:
    print(data[i].unique())

['tcp' 'udp' 'icmp']
['smtp' 'login' 'http' 'private' 'telnet' 'other' 'ecr_i' 'iso_tsap'
 'csnet_ns' 'ftp_data' 'imap4' 'ftp' 'time' 'daytime' 'pop_3' 'kshell'
 'uucp' 'name' 'domain_u' 'echo' 'whois' 'eco_i' 'rje' 'finger' 'Z39_50'
 'courier' 'discard' 'domain' 'auth' 'supdup' 'netbios_ns' 'IRC' 'netstat'
 'nnsp' 'netbios_dgm' 'link' 'hostnames' 'klogin' 'http_443' 'systat'
 'uucp_path' 'netbios_ssn' 'exec' 'sunrpc' 'bgp' 'nntp' 'mtp' 'ssh'
 'ntp_u' 'ldap' 'urp_i' 'gopher' 'ctf' 'efs' 'sql_net' 'tim_i' 'vmnet'
 'urh_i' 'shell' 'remote_job' 'X11' 'tftp_u' 'pop_2' 'red_i' 'printer'
 'pm_dump' 'aol' 'http_8001' 'harvest']
['SF' 'S0' 'RSTO' 'REJ' 'SH' 'RSTR' 'S2' 'S3' 'S1' 'RSTOS0' 'OTH']


In [13]:
aux = data["service"]
le = preprocessing.LabelEncoder()
le.fit(aux)
le.classes_
data["service"] = le.transform(aux)

In [14]:
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,1,tcp,53,SF,2599,293,0,0,0,0,...,74,0.29,0.14,0.00,0.00,0.0,0.0,0.69,0.0,1
1,0,tcp,33,S0,0,0,0,0,0,0,...,18,0.07,0.07,0.00,0.00,1.0,1.0,0.00,0.0,1
2,0,tcp,24,SF,280,13254,0,0,0,0,...,254,1.00,0.00,0.08,0.04,0.0,0.0,0.00,0.0,0
3,0,tcp,24,SF,230,1582,0,0,0,0,...,255,1.00,0.00,0.05,0.02,0.0,0.0,0.00,0.0,0
4,0,tcp,24,SF,218,483,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103956,0,tcp,13,RSTO,0,0,0,0,0,0,...,4,0.02,0.09,0.00,0.00,0.0,0.0,1.00,1.0,1
103957,0,tcp,59,S0,0,0,0,0,0,0,...,4,0.02,0.07,0.01,0.00,1.0,1.0,0.00,0.0,1
103958,0,tcp,53,SF,0,83,0,0,0,0,...,124,0.49,0.03,0.00,0.00,0.0,0.0,0.00,0.0,0
103959,50,tcp,59,SF,226,2615,0,0,0,1,...,2,0.01,0.02,0.00,0.00,0.0,0.0,0.00,0.0,2


In [15]:
column_service = aux

In [16]:
class DropUniqueColumns(BaseEstimator):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
    def fit(self, X, y=None):
        # drop columns with just one value
        print(type(X))
        X = pd.DataFrame(X)
        self.cols_to_drop = X.columns[X.nunique() == 1]
        # correlation = X.corr(numeric_only=True)
        # correlated_features = correlation.abs() > self.threshold
        return self

    def transform(self, X, y=None):
        X = pd.DataFrame(X)
        preprocessed = X.drop(self.cols_to_drop, axis=1)
        return preprocessed

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
X = data.copy()
y = X.pop('attack_type')

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [20]:
numeric_cols = [col for col in x_train if col not in categorical_columns]

In [21]:
class CustomEncoder(BaseEstimator):
    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y=None):
        self.encoder = LabelEncoder()
        self.encoders = {}
        print('fit start')
        for col in X:
            print(col)
            self.encoders[col] = LabelEncoder().fit(X[col])
        return self

    def transform(self, X, y=None):
        print('transform')
        X_encoded = X.copy()
        print('start')
        for col in X:
            print(col)
            X_encoded[col] = self.encoders[col].transform(X[col])

        return pd.DataFrame(X_encoded, columns=X.columns)


In [22]:
numerical_columns = [col for col in X if col not in categorical_columns]
numerical_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [23]:
numerical_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [41]:
from sklearn.feature_selection import SelectKBest, f_classif

In [52]:
pipeline = Pipeline([
    ('categorical_cols', ColumnTransformer(
        transformers=[
            ('cat', CustomEncoder(), categorical_columns),
            ('drop_cols', DropUniqueColumns(), numerical_columns)
        ]))
    ,
    # ('feature_selection', SelectKBest(f_classif, k=30)), 
    ('variance_drop', VarianceThreshold(0)),
    ('std_scaler', StandardScaler())
])

In [53]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', CustomEncoder(), categorical_columns)
    ])

In [54]:
results = pipeline.fit_transform(x_train, y_train)

fit start
protocol_type
service
flag
transform
start
protocol_type
service
flag
<class 'pandas.core.frame.DataFrame'>


In [55]:
results.shape

(77970, 40)