In [1]:
import numpy as np
import pandas as pd

import pickle 
from prettytable import PrettyTable

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

%matplotlib inline

In [2]:
# Load the EDA-stage train/test splits and the parameter dictionary saved earlier
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_alldata_EDA.csv', './test_alldata_EDA.csv')
)
# NOTE(review): this reads './final_ipynb', while the cells that write artifacts
# use the './final-ipynb/' directory - confirm this path is the intended one.
# pickle.load can execute arbitrary code; only load files produced by this project.
with open('./final_ipynb', mode='rb') as file:
    saved_dict = pickle.load(file)

In [3]:
def multi_corr(col1, col2="label", df=None):
    """Print the correlation of ``col1`` with ``col2``, raw and after log1p.

    Parameters
    ----------
    col1 : str
        Column to evaluate; the log1p transform is applied to this column only.
    col2 : str, default "label"
        Column to correlate against.
    df : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level ``train``
        is looked up at call time, so the function follows ``train`` even if
        that name is rebound after this cell has run.

    Returns
    -------
    None
        The two correlations are printed, not returned.
    """
    if df is None:
        df = train

    # Named raw_corr so the sibling function ``corr`` is not shadowed locally.
    raw_corr = df[[col1, col2]].corr().iloc[0, 1]
    log_corr = np.log1p(df[col1]).corr(df[col2])

    print("Correlation : {}\nlog_Correlation: {}".format(raw_corr, log_corr))

In [4]:
def corr(col1, col2="label", df=None):
    """Return the correlation between two columns of ``df``.

    Parameters
    ----------
    col1, col2 : str
        Column names; ``col2`` defaults to "label".
    df : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level ``train``
        is looked up at call time rather than frozen when this cell is run.

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 matrix from ``DataFrame.corr()``
        (pandas' default correlation method).
    """
    if df is None:
        df = train

    return df[[col1, col2]].corr().iloc[0, 1]

In [5]:
# Find the features that are highly correlated with one another.
# Only numeric/boolean columns are correlated; selecting them explicitly keeps this
# working on pandas versions where DataFrame.corr no longer skips text columns.
corr_matrix = train.select_dtypes(include=["number", "bool"]).corr().abs()

# Keep the strict upper triangle only: the matrix is symmetric, so the lower half
# mirrors it and each pair needs to be inspected once.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Collect the columns whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
print(to_drop)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']


In [6]:
# Record the highly correlated columns so the same drop can be repeated on other data
saved_dict['corr_col'] = to_drop

In [7]:
# Remove the highly correlated features from the training frame (in place).
# Note: not idempotent - re-running this cell after the columns are gone raises KeyError.
train.drop(labels=to_drop, axis=1, inplace=True)

In [8]:
# Columns left in train after removing the highly correlated features
train.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'service', 'sload', 'dload', 'spkts', 'swin',
       'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
       'sjit', 'djit', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'attack_cat', 'label'],
      dtype='object')

In [9]:
# Compare dimensions: only train has been reduced so far, test is still as loaded
train.shape, test.shape

((1778032, 41), (762015, 49))

In [10]:
# Drop the remaining attributes that are not important or not available in our dataset.
# attack_cat is dropped as well because this is a binary classification task.
train.drop(
    columns=[
        'srcip', 'sport', 'dstip', 'dsport', 'attack_cat',
        'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
        'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm', 'ct_dst_sport_ltm',
    ],
    inplace=True,
)

In [11]:
# Record the manually removed columns so the same drop can be repeated on other data.
# NOTE(review): this list duplicates the literal used in the drop cell; keep both in sync.
saved_dict['to_drop'] = [
    'srcip', 'sport', 'dstip', 'dsport', 'attack_cat',
    'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd',
    'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm', 'ct_dst_sport_ltm',
]

In [12]:
# Dimensions after the manual column drop (test is still untouched)
train.shape, test.shape

((1778032, 28), (762015, 49))

In [13]:
# Pick the columns with more than 150 distinct values and report, for each one,
# its correlation with the label both raw and after log1p.
col_unique_values = train.nunique()
col = col_unique_values.loc[col_unique_values.gt(150)].index

for column in col:
    print("{:-^30}".format(column))
    multi_corr(column)

-------------dur--------------
Correlation : 0.0019274028701131475
log_Correlation: -0.03254413756460623
------------sbytes------------
Correlation : 0.010344749695229565
log_Correlation: -0.35616315558984374
------------dbytes------------
Correlation : -0.07641408324436148
log_Correlation: -0.5193868283741477
------------sload-------------
Correlation : 0.19211948100086756
log_Correlation: 0.3474660145034949
------------dload-------------
Correlation : -0.21978094390126515
log_Correlation: -0.6033545881626365
------------spkts-------------
Correlation : -0.12200425437154418
log_Correlation: -0.3163533826967563
------------stcpb-------------
Correlation : -0.23365153315010911
log_Correlation: -0.3135563222142905
------------dtcpb-------------
Correlation : -0.23346071773809843
log_Correlation: -0.3134006479812102
-----------smeansz------------
Correlation : -0.06517990378993671
log_Correlation: -0.15111450989648337
-----------dmeansz------------
Correlation : -0.27230605607442226
log_C

In [14]:
# Columns that will be replaced by their log1p transform
log1p_col = [
    'dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts',
    'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit',
]

In [15]:
# Record which columns get the log1p transform so it can be repeated on other data
saved_dict['log1p_col'] = log1p_col

In [16]:
# Most frequent value of each feature (first row of DataFrame.mode()), kept to fill
# null values in the test data later. It is computed before the log1p step, so the
# keys are the original column names, not the '*_log1p' ones.
mode_dict = train.mode().iloc[0].to_dict()

In [17]:
def log1p_transform(col, df=None):
    """Replace column ``col`` of ``df`` with its log1p transform, in place.

    A new column named ``col + '_log1p'`` is appended to ``df`` and the
    original column is removed. ``df`` itself is mutated; nothing is returned.

    Parameters
    ----------
    col : str
        Name of the column to transform.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``train`` is looked
        up at call time rather than frozen when this cell is run.

    Raises
    ------
    KeyError
        If ``col`` is not in ``df`` (for example, when called twice for the same column).
    """
    if df is None:
        df = train

    new_col = col + '_log1p'
    df[new_col] = np.log1p(df[col])
    df.drop(col, axis=1, inplace=True)

In [18]:
# Apply the log1p replacement to every selected column of train (train is modified in place)
for col in log1p_col:
    log1p_transform(col, train)

In [19]:
# Column count is unchanged: each transformed column replaced its original
train.shape

(1778032, 28)

In [20]:
# Columns after the transform: the '*_log1p' columns are appended at the end
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'label', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p'],
      dtype='object')

In [21]:
# Dimensions after the log1p step (test is still untouched)
train.shape, test.shape

((1778032, 28), (762015, 49))

In [22]:
# Split each dataset into features (x) and target (y)
x_train = train.drop('label', axis=1)
y_train = train['label']
x_test = test.drop('label', axis=1)
y_test = test['label']

In [23]:
# Sanity check of the split: x has one column fewer than its source frame, y is 1-D
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(1778032, 27) (1778032,)
(762015, 48) (762015,)


In [24]:
# Store the feature/target pairs.
# `with` closes each file as soon as the dump finishes; passing a bare open(...) to
# pickle.dump leaves the handle open until garbage collection.
# Note: final_train.pkl is written again by the last cell of the notebook, after
# scaling and one-hot encoding.
with open('./final-ipynb/final_train.pkl', 'wb') as file:
    pickle.dump((x_train, y_train), file)
with open('./final-ipynb/final_test.pkl', 'wb') as file:
    pickle.dump((x_test, y_test), file)

In [25]:
# Split the feature names into categorical and numeric groups
cat_col = ['proto', 'service', 'state']
# Follow the DataFrame's own column order. A set difference yields an arbitrary order
# that can differ between kernel sessions, which makes the saved column list (and the
# column order the scaler is fitted on) non-reproducible.
num_col = [name for name in x_train.columns if name not in cat_col]

In [26]:
# Keep both column groups for later use when cleaning the test data
saved_dict.update({'cat_col': cat_col, 'num_col': num_col})

In [27]:
# Preview the features before standardisation
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p
0,udp,INT,254,0,,0,0,0,1421930643,33.479,...,0.0,9.2756,0.0,1.609438,0.0,0.0,3.828641,0.0,3.878042,0.0
1,udp,INT,60,0,dns,0,0,0,1424246229,0.008,...,0.0,18.698312,0.0,1.098612,0.0,0.0,4.890349,0.0,0.0,0.0
2,tcp,FIN,31,29,,255,0,0,1421948071,0.372205,...,10.048583,14.105347,16.314201,3.713572,20.196135,21.733479,4.174387,6.313548,0.0,3.01207
3,tcp,FIN,31,29,ftp,255,0,0,1421971944,16.14474,...,8.227108,10.258074,10.501435,3.970292,21.803017,20.49442,4.043051,4.248495,7.264606,3.984562
4,tcp,FIN,31,29,,255,0,0,1421963050,1.2188,...,7.405496,13.339317,13.412088,2.833213,20.673269,21.855078,4.574711,4.521789,4.309533,1.138118


In [28]:
# Standardise the numeric features; the scaler is fitted on the training data only
scaler = StandardScaler().fit(x_train[num_col])
x_train[num_col] = scaler.transform(x_train[num_col])

In [29]:
# Preview the features after standardisation (categorical columns are unchanged)
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,dbytes_log1p,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p
0,udp,INT,2.561444,-0.71776,,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,...,-1.703404,-1.383776,-1.879995,-0.704801,-1.190007,-1.189697,-1.137341,-1.850553,0.20427,-0.899657
1,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,...,-1.703404,1.614925,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657
2,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,...,0.911063,0.153252,0.988351,0.843701,0.741629,0.888925,-0.582651,0.881301,-0.995343,0.170283
3,tcp,FIN,-0.426051,-0.041365,ftp,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,...,0.437147,-1.071111,-0.033644,1.03263,0.895318,0.77042,-0.793357,-0.012242,1.251851,0.51573
4,tcp,FIN,-0.426051,-0.041365,,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,...,0.223377,-0.090531,0.478104,0.195816,0.787264,0.900555,0.059601,0.106012,0.337745,-0.495378


In [30]:
# One one-hot encoder per categorical feature, each fitted on the training column
# reshaped to a single-column 2-D array.
# NOTE(review): the encoders use default settings, under which transforming a category
# not seen during fit raises an error - confirm other data cannot contain new values.
service_, proto_, state_ = OneHotEncoder(), OneHotEncoder(), OneHotEncoder()
ohe_service = service_.fit(x_train['service'].to_numpy().reshape(-1, 1))
ohe_proto = proto_.fit(x_train['proto'].to_numpy().reshape(-1, 1))
ohe_state = state_.fit(x_train['state'].to_numpy().reshape(-1, 1))

In [31]:
# One-hot encode each categorical column and replace the original with the encoded columns
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    x = ohe.transform(x_train[col].values.reshape(-1, 1))
    tmp_df = pd.DataFrame(
        x.toarray(),  # plain ndarray rather than the np.matrix returned by todense()
        # format() also copes with non-string categories, where `col + '_' + i` would fail
        columns=['{}_{}'.format(col, category) for category in ohe.categories_[0]],
        # Reuse x_train's index so concat pairs rows correctly even when the index
        # is not the default 0..n-1 range
        index=x_train.index,
    )
    x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [32]:
# Preview the features after one-hot encoding: encoded columns are appended at the end
x_train.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,2.561444,-0.71776,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Path prefix for the artifacts pickled in the following cells.
# NOTE(review): the directory must already exist - open() does not create it.
file_path = './final-ipynb/'

In [34]:
# Persist the fitted preprocessing objects. `with` closes each file right after the
# dump; a bare open(...) passed to pickle.dump is never closed explicitly.
with open(file_path+'scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)  # Standard scaler
with open(file_path+'saved_dict.pkl', 'wb') as file:
    pickle.dump(saved_dict, file)  # Dictionary with important parameters
with open(file_path+'mode_dict.pkl', 'wb') as file:
    pickle.dump(mode_dict, file)  # Dictionary with most frequent values of columns

In [35]:
# Save the fitted one-hot encoders of the categorical attributes.
# `with` closes each file right after the dump instead of leaving the handle open.
with open(file_path+'ohe_proto.pkl', 'wb') as file:
    pickle.dump(ohe_proto, file)
with open(file_path+'ohe_service.pkl', 'wb') as file:
    pickle.dump(ohe_service, file)
with open(file_path+'ohe_state.pkl', 'wb') as file:
    pickle.dump(ohe_state, file)

In [36]:
# Save the fully preprocessed training data (scaled and one-hot encoded).
# This overwrites the final_train.pkl written earlier, before scaling and encoding.
# `with` closes the file right after the dump instead of leaving the handle open.
with open(file_path+'final_train.pkl', 'wb') as file:
    pickle.dump((x_train, y_train), file)