# Tratamento de Dados
  * Limpeza de Dados: Remoção de entradas duplicadas ou irrelevantes.
  * Normalização: Escalonamento dos dados para que todas as variáveis tenham a mesma importância. Isso pode ser feito usando técnicas como Min-Max Scaling ou Z-score normalization.
  * Transformação de Variáveis Categóricas: Conversão de variáveis categóricas em numéricas, utilizando técnicas como one-hot encoding.
  * Redução de Dimensionalidade: Aplicação de técnicas como PCA (Análise de Componentes Principais) para reduzir o número de características, mantendo a maior quantidade de informação possível.

In [None]:
# importar bibliotecas necessárias
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
# pip install pandas numpy sklearn
import gc

#%pip install --upgrade scikit-learn xgboost

In [None]:
csv_files = ['Wednesday-workingHours.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv',
             'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
             'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv',
             "Friday-WorkingHours-Morning.pcap_ISCX.csv", "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
            "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"]

# Create an empty dataframe to store the combined data
dados = pd.DataFrame()

# Loop through each CSV file and append its contents to the combined dataframe
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    dados = pd.concat([dados, df])

In [None]:
dados

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.000000,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.000000,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225740,61374,61,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225741,61378,72,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225742,61375,75,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225743,61323,48,2,0,12,0,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [None]:
dados.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [None]:
dados.shape

(2830743, 79)

1. # Tratamento de Valores Ausentes

In [None]:
#1. Limpeza de Dados (Remoção de Entradas Duplicadas ou Irrelevantes)
# Remover entradas duplicadas
orig_dados = dados
oring_label = dados[" Label"]
dados = dados.drop_duplicates()
# Remover linhas com valores nulos
dados = dados.dropna()
dados.reset_index(drop=True, inplace=True)

In [None]:
dados.shape

(2522009, 79)

In [None]:
dados[" Label"].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
BENIGN,2096134
DoS Hulk,172846
DDoS,128016
PortScan,90819
DoS GoldenEye,10286
FTP-Patator,5933
DoS slowloris,5385
DoS Slowhttptest,5228
SSH-Patator,3219
Bot,1953


2. # Detecção e Tratamento de Outliers
com z-score

In [None]:
#features que podem conter outlines
features_with_outlines = [
    'Flow Bytes/s', ' Flow Packets/s', 'Fwd Packets/s', ' Bwd Packets/s',
    ' Flow Duration', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
    'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
    'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
    'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean', ' Bwd Packet Length Std',
    ' Average Packet Size', ' Packet Length Mean', ' Packet Length Std',
    ' Packet Length Variance', 'Active Mean', ' Active Std', ' Active Max', ' Active Min',
    'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min'
]


In [None]:
import pandas as pd
import numpy as np

# Função para remover outliers usando Z-Score
def remove_outliers_zscore(df, column, threshold=4):
    mean = df[column].mean()
    std_dev = df[column].std()
    z_scores = (df[column] - mean) / std_dev
    return df[np.abs(z_scores) < threshold]

# Função para aplicar remoção de outliers condicionalmente
def apply_outlier_removal_conditional(df, features_with_outlines, min_class_size=90000):
    # Separar o DataFrame pelas classes
    class_groups = df.groupby(' Label')

    df_cleaned = pd.DataFrame()  # DataFrame para armazenar os dados limpos
    threshold = 3
    for class_label, group in class_groups:
        old_len = len(group)
        old_group = group.copy()
        if old_len > min_class_size:  # Usar o parâmetro correto
            print(f"Classe '{class_label}' possui {len(group)} amostras. Removendo outliers...")
            if old_len > 200.000:
              threshold = 1
            for feature in features_with_outlines:
                group = remove_outliers_zscore(group, feature, threshold)
                new_len = len(group)
                if new_len == 0:
                  '''or  new_len < len(old_group)/2.3 :'''
                  group = old_group
                  new_len = len(group)
                print(
                    f"    Feature: {feature}, Tamanho antes: {old_len}, Tamanho depois: {new_len}, "
                    f"Removidos: {old_len - new_len}"
                )

        # Adicionar os dados limpos ao DataFrame final
        df_cleaned = pd.concat([df_cleaned, group], ignore_index=True)

    return df_cleaned

# Aplicar a remoção de outliers
dados_without_outlines = apply_outlier_removal_conditional(dados, features_with_outlines)


Classe 'BENIGN' possui 2096134 amostras. Removendo outliers...
    Feature: Flow Bytes/s, Tamanho antes: 2096134, Tamanho depois: 2096134, Removidos: 0
    Feature:  Flow Packets/s, Tamanho antes: 2096134, Tamanho depois: 2096134, Removidos: 0
    Feature: Fwd Packets/s, Tamanho antes: 2096134, Tamanho depois: 1989091, Removidos: 107043
    Feature:  Bwd Packets/s, Tamanho antes: 2096134, Tamanho depois: 1887418, Removidos: 208716
    Feature:  Flow Duration, Tamanho antes: 2096134, Tamanho depois: 1647735, Removidos: 448399
    Feature:  Flow IAT Mean, Tamanho antes: 2096134, Tamanho depois: 1566930, Removidos: 529204
    Feature:  Flow IAT Std, Tamanho antes: 2096134, Tamanho depois: 1410103, Removidos: 686031
    Feature:  Flow IAT Max, Tamanho antes: 2096134, Tamanho depois: 1355783, Removidos: 740351
    Feature:  Flow IAT Min, Tamanho antes: 2096134, Tamanho depois: 1263438, Removidos: 832696
    Feature: Fwd IAT Total, Tamanho antes: 2096134, Tamanho depois: 1189386, Removidos: 

In [None]:
dados_without_outlines.shape

(1998720, 79)

In [None]:
dados_without_outlines[" Label"].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
BENIGN,1737495
PortScan,90819
DDoS,84361
DoS Hulk,51851
DoS GoldenEye,10286
FTP-Patator,5933
DoS slowloris,5385
DoS Slowhttptest,5228
SSH-Patator,3219
Bot,1953


In [None]:
dados_without_outlines

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.000000,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.000000,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998715,80,5029253,9,7,1675,5647,559,0,186.111111,228.375483,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Web Attack � XSS
1998716,80,5013370,7,6,740,7037,382,0,105.714286,180.674398,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Web Attack � XSS
1998717,80,18,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Web Attack � XSS
1998718,80,13,1,1,0,0,0,0,0.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Web Attack � XSS


In [None]:
dados = dados_without_outlines

## Pegando o Label

In [None]:
# Create a LabelEncoder object
label_encoder = LabelEncoder()
# Fit the encoder on the 'attack' column and transform it
dados[' Label'] = label_encoder.fit_transform(dados[' Label'])
label = dados[' Label']
dados.drop(columns=[" Label"], inplace=True)


In [None]:
label.value_counts()#979689

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,1737495
10,90819
2,84361
4,51851
3,10286
7,5933
6,5385
5,5228
11,3219
1,1953


###Separação das Variáveis Numéricas e Categoricas

In [None]:
# Separar os dados numéricos e categóricos
dados.reset_index(drop=True)
dados_numericos = dados.select_dtypes(include=["int64", "float64"])
dados_categoricos = dados.select_dtypes(include=["object"])

In [None]:
# Resetar os índices antes de concatenar
dados_numericos.reset_index(drop=True, inplace=True)
dados_categoricos.reset_index(drop=True, inplace=True)
label.reset_index(drop=True, inplace=True)
dados_numericos.shape, dados_categoricos.shape, label.shape

((1998720, 78), (1998720, 0), (1998720,))

3. # Transformação e `Normalização`


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Replace infinite values with NaN
dados_numericos.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute NaN values (if any) using a suitable strategy
# For example, using the mean:
imputer = SimpleImputer(strategy='mean')
dados_numericos = pd.DataFrame(imputer.fit_transform(dados_numericos), columns=dados_numericos.columns)

# Now apply MinMaxScaler
scaler = MinMaxScaler()
dados_numericos_normalizados = pd.DataFrame(scaler.fit_transform(dados_numericos), columns=dados_numericos.columns)

In [None]:
dados_numericos_normalizados

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0.001221,3.193417e-04,0.000000,0.000003,0.000002,9.153974e-09,0.000242,0.002581,0.001010,0.000000,...,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.005936,4.100000e-06,0.000046,0.000017,0.000060,4.973659e-07,0.003183,0.000000,0.002632,0.004461,...,0.000019,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.001343,9.233334e-06,0.000041,0.000021,0.001099,4.805836e-06,0.063457,0.000000,0.053023,0.089732,...,0.000014,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.005936,1.268250e-04,0.000073,0.000041,0.001204,1.016091e-05,0.052901,0.000000,0.034180,0.060399,...,0.000047,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.001343,9.208334e-06,0.000036,0.000021,0.001099,4.808888e-06,0.063457,0.000000,0.058914,0.098519,...,0.000009,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998715,0.001221,4.191055e-02,0.000036,0.000024,0.000584,8.615415e-06,0.022522,0.000000,0.031327,0.032396,...,0.000019,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998716,0.001221,4.177819e-02,0.000027,0.000021,0.000258,1.073609e-05,0.015391,0.000000,0.017794,0.025630,...,0.000009,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998717,0.001221,2.583334e-07,0.000000,0.000003,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998718,0.001221,2.166667e-07,0.000000,0.000003,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4. # Codificação de Variáveis Categóricas

In [None]:
for col in dados_categoricos.columns:
    dados_categoricos[col] = dados_categoricos[col].astype(str)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Aplicar Label Encoding em cada coluna categórica
for col in dados_categoricos.columns:
    label_encoder1 = LabelEncoder()
    dados_categoricos[col] = label_encoder1.fit_transform(dados_categoricos[col])


In [None]:
dados_numericos_normalizados.shape, dados_categoricos.shape, label.shape

((1998720, 78), (1998720, 0), (1998720,))

### Recombinar os dados

In [None]:
#Combinar tudo
dados_normalizados = pd.concat([dados_numericos_normalizados, dados_categoricos, label], axis=1)

In [None]:
dados_normalizados

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.001221,3.193417e-04,0.000000,0.000003,0.000002,9.153974e-09,0.000242,0.002581,0.001010,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.005936,4.100000e-06,0.000046,0.000017,0.000060,4.973659e-07,0.003183,0.000000,0.002632,0.004461,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.001343,9.233334e-06,0.000041,0.000021,0.001099,4.805836e-06,0.063457,0.000000,0.053023,0.089732,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.005936,1.268250e-04,0.000073,0.000041,0.001204,1.016091e-05,0.052901,0.000000,0.034180,0.060399,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.001343,9.208334e-06,0.000036,0.000021,0.001099,4.808888e-06,0.063457,0.000000,0.058914,0.098519,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998715,0.001221,4.191055e-02,0.000036,0.000024,0.000584,8.615415e-06,0.022522,0.000000,0.031327,0.032396,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
1998716,0.001221,4.177819e-02,0.000027,0.000021,0.000258,1.073609e-05,0.015391,0.000000,0.017794,0.025630,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
1998717,0.001221,2.583334e-07,0.000000,0.000003,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14
1998718,0.001221,2.166667e-07,0.000000,0.000003,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14


In [None]:
classe_majaritaria = "0"

In [None]:
# prompt: diminua em 60% a ocorrencia da classe 0 e guarde em uma variavel

# Supondo que 'dados_normalizados' e 'label' estejam definidos como no código fornecido.

# Encontre o número de amostras da classe majoritária (0)
num_classe_0 = dados_normalizados[dados_normalizados[' Label'] == 0].shape[0]

# Calcule o número de amostras a serem mantidas (40%)
num_to_keep = int(0.20 * num_classe_0)

# Obtenha um DataFrame com as amostras aleatórias da classe 0
reduced_class_0 = dados_normalizados[dados_normalizados[' Label'] == 0].sample(n=num_to_keep, random_state=42)

# Remova as linhas correspondentes da classe 0 do DataFrame original
dados_normalizados = dados_normalizados.drop(dados_normalizados[dados_normalizados[' Label'] == 0].index)


# Concatene o DataFrame reduzido com o restante do DataFrame
dados_normalizados = pd.concat([dados_normalizados, reduced_class_0], axis=0)

# Embaralhe o DataFrame resultante
dados_normalizados = dados_normalizados.sample(frac=1, random_state=42).reset_index(drop=True)

### Separar dados e label

In [None]:
X = dados_normalizados.drop(columns=[" Label"])
y = dados_normalizados[" Label"]

In [None]:
y.value_counts(), y.shape

( Label
 0     694998
 10     90819
 2      84361
 4      51851
 3      10286
 7       5933
 6       5385
 5       5228
 11      3219
 1       1953
 12      1470
 14       652
 9         36
 13        21
 8         11
 Name: count, dtype: int64,
 (956223,))

# Divisão do dataset
* ### 60%, 40%

In [None]:
# Divida o conjunto inicial de dados em treino (60%) e o restante (40%) para validação e teste
X_treino, X_validacao, y_treino, y_validacao = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Verifique os tamanhos
print("Tamanho do conjunto de treino:", len(X_treino), " | ",(len(X)/100)*60)
print("Tamanho do conjunto de validação:", len(X_validacao), " | ",(len(X)/100)*20)

Tamanho do conjunto de treino: 573733  |  573733.7999999999
Tamanho do conjunto de validação: 382490  |  191244.59999999998


### Melhorando o treinamento

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# ... (your existing code) ...

# Obter a distribuição atual das classes
sampling_strategy ={
         8: 500,
         9: 500,
         13: 500,
         14: 1000
}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_treino, y_treino = smote.fit_resample(X_treino, y_treino)
''' {8.0: 913505, 6.0: 13812, 7.0: 12532, 0.0: 10016, 1.0: 7127, 5.0: 2753,
3.0: 1193, 4.0: 1010, 2.0: 819, 9.0: 93} '''
# ... (rest of your code) ...

' {8.0: 913505, 6.0: 13812, 7.0: 12532, 0.0: 10016, 1.0: 7127, 5.0: 2753,\n3.0: 1193, 4.0: 1010, 2.0: 819, 9.0: 93} '

### Criando um relatorio da quantidade das classes em cada tratamento e divisão

In [None]:
import pandas as pd
classes_originais = label_encoder.classes_
# Suponha que y, y_treino, y_validacao e y_teste são Series pandas
# (já com os valores de classe contados anteriormente)

# Contando as classes em cada conjunto
contagem_y = y.value_counts()
contagem_treino = y_treino.value_counts()
contagem_validacao = y_validacao.value_counts()

# Criando um DataFrame com as contagens
df_comparacao = pd.DataFrame({
    'Classe': contagem_y.index,  # Pegando as classes
    'Classe_Original': [classes_originais[int(i)] for i in contagem_y.index],
    "Sem tratamento": oring_label.value_counts().values,
    'Com tratamento': contagem_y.values,
    "Porcemtagem": [f"{(c/(s/100)):.0f}%" for s, c in zip(oring_label.value_counts().values, contagem_y.values)],
    'Treino': contagem_treino.values,
    'Validacao': contagem_validacao.values
})

In [None]:
df_comparacao

Unnamed: 0,Classe,Classe_Original,Sem tratamento,Com tratamento,Porcemtagem,Treino,Validacao
0,0,BENIGN,2273097,694998,31%,416998,278000
1,10,PortScan,231073,90819,39%,54491,36328
2,2,DDoS,158930,84361,53%,50616,33745
3,4,DoS Hulk,128027,51851,41%,31110,20741
4,3,DoS GoldenEye,10293,10286,100%,6172,4114
5,7,FTP-Patator,7938,5933,75%,3560,2373
6,6,DoS slowloris,5897,5385,91%,3231,2154
7,5,DoS Slowhttptest,5796,5228,90%,3137,2091
8,11,SSH-Patator,5499,3219,59%,1931,1288
9,1,Bot,1966,1953,99%,1172,781


In [None]:
params = {
    "um": {
      "objective": "multi:softmax",  # Classificação multiclasse
      "num_class": len(set(y)),  # Número de classes
      "max_depth": 6,  # Profundidade máxima da árvore
      "eta": 0.1,  # Taxa de aprendizado
      "subsample": 0.8,  # Amostragem de linhas
      "colsample_bytree": 0.8,  # Amostragem de colunas
      "scale_pos_weight": 1,  # Ajustar com base no desbalanceamento
      "eval_metric": "mlogloss",  # Métrica de avaliação
    }
}

In [None]:
from xgboost import XGBClassifier
model = XGBClassifier( objective='multi:softmax',
                      eval_metric='mlogloss',
                      num_class=len(y.unique()),
                      learning_rate=0.1,
                      n_estimators=200,
                       max_depth=6,
                       min_child_weight=3,
                       gamma=0.1,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       scale_pos_weight=1,
                       reg_alpha=0.1,
                       reg_lambda=1,
                       tree_method='hist',
                       random_state=42)

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight='balanced', y=y_treino)
model.fit(X_treino, y_treino, sample_weight=sample_weights)

In [None]:
# Importância das features
importances = model.feature_importances_
features = X.columns

# Mostrar as 10 features mais importantes
feature_importance = sorted(zip(features, importances), key=lambda x: x[1], reverse=True)
for feature, importance in feature_importance[:10]:
    print(f"{feature}: {importance:.4f}")

In [None]:
# Make predictions on the validation set
y_pred_t = model.predict(X_treino)

# Print the results
print(classification_report(y_treino, y_pred))
print(confusion_matrix(y_treino, y_pred))
print("Accuracy:", accuracy_score(y_treino, y_pred))

In [None]:
# Make predictions on the validation set
y_pred = model.predict(X_validacao)

# Print the results
print(classification_report(y_validacao, y_pred))
print(confusion_matrix(y_validacao, y_pred))
print("Accuracy:", accuracy_score(y_validacao, y_pred))