In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import pickle

In [2]:
Train_Data = pd.read_csv("/content/drive/MyDrive/KDDTrain+.txt" , sep = "," , encoding = 'utf-8')
Test_Data  = pd.read_csv("/content/drive/MyDrive/KDDTest+.txt" , sep = "," , encoding = 'utf-8')

In [3]:
Columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
            'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
            'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
            'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
            'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
            'dst_host_srv_rerror_rate','attack','level'])

In [4]:
Train_Data.columns = Columns
Test_Data.columns = Columns

In [5]:
# Bỏ cột level vì không cần dùng
Train_Data = Train_Data.drop('level', axis = 1)
Test_Data = Test_Data.drop('level', axis = 1)

In [6]:
# Ánh xạ sub-class sang class ở dạng số
attack_mapping = {
    # normal
    'normal': 0,
    # DDos
    'back': 1, 'land': 1, 'neptune': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,
    'apache2': 1, 'udpstorm': 1, 'processtable': 1, 'mailbomb': 1, 'worm': 1,
    # Probe
    'satan': 2, 'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'mscan': 2, 'saint': 2,
    # U2R
    'buffer_overflow': 3, 'loadmodule': 3, 'perl': 3, 'rootkit': 3,
    'sqlattack': 3, 'xterm': 3, 'ps': 3,
    # R2L
    'ftp_write': 4, 'guess_passwd': 4, 'imap': 4, 'multihop': 4, 'phf': 4, 'spy': 4,
    'warezclient': 4, 'warezmaster': 4, 'snmpgetattack': 4, 'named': 4, 'xlock': 4,
    'xsnoop': 4, 'sendmail': 4, 'httptunnel': 4, 'snmpguess': 4
}

Train_Data['attack'] = Train_Data['attack'].map(attack_mapping)
Test_Data['attack'] = Test_Data['attack'].map(attack_mapping)

In [7]:
Train_Data[:5]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1


In [8]:
# Kiểu dữ liệu của column 'attack' ở dạng int64, chứng tỏ tất cả đã được convert thành công
Train_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125972 non-null  int64  
 1   protocol_type                125972 non-null  object 
 2   service                      125972 non-null  object 
 3   flag                         125972 non-null  object 
 4   src_bytes                    125972 non-null  int64  
 5   dst_bytes                    125972 non-null  int64  
 6   land                         125972 non-null  int64  
 7   wrong_fragment               125972 non-null  int64  
 8   urgent                       125972 non-null  int64  
 9   hot                          125972 non-null  int64  
 10  num_failed_logins            125972 non-null  int64  
 11  logged_in                    125972 non-null  int64  
 12  num_compromised              125972 non-null  int64  
 13 

In [9]:
Test_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22543 entries, 0 to 22542
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     22543 non-null  int64  
 1   protocol_type                22543 non-null  object 
 2   service                      22543 non-null  object 
 3   flag                         22543 non-null  object 
 4   src_bytes                    22543 non-null  int64  
 5   dst_bytes                    22543 non-null  int64  
 6   land                         22543 non-null  int64  
 7   wrong_fragment               22543 non-null  int64  
 8   urgent                       22543 non-null  int64  
 9   hot                          22543 non-null  int64  
 10  num_failed_logins            22543 non-null  int64  
 11  logged_in                    22543 non-null  int64  
 12  num_compromised              22543 non-null  int64  
 13  root_shell      

In [10]:
Train_Data.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
count,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,...,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0
mean,287.146929,45567.1,19779.27,0.000198,0.022688,0.000111,0.204411,0.001222,0.395739,0.279253,...,115.653725,0.521244,0.082952,0.148379,0.032543,0.284455,0.278487,0.118832,0.120241,0.582471
std,2604.525522,5870354.0,4021285.0,0.014086,0.253531,0.014366,2.149977,0.045239,0.489011,23.942137,...,110.702886,0.44895,0.188922,0.308998,0.112564,0.444785,0.44567,0.306559,0.31946,0.724927
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,1.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [11]:
# Kiem tra du lieu bi thieu
Train_Data.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [12]:
Test_Data.isnull().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [13]:
# Thực hiện encode các giá trị trong các cột "columns_to_encode" vì ở dạng text
# Danh sách các cột để mã hóa
columns_to_encode = ['protocol_type', 'service', 'flag']

# Đối với mỗi cột, kết hợp dữ liệu từ cả tập huấn luyện và tập kiểm thử để khớp LabelEncoder
for column in columns_to_encode:
    le = LabelEncoder()

    # Kết hợp dữ liệu từ cả hai tập
    combined_data = pd.concat([Train_Data[column], Test_Data[column]])

    # Khớp LabelEncoder trên dữ liệu kết hợp
    le.fit(combined_data)

    # Biến đổi dữ liệu trong tập huấn luyện và tập kiểm thử
    Train_Data[column] = le.transform(Train_Data[column])
    Test_Data[column] = le.transform(Test_Data[column])


In [14]:
# Dữ liệu sau khi encode
Train_Data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,2,44,9,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
1,0,1,49,5,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
2,0,1,24,9,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
3,0,1,24,9,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,1,49,1,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1


In [15]:
# Min-max normalizatin

# Khởi tạo MinMaxScaler
scaler = MinMaxScaler()

# Chọn tất cả các cột dữ liệu trừ cột nhãn cuối cùng
features = Train_Data.columns[:-1]  # Lấy tất cả các cột trừ cột cuối cùng

# Khớp MinMaxScaler chỉ trên các cột đặc trưng của tập huấn luyện
scaler.fit(Train_Data[features])

# Biến đổi cả tập huấn luyện và tập kiểm thử trên các cột đã chọn
Train_Data[features] = scaler.transform(Train_Data[features])
Test_Data[features] = scaler.transform(Test_Data[features])

In [16]:
Train_Data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0.0,1.0,0.637681,0.9,1.057999e-07,0.0,0.0,0.0,0.0,0.0,...,0.003922,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.5,0.710145,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101961,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
2,0.0,0.5,0.347826,0.9,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
3,0.0,0.5,0.347826,0.9,1.442067e-07,3.20626e-07,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.5,0.710145,0.1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07451,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1


In [17]:
# Thống kê số lượng mẫu mỗi lớp
Train_Data['attack'].value_counts()

attack
0    67342
1    45927
2    11656
4      995
3       52
Name: count, dtype: int64

In [18]:
Test_Data['attack'].value_counts()

attack
0    9711
1    7459
4    2885
2    2421
3      67
Name: count, dtype: int64

In [19]:
# Chia thành features và labels
X_train = Train_Data.drop('attack', axis = 1)
X_test = Test_Data.drop('attack', axis = 1)
y_train = Train_Data['attack']
y_test = Test_Data['attack']

In [20]:
# Lưu dữ liệu vào file Pickle
with open('nsl_kdd_preprocessed.pkl', 'wb') as file:
    pickle.dump((X_train, X_test, y_train, y_test), file)