In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
# Display options: never truncate columns, and show up to 100 rows per frame.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

In [8]:
# Directory that holds the raw CSV files
data_path = Path(r'c:\Users\Tplab\OneDrive\CNN tutorial\data\raw')

# Collect every CSV in that directory. sorted() orders the paths
# lexicographically, so "..._10.csv" comes before "..._2.csv".
csv_files = sorted(data_path.glob('*.csv'))

print(f"Tổng số file CSV: {len(csv_files)}")
print("\nDanh sách 5 file đầu tiên:")
for csv_path in csv_files[:5]:
    print(f"  - {csv_path.name}")

Tổng số file CSV: 75

Danh sách 5 file đầu tiên:
  - UNSW_2018_IoT_Botnet_Dataset_1.csv
  - UNSW_2018_IoT_Botnet_Dataset_10.csv
  - UNSW_2018_IoT_Botnet_Dataset_11.csv
  - UNSW_2018_IoT_Botnet_Dataset_12.csv
  - UNSW_2018_IoT_Botnet_Dataset_13.csv


## 1. Đọc file đầu tiên để xem cấu trúc

In [14]:
# Load the first raw file. The file has no header row, so the column labels
# are supplied explicitly; they follow the UNSW Bot-IoT dataset documentation.
column_names = [
    'pkSeqID', 'stime', 'flgs', 'proto', 'saddr',
    'sport', 'daddr', 'dport', 'pkts', 'bytes',
    'state', 'ltime', 'seq', 'dur', 'mean',
    'stddev', 'smac', 'dmac', 'sum', 'min',
    'max', 'soui', 'doui', 'sco', 'dco',
    'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate',
    'srate', 'drate', 'attack', 'category', 'subcategory',
]

df_sample = pd.read_csv(
    csv_files[0],
    header=None,          # first line is data, not column labels
    names=column_names,
    low_memory=False,     # parse the file in one pass instead of in chunks
)

print(f"Shape: {df_sample.shape}")
print(f"\nColumns ({df_sample.shape[1]}):")
print(list(df_sample.columns))

Shape: (1000000, 35)

Columns (35):
['pkSeqID', 'stime', 'flgs', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'pkts', 'bytes', 'state', 'ltime', 'seq', 'dur', 'mean', 'stddev', 'smac', 'dmac', 'sum', 'min', 'max', 'soui', 'doui', 'sco', 'dco', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate', 'attack', 'category', 'subcategory']


In [15]:
# Show the first 5 rows of the loaded sample
df_sample.head()

Unnamed: 0,pkSeqID,stime,flgs,proto,saddr,sport,daddr,dport,pkts,bytes,state,ltime,seq,dur,mean,stddev,smac,dmac,sum,min,max,soui,doui,sco,dco,spkts,dpkts,sbytes,dbytes,rate,srate,drate,attack,category,subcategory
0,1,1526344000.0,e,arp,192.168.100.1,,192.168.100.3,,4,240,CON,1526345000.0,9,1195.996582,6e-06,2e-06,,,1.1e-05,4e-06,7e-06,,,,,2,2,120,120,0.002508,0.000836,0.000836,0,Normal,Normal
1,2,1526344000.0,e,tcp,192.168.100.7,139.0,192.168.100.4,36390.0,10,680,CON,1526346000.0,10,1453.945923,2.8e-05,8e-06,,,0.000138,2.2e-05,4.2e-05,,,,,5,5,350,330,0.00619,0.002751,0.002751,0,Normal,Normal
2,3,1526344000.0,e,udp,192.168.100.149,51838.0,27.124.125.250,123.0,2,180,CON,1526344000.0,11,0.048565,0.048565,0.0,,,0.048565,0.048565,0.048565,,,,,1,1,90,90,20.59096,0.0,0.0,0,Normal,Normal
3,4,1526344000.0,e,arp,192.168.100.4,,192.168.100.7,,10,510,CON,1526346000.0,12,1454.080322,0.000238,2.2e-05,,,0.001189,0.000199,0.000261,,,,,5,5,210,300,0.006189,0.002751,0.002751,0,Normal,Normal
4,5,1526344000.0,e,udp,192.168.100.27,58999.0,192.168.100.1,53.0,4,630,CON,1526345000.0,14,569.93396,0.098505,0.08015,,,0.197011,0.018356,0.178655,,,,,2,2,174,456,0.005264,0.001755,0.001755,0,Normal,Normal


In [11]:
# Print the first 10 raw lines of the first file to inspect its on-disk layout
# (delimiter, quoting, whether a header row is present).
with open(csv_files[0], 'r') as raw_file:
    for line_no in range(1, 11):
        raw_line = raw_file.readline()
        print(f"Line {line_no}: {raw_line.strip()}")

Line 1: 1,1526344121.188091,"e","arp","192.168.100.1","","192.168.100.3","",4,240,"CON",1526345317.184693,9,1195.996582,0.000006,0.000002,"","",0.000011,0.000004,0.000007,"","","","",2,2,120,120,0.002508,0.000836,0.000836,0,"Normal","Normal"
Line 2: 2,1526344223.197482,"e","tcp","192.168.100.7","139","192.168.100.4","36390",10,680,"CON",1526345677.143407,10,1453.945923,0.000028,0.000008,"","",0.000138,0.000022,0.000042,"","","","",5,5,350,330,0.00619,0.002751,0.002751,0,"Normal","Normal"
Line 3: 3,1526344227.029374,"e","udp","192.168.100.149","51838","27.124.125.250","123",2,180,"CON",1526344227.077939,11,0.048565,0.048565,0,"","",0.048565,0.048565,0.048565,"","","","",1,1,90,90,20.59096,0,0,0,"Normal","Normal"
Line 4: 4,1526344228.312317,"e","arp","192.168.100.4","","192.168.100.7","",10,510,"CON",1526345682.392620,12,1454.080322,0.000238,0.000022,"","",0.001189,0.000199,0.000261,"","","","",5,5,210,300,0.006189,0.002751,0.002751,0,"Normal","Normal"
Line 5: 5,1526344302.636337,"e","ud

In [16]:
# Summarise each column's dtype and non-null count (reveals missing values)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 35 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   pkSeqID      1000000 non-null  int64  
 1   stime        1000000 non-null  float64
 2   flgs         1000000 non-null  object 
 3   proto        1000000 non-null  object 
 4   saddr        1000000 non-null  object 
 5   sport        999513 non-null   object 
 6   daddr        1000000 non-null  object 
 7   dport        999513 non-null   object 
 8   pkts         1000000 non-null  int64  
 9   bytes        1000000 non-null  int64  
 10  state        1000000 non-null  object 
 11  ltime        1000000 non-null  float64
 12  seq          1000000 non-null  int64  
 13  dur          1000000 non-null  float64
 14  mean         1000000 non-null  float64
 15  stddev       1000000 non-null  float64
 16  smac         0 non-null        float64
 17  dmac         0 non-null        float64
 18  sum

In [13]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the sample.
# NOTE(review): the stored output of this cell appears to predate the
# header=None load — its column labels are first-row data values and the
# count is 999999 — so re-run it after loading df_sample to refresh it.
df_sample.describe()

Unnamed: 0,1,1526344121.188091,4,240,1526345317.184693,9,1195.996582,0.000006,0.000002,Unnamed: 16,Unnamed: 17,0.000011,0.000004,0.000007,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,2,2.1,120,120.1,0.002508,0.000836,0.000836.1,0
count,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,0.0,0.0,999999.0,999999.0,999999.0,0.0,0.0,0.0,0.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0
mean,500001.0,1526826000.0,8.025129,5156.976,1526826000.0,98125.445121,1.904934,0.035261,0.00309,,,0.313766,0.031952,0.03838,,,,,5.393687,2.631442,3539.55,1617.426,6539.694,232.299948,214.119468,0.998008
std,288674.990255,194061.1,494.351004,478105.3,194057.7,85446.170835,41.975285,0.203744,0.076291,,,19.749911,0.185281,0.243284,,,,,331.263787,212.018296,305485.3,212060.5,33538.04,2971.116545,2145.512436,0.044587
min,2.0,1526344000.0,1.0,60.0,1526344000.0,1.0,0.0,0.0,0.0,,,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,250001.5,1526881000.0,2.0,120.0,1526881000.0,15573.0,0.000207,0.0002,0.0,,,0.000201,0.000195,0.0002,,,,,1.0,1.0,60.0,60.0,24.75186,0.0,0.0,1.0
50%,500001.0,1526899000.0,2.0,120.0,1526899000.0,81392.0,0.012066,0.011737,0.0,,,0.011767,0.011532,0.011764,,,,,1.0,1.0,60.0,60.0,53.68839,0.0,0.0,1.0
75%,750000.5,1526899000.0,2.0,120.0,1526899000.0,164726.0,0.035779,0.035538,0.0,,,0.03555,0.035352,0.035548,,,,,1.0,1.0,60.0,60.0,1508.296,0.0,0.0,1.0
max,1000000.0,1526949000.0,72700.0,73510810.0,1526950000.0,283299.0,1940.859253,4.968881,2.499505,,,1913.193604,4.953428,4.999999,,,,,38081.0,36350.0,37747020.0,36958760.0,1250000.0,1000000.0,500000.0,1.0


## 2. Kiểm tra target label (nếu có)

In [17]:
# Heuristic search for label columns: keep every column whose lower-cased
# name contains one of the keywords below (substring match, so
# 'subcategory' also matches 'category').
label_keywords = ('label', 'attack', 'category', 'class', 'type')

potential_labels = []
for col in df_sample.columns:
    col_lower = col.lower()
    if any(keyword in col_lower for keyword in label_keywords):
        potential_labels.append(col)

print("Các cột có thể là label:")
print(potential_labels)

# Show the class distribution of each candidate; the loop is simply a no-op
# when no candidate was found.
for col in potential_labels:
    print(f"\n{col}:")
    print(df_sample[col].value_counts())

Các cột có thể là label:
['attack', 'category', 'subcategory']

attack:
attack
1    998007
0      1993
Name: count, dtype: int64

category:
category
Reconnaissance    998007
Normal              1993
Name: count, dtype: int64

subcategory:
subcategory
Service_Scan    998007
Normal            1993
Name: count, dtype: int64


## 3. Kiểm tra missing values

In [18]:
# Per-column missing-value count and its share of all rows (in percent)
missing = df_sample.isna().sum()
missing_percent = missing.div(len(df_sample)).mul(100)

# Keep only the columns that actually have gaps, worst offenders first
missing_df = (
    pd.DataFrame({'Missing Count': missing, 'Percentage': missing_percent})
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print("Các cột có missing values:")
print(missing_df)

Các cột có missing values:
       Missing Count  Percentage
smac         1000000    100.0000
dmac         1000000    100.0000
soui         1000000    100.0000
doui         1000000    100.0000
sco          1000000    100.0000
dco          1000000    100.0000
sport            487      0.0487
dport            487      0.0487


## 4. Kiểm tra xem tất cả files có cùng structure không

In [None]:
# Check that the first 5 files share the same column layout.
#
# The raw files have no header row, so they must be read with header=None.
# With the default header inference pandas would promote the first *data* row
# to column labels, and comparing those labels across files would report
# "DIFFERENT" even for files with an identical schema. With header=None the
# labels are the positions 0..n-1, so comparing the label sets compares the
# number of columns.
print("Kiểm tra columns của 5 files đầu tiên:\n")

# Reference layout: column positions of the first file (one row is enough)
first_cols = set(pd.read_csv(csv_files[0], header=None, nrows=1).columns)
all_same = True

for file in csv_files[:5]:
    if file == csv_files[0]:
        cols = first_cols  # reference file: reuse the result, do not re-read
    else:
        df_temp = pd.read_csv(file, header=None, nrows=1)
        cols = set(df_temp.columns)

    if cols == first_cols:
        print(f"✓ {file.name}: {len(cols)} columns - SAME")
    else:
        print(f"✗ {file.name}: {len(cols)} columns - DIFFERENT")
        all_same = False

print(f"\nTất cả files có cùng structure: {all_same}")

## 5. Ước tính tổng số rows khi merge tất cả files

In [None]:
# Estimate the total row count of the merged dataset from the first 10 files.
#
# - header=None: the files have no header row; with the default header
#   inference the first record would be consumed as column labels and every
#   file would be undercounted by one row.
# - usecols=[0]: only the first column is parsed, which is enough to count
#   records and avoids loading every column of each file into memory.
sample_files = csv_files[:10]

total_rows_sample = 0
for file in sample_files:
    n_rows = len(pd.read_csv(file, header=None, usecols=[0]))
    total_rows_sample += n_rows
    print(f"{file.name}: {n_rows:,} rows")

# Divide by the number of files actually sampled (may be fewer than 10)
avg_rows = total_rows_sample / len(sample_files) if sample_files else 0.0
estimated_total = avg_rows * len(csv_files)

print(f"\nTrung bình mỗi file: {avg_rows:,.0f} rows")
print(f"Ước tính tổng số rows khi merge {len(csv_files)} files: {estimated_total:,.0f} rows")