In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## 1. Load All CSV Files

In [2]:
# Directory holding the raw CSV files.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = Path(r'c:\Users\Tplab\OneDrive\CNN tutorial\data\raw')

# Sorted so that the file order (and therefore any "first N files" slice) is stable.
csv_files = sorted(data_path.glob('*.csv'))

print(f"Tổng số file: {len(csv_files)}")

# Column names, in file order; they are passed to read_csv via `names=`.
column_names = (
    'pkSeqID stime flgs proto saddr sport daddr dport '
    'pkts bytes state ltime seq dur mean stddev '
    'smac dmac sum min max soui doui sco dco '
    'spkts dpkts sbytes dbytes rate srate drate '
    'attack category subcategory'
).split()

Tổng số file: 75


In [None]:
# Merge only the first NUM_FILES files to avoid running out of RAM.
# Raise NUM_FILES once the pipeline has been validated.
NUM_FILES = 20

# Slicing silently yields fewer files when fewer exist, so work with the real
# selection and report its actual size in the progress messages.
files_to_load = csv_files[:NUM_FILES]
if not files_to_load:
    # Fail here with a clear message instead of letting pd.concat([]) raise
    # an unrelated "No objects to concatenate" error further down.
    raise FileNotFoundError(f"No CSV files found in {data_path}")

print(f"Đang merge {len(files_to_load)} files đầu tiên...")
df_list = []

for i, file in enumerate(files_to_load, 1):
    # header=None: the first row is read as data; names come from column_names.
    # The frame is appended directly - binding it to a temporary name and
    # deleting that name would not free memory, because the list keeps a reference.
    df_list.append(
        pd.read_csv(file, header=None, names=column_names, low_memory=False)
    )
    print(f"Đã load {i}/{len(files_to_load)} files...")

# Concatenate all per-file frames into one, with a fresh 0..n-1 index.
print("\nĐang concat dataframes...")
df = pd.concat(df_list, ignore_index=True)

# Drop the list so the per-file frames can be garbage-collected.
del df_list

print("\nMerge hoàn tất!")
print(f"Total shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

## 2. Data Cleaning

In [None]:
# Drop the columns that are expected to be completely empty.
# NOTE(review): the list is hard-coded; this cell does not verify that the
# columns really contain no data - confirm against the raw files.
columns_to_drop = ['smac', 'dmac', 'soui', 'doui', 'sco', 'dco']

# Only drop what is still present, so re-running the cell does not raise
# KeyError and the reported count reflects what was actually removed.
dropped_columns = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=dropped_columns)

print(f"Đã xóa {len(dropped_columns)} cột rỗng")
print(f"Shape sau khi xóa: {df.shape}")

NameError: name 'df' is not defined

In [None]:
# Flag fully duplicated rows once; the mask serves both the count and the
# filter, so the full-frame duplicate scan is not repeated.
duplicate_mask = df.duplicated()
print(f"Số dòng trùng lặp: {duplicate_mask.sum():,}")

# Keep the first occurrence of each row - the same rows that
# df.drop_duplicates() with default arguments would keep.
df = df[~duplicate_mask]
del duplicate_mask
print(f"Shape sau khi xóa duplicates: {df.shape}")

In [None]:
# Handle missing values in the source/destination port columns.
port_columns = ['sport', 'dport']

print("Missing values trước khi xử lý:")
print(df[port_columns].isnull().sum())

# Replace missing ports with the sentinel string '0'.
for port_col in port_columns:
    df[port_col] = df[port_col].fillna('0')

print("\nMissing values sau khi xử lý:")
print(df[port_columns].isnull().sum())

## 3. Analyze Label Distribution

In [None]:
# Binary attack label: absolute counts first, then percentages.
print("Attack distribution:")
print(df['attack'].value_counts())
attack_percent = df['attack'].value_counts(normalize=True)*100
print(f"\nAttack ratio: {attack_percent}")

# Same report for the two finer-grained label columns, each under a rule line.
rule = "\n" + "="*50
for heading, label_col in (
    ("Category distribution:", 'category'),
    ("Subcategory distribution:", 'subcategory'),
):
    print(rule)
    print(heading)
    print(df[label_col].value_counts())

In [None]:
# Bar charts of the three label columns, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Per panel: (column, title, keep only the N most frequent values or None for
# all, whether to rotate the x tick labels by 45 degrees).
panels = [
    ('attack', 'Attack Distribution (0=Normal, 1=Attack)', None, False),
    ('category', 'Category Distribution', None, True),
    ('subcategory', 'Top 10 Subcategory Distribution', 10, True),
]

for ax, (label_col, title, top_n, rotate_ticks) in zip(axes, panels):
    counts = df[label_col].value_counts()
    if top_n is not None:
        counts = counts.head(top_n)
    counts.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Count')
    if rotate_ticks:
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
# Feature columns that need encoding: every object-dtype column except the
# two label columns, which are handled separately.
label_columns = ('category', 'subcategory')
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns.tolist()
    if name not in label_columns
]

print(f"Categorical columns cần encode: {categorical_cols}")

In [None]:
# Label-encode the categorical feature columns in place.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in le_dict so each mapping stays available.
le_dict = {col: LabelEncoder() for col in categorical_cols}

for col, le in le_dict.items():
    # Values are cast to str before fitting, so every value is encoded as text.
    df[col] = le.fit_transform(df[col].astype(str))
    print(f"Encoded {col}: {len(le.classes_)} unique values")

In [None]:
# Label-encode the two target columns, one dedicated encoder each.
le_category, le_subcategory = LabelEncoder(), LabelEncoder()

# The encoded values go into new '<column>_encoded' columns; the original
# string columns are left untouched.
for target_col, target_encoder in (
    ('category', le_category),
    ('subcategory', le_subcategory),
):
    df[f'{target_col}_encoded'] = target_encoder.fit_transform(df[target_col])

# Print the integer -> class-name mapping learned by each encoder.
for heading, target_encoder in (
    ("Category mapping:", le_category),
    ("\nSubcategory mapping:", le_subcategory),
):
    print(heading)
    for code, class_name in enumerate(target_encoder.classes_):
        print(f"  {code}: {class_name}")

## 5. Save Processed Dataset

In [None]:
# Save the processed dataset as a single CSV file.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_path = Path(r'c:\Users\Tplab\OneDrive\CNN tutorial\data\processed')

# parents=True creates any missing intermediate directories as well; without it
# mkdir raises FileNotFoundError when the parent folder does not exist yet.
output_path.mkdir(parents=True, exist_ok=True)

output_file = output_path / 'bot_iot_processed.csv'
# index=False: the row index is not written to the file.
df.to_csv(output_file, index=False)

print(f"Đã lưu dataset vào: {output_file}")
print(f"File size: {output_file.stat().st_size / 1024**3:.2f} GB")
print(f"Final shape: {df.shape}")

In [None]:
# Persist the fitted label encoders for later reuse.
import pickle

# Feature encoders are grouped under 'categorical'; the two target encoders
# are stored under their own keys.
encoders = dict(
    categorical=le_dict,
    category=le_category,
    subcategory=le_subcategory,
)

# Stored next to the processed dataset.
# Only unpickle this file from a trusted location: loading a pickle can run code.
encoder_file = output_path / 'label_encoders.pkl'
with encoder_file.open('wb') as f:
    pickle.dump(encoders, f)

print("Đã lưu label encoders!")

## 6. Dataset Summary

In [None]:
# Final overview of the processed frame.
banner = "=" * 60
print(banner)
print("DATASET SUMMARY")
print(banner)

# "Total features" is the full column count, label columns included.
n_rows, n_cols = df.shape
print(f"Total samples: {n_rows:,}")
print(f"Total features: {n_cols}")

print("\nAttack distribution:")
print(df['attack'].value_counts())

# deep=True also counts the memory held by object (string) columns.
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nMemory usage: {total_bytes / 1024**3:.2f} GB")
print(f"\nColumns: {df.columns.tolist()}")