In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full parquet dataset
data_path = '/home/tauhid/llm_network_intrusion/dataset/iot_dataset/iot_network_flow_data_benign_mirai_bruteforce_spoofing.parquet'
df = pd.read_parquet(data_path)

# Label binarization: 0 = benign, 1 = attack.
# The label is lower-cased before the prefix test, so the prefix itself must
# be lower-case; a capitalised 'Benign' can never match a lower-cased string
# and would silently mark every row as an attack.
df['label'] = df['label'].apply(lambda x: 0 if x.lower().startswith('benign') else 1)

# Create text column from all features except label: every remaining column
# is cast to str and the values of a row are joined with ', '.
# This is done once on the full frame *before* splitting, so the split
# frames are never mutated afterwards (assigning a new column to the frames
# returned by train_test_split can raise pandas' SettingWithCopyWarning) and
# the expression is not duplicated for train and test.
# NOTE(review): the text includes identifier-like columns such as
# 'Unnamed: 0', 'Flow ID' and 'Timestamp' -- confirm that is intended.
df['text'] = df.drop(columns='label').astype(str).agg(', '.join, axis=1)

# Train-test split (80%-20%), stratified on the binary label so both parts
# keep the same benign/attack ratio; random_state fixes the shuffle.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Overview
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
print("\nTrain label distribution:\n", df_train['label'].value_counts(normalize=True).round(2))
print("\nTest label distribution:\n", df_test['label'].value_counts(normalize=True).round(2))

# Show a few rows (last expression of the cell, rendered by the notebook)
df_train.head()

Train shape: (527135, 85)
Test shape: (131784, 85)

Train label distribution:
 label
0    0.6
1    0.4
Name: proportion, dtype: float64

Test label distribution:
 label
0    0.6
1    0.4
Name: proportion, dtype: float64


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label,text
367756,192.168.137.133-47.112.162.49-26118-32100-17,192.168.137.133,26118,47.112.162.49,32100,17,08/10/2022 04:20:25 PM,88064163,3,3,...,299486.0,1890.804,300823.0,298149.0,43579758.0,1578.262,43580874.0,43578642.0,0,"192.168.137.133-47.112.162.49-26118-32100-17, ..."
539929,192.168.137.184-3.141.175.104-55831-8006-17,192.168.137.184,55831,3.141.175.104,8006,17,16/01/2023 09:20:03 AM,80270262,3,3,...,48263.0,2965.606,50360.0,46166.0,40065752.0,5628.57,40069732.0,40061772.0,1,"192.168.137.184-3.141.175.104-55831-8006-17, 1..."
529413,192.168.137.163-92.118.63.74-35342-123-17,192.168.137.163,35342,92.118.63.74,123,17,13/01/2023 12:36:17 PM,313057,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"192.168.137.163-92.118.63.74-35342-123-17, 192..."
525886,192.168.137.17-13.59.192.161-32874-443-6,192.168.137.17,32874,13.59.192.161,443,6,13/01/2023 12:41:05 PM,97701838,26,35,...,6302537.0,11135110.0,22992248.0,129531.0,12054310.5,5155327.0,20352116.0,6002529.0,1,"192.168.137.17-13.59.192.161-32874-443-6, 192...."
650822,192.168.137.162-239.255.255.250-37563-1900-17,192.168.137.162,37563,239.255.255.250,1900,17,12/01/2023 03:31:09 PM,5704911,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"192.168.137.162-239.255.255.250-37563-1900-17,..."
