In [127]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [128]:
# Load and reduce pseudo logs
pseudo_logs = pd.read_csv("../datasets/Vijaya_Pseudo_Logs.csv").sample(100, random_state=42)
pseudo_logs.to_csv("../datasets/Vijaya_Pseudo_Logs_reduced.csv", index=False)

In [139]:
# Load CIC-IDS dataset
cic = pd.read_csv("../datasets/CIC_IDS_2017/Friday-WorkingHours-Morning.pcap_ISCX.csv").sample(300, random_state=42)

# Create label column
cic['label'] = cic[' Label'].apply(lambda x: 'Attack' if str(x).strip().upper() != 'BENIGN' else 'Normal')

# Create synthetic network data (using available columns)
processed_cic = pd.DataFrame({
    'timestamp': pd.to_datetime('2023-01-01') + pd.to_timedelta(cic[' Flow Duration']//1000, unit='s'),
    'source_ip': '192.168.' + (cic[' Destination Port']//100).astype(str) + '.' + (cic[' Flow Duration']%100).astype(str),
    'destination_ip': '10.0.' + (cic[' Total Fwd Packets']%100).astype(str) + '.' + (cic[' Total Backward Packets']%100).astype(str),
    'protocol': np.where(cic[' Destination Port'].isin([80, 443]), 'TCP', 'UDP'),  # Simple protocol inference
    'port': cic[' Destination Port'],
    'action': np.where(cic['label'] == 'Attack', 'DENY', 'ALLOW'),
    'label': cic['label']
})

# Save cleaned data
processed_cic.to_csv("../datasets/CIC_IDS_2017_processed.csv", index=False)
print("CIC-IDS processed data saved. Sample:")
print(processed_cic.head())

CIC-IDS processed data saved. Sample:
                 timestamp       source_ip destination_ip protocol   port  \
32220  2023-01-01 00:00:23     192.168.0.8       10.0.2.2      UDP     53   
151693 2023-01-01 00:01:45    192.168.0.86       10.0.1.1      TCP     80   
188814 2023-01-01 00:00:33  192.168.538.60       10.0.2.0      UDP  53891   
38470  2023-01-01 00:03:05     192.168.0.4       10.0.1.1      UDP     53   
75568  2023-01-01 00:00:23    192.168.0.68       10.0.1.1      UDP     53   

       action   label  
32220   ALLOW  Normal  
151693  ALLOW  Normal  
188814  ALLOW  Normal  
38470   ALLOW  Normal  
75568   ALLOW  Normal  


In [141]:
# NSL-KDD column names (adjust based on your dataset)
cols = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
        "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", 
        "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
        "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", 
        "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
        "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", 
        "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", 
        "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "attack_type", "label"]

nsl = pd.read_csv("../datasets/NSL_KDD/KDDTrain+.txt", names=cols).sample(300, random_state=42)
nsl = nsl[['protocol_type', 'src_bytes', 'dst_bytes', 'label']]
nsl.columns = ['protocol', 'bytes_sent', 'bytes_received', 'label']
nsl['action'] = np.where(nsl['label'] == 'normal', 'ALLOW', 'DENY')  # Simulate action
# Save cleaned data
nsl.to_csv("../datasets/NSL_KDD_processed.csv", index=False)
print("NSL-KDD processed data saved. Sample:")
print(nsl.head())

NSL-KDD processed data saved. Sample:
      protocol  bytes_sent  bytes_received  label action
378        udp          36               0     21   DENY
32038      tcp           0               0     18   DENY
86399      tcp           0               0     21   DENY
74412      tcp           0               0     19   DENY
52951      tcp           0               0     15   DENY


In [140]:
# Load firewall logs with proper parsing
firewall_logs = pd.read_csv("../datasets/VijayaEnterprises_Firewall_logs.csv", 
                          skiprows=2,
                          header=None,
                          names=['index', 'time', 'empty1', 'message', 'empty2', 
                                 'source', 'empty3', 'destination', 'empty4', 'notes', 'empty5'])

# Clean and extract fields
firewall_logs_clean = pd.DataFrame({
    'timestamp': firewall_logs['time'],
    'source_ip': firewall_logs['source'].str.extract(r'(\d+\.\d+\.\d+\.\d+)')[0].fillna('0.0.0.0'),
    'destination_ip': firewall_logs['destination'].str.extract(r'(\d+\.\d+\.\d+\.\d+)')[0].fillna('0.0.0.0'),
    'protocol': firewall_logs['notes'].str.extract(r'(TCP|UDP|IP)')[0].fillna('UNKNOWN'),
    'port': firewall_logs['source'].str.extract(r', (\d+),')[0].fillna(0).astype(int),
    'action': firewall_logs['message'].str.extract(r'(ALLOW|DENY|DROPPED)', flags=re.IGNORECASE)[0].str.upper().fillna('UNKNOWN'),
    'label': np.where(firewall_logs['message'].str.contains('drop', case=False), 'Attack', 'Normal')
})

# Drop completely empty rows
firewall_logs_clean = firewall_logs_clean.dropna(how='all')

print("\nFinal Firewall processed data:")
print(firewall_logs_clean.head())
print(f"\nLabel distribution:\n{firewall_logs_clean['label'].value_counts()}")
# Save the cleaned firewall data
firewall_logs_clean.to_csv("../datasets/Vijaya_Firewall_processed.csv", index=False)
print("\nFirewall data saved. Label distribution:")
print(firewall_logs_clean['label'].value_counts())


Final Firewall processed data:
  timestamp      source_ip   destination_ip protocol   port   action   label
0   20:22.2        0.0.0.0  255.255.255.255      UDP     68  DROPPED  Attack
1   20:08.8    10.90.90.90  239.255.255.100       IP      0  DROPPED  Attack
2   19:57.2    10.90.90.90  255.255.255.255  UNKNOWN      0  DROPPED  Attack
3   19:56.6  192.168.1.201   203.145.184.32      UDP  45233    ALLOW  Normal
4   19:38.6  169.254.31.13  169.254.255.255  UNKNOWN    137  DROPPED  Attack

Label distribution:
Attack    31
Normal    18
Name: label, dtype: int64

Firewall data saved. Label distribution:
Attack    31
Normal    18
Name: label, dtype: int64


In [144]:
import pandas as pd

# Load merged dataset
final_df = pd.read_csv("../datasets/FINAL_MERGED_DATASET.csv")

# Standardize labels (NSL-KDD uses numbers, others use text)
attack_keywords = ['21', '18', '20', 'IP_Spoofing', 'Brute_Force', 'Port_Scan', 
                  '19', '16', '17', '15', '11', '14', '10', '7', '6']

final_df['label'] = final_df['label'].apply(
    lambda x: 'Attack' if str(x) in attack_keywords or 'Spoof' in str(x) or 'Force' in str(x) or 'Scan' in str(x) 
    else 'Normal'
)

# Verify
print("✅ Standardized label distribution:\n", final_df['label'].value_counts())
final_df.to_csv("../datasets/FINAL_MERGED_DATASET_CLEAN.csv", index=False)

✅ Standardized label distribution:
 Normal    418
Attack    331
Name: label, dtype: int64
