## Merging 4 csv files 

In [1]:
import pandas as pd
import os

# Update your path here:
data_dir = r"C:\Users\Shruti More\NIDS Project"
features_path = os.path.join(data_dir, "NUSW-NB15_features.csv")

# Load column names
features_df = pd.read_csv(features_path, encoding='latin1')
column_names = features_df['Name'].dropna().tolist()

# List of your 4 data CSV files (make sure the file names match exactly)
file_list = ["UNSW-NB15_1.csv", "UNSW-NB15_2.csv", "UNSW-NB15_3.csv", "UNSW-NB15_4.csv"]

all_dfs = []
for file in file_list:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path, header=None, names=column_names, low_memory=False)
    all_dfs.append(df)

combined_df = pd.concat(all_dfs, ignore_index=True)

print(f"Combined DataFrame shape: {combined_df.shape}")
print(combined_df.head())

Combined DataFrame shape: (2540047, 49)
        srcip  sport          dstip dsport proto state       dur  sbytes  \
0  59.166.0.0   1390  149.171.126.6     53   udp   CON  0.001055     132   
1  59.166.0.0  33661  149.171.126.9   1024   udp   CON  0.036133     528   
2  59.166.0.6   1464  149.171.126.7     53   udp   CON  0.001119     146   
3  59.166.0.5   3593  149.171.126.5     53   udp   CON  0.001209     132   
4  59.166.0.3  49664  149.171.126.0     53   udp   CON  0.001169     146   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0     164    31  ...           0           3           7          1   
1     304    31  ...           0           2           4          2   
2     178    31  ...           0          12           8          1   
3     164    31  ...           0           6           9          1   
4     178    31  ...           0           7           9          1   

   ct_src_ ltm  ct_src_dport_ltm  ct_dst_sport_ltm  ct_dst_src_ltm  \
0     

In [2]:
# Check unique attack categories and their counts
print("\nUnique attack categories:")
print(combined_df['attack_cat'].unique())

print("\nAttack category distribution:")
print(combined_df['attack_cat'].value_counts())


Unique attack categories:
[nan 'Exploits' 'Reconnaissance' 'DoS' 'Generic' 'Shellcode' ' Fuzzers'
 'Worms' 'Backdoors' 'Analysis' ' Reconnaissance ' 'Backdoor' ' Fuzzers '
 ' Shellcode ']

Attack category distribution:
attack_cat
Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor              1795
Reconnaissance        1759
 Shellcode            1288
Backdoors              534
Shellcode              223
Worms                  174
Name: count, dtype: int64


In [3]:
# Strip extra spaces and replace inconsistent names
combined_df['attack_cat'] = combined_df['attack_cat'].str.strip()

# Optional: Standardize similar names (you can customize this)
combined_df['attack_cat'] = combined_df['attack_cat'].replace({
    'Backdoors': 'Backdoor',
    'Shellcode': 'Shellcode',  # ensure proper case
    'Fuzzers': 'Fuzzers',
    'Reconnaissance': 'Reconnaissance',
})

# View the cleaned categories
print("\nCleaned unique attack categories:")
print(combined_df['attack_cat'].unique())

print("\nCleaned attack category distribution:")
print(combined_df['attack_cat'].value_counts())


Cleaned unique attack categories:
[nan 'Exploits' 'Reconnaissance' 'DoS' 'Generic' 'Shellcode' 'Fuzzers'
 'Worms' 'Backdoor' 'Analysis']

Cleaned attack category distribution:
attack_cat
Generic           215481
Exploits           44525
Fuzzers            24246
DoS                16353
Reconnaissance     13987
Analysis            2677
Backdoor            2329
Shellcode           1511
Worms                174
Name: count, dtype: int64


## Handling Null Values

In [4]:
# Count of nulls in each column
null_counts = combined_df.isnull().sum()

# Show only columns with nulls
null_counts = null_counts[null_counts > 0]

print("\nColumns with null values:")
print(null_counts)


Columns with null values:
ct_flw_http_mthd    1348145
is_ftp_login        1429879
attack_cat          2218764
dtype: int64


In [5]:
combined_df.drop(['ct_flw_http_mthd', 'is_ftp_login'], axis=1, inplace=True)

In [6]:
# Droped col ct_flw_http_mthd', 'is_ftp_login'
print(combined_df.isnull().sum())

srcip                     0
sport                     0
dstip                     0
dsport                    0
proto                     0
state                     0
dur                       0
sbytes                    0
dbytes                    0
sttl                      0
dttl                      0
sloss                     0
dloss                     0
service                   0
Sload                     0
Dload                     0
Spkts                     0
Dpkts                     0
swin                      0
dwin                      0
stcpb                     0
dtcpb                     0
smeansz                   0
dmeansz                   0
trans_depth               0
res_bdy_len               0
Sjit                      0
Djit                      0
Stime                     0
Ltime                     0
Sintpkt                   0
Dintpkt                   0
tcprtt                    0
synack                    0
ackdat                    0
is_sm_ips_ports     

In [7]:
combined_df.drop(['Label'], axis=1, inplace=True)

In [8]:
# Droped col ct_flw_http_mthd', 'is_ftp_login'
print(combined_df.isnull().sum())

srcip                     0
sport                     0
dstip                     0
dsport                    0
proto                     0
state                     0
dur                       0
sbytes                    0
dbytes                    0
sttl                      0
dttl                      0
sloss                     0
dloss                     0
service                   0
Sload                     0
Dload                     0
Spkts                     0
Dpkts                     0
swin                      0
dwin                      0
stcpb                     0
dtcpb                     0
smeansz                   0
dmeansz                   0
trans_depth               0
res_bdy_len               0
Sjit                      0
Djit                      0
Stime                     0
Ltime                     0
Sintpkt                   0
Dintpkt                   0
tcprtt                    0
synack                    0
ackdat                    0
is_sm_ips_ports     

## Handling Missing values

In [9]:
print(combined_df['attack_cat'].isnull().sum())

2218764


In [10]:
combined_df['attack_cat'] = combined_df['attack_cat'].fillna('unknown')


In [11]:
print(combined_df['attack_cat'].isnull().sum())

0


In [12]:
# Separate the data into normal and attack
normal_df = combined_df[combined_df['attack_cat'] == 'Normal']  # or 'normal', depending on the label case
attack_df = combined_df[combined_df['attack_cat'] != 'Normal']

# Choose a subset of important columns to detect duplicates more meaningfully
subset_columns = ['srcip', 'dstip', 'sport', 'dsport', 'proto', 'state', 'sbytes', 'dbytes']

# Count duplicates in normal data
normal_duplicates = normal_df.duplicated(subset=subset_columns).sum()
print("Number of duplicates in NORMAL data:", normal_duplicates)

# Count duplicates in attack data
attack_duplicates = attack_df.duplicated(subset=subset_columns).sum()
print("Number of duplicates in ATTACK data:", attack_duplicates)


Number of duplicates in NORMAL data: 0
Number of duplicates in ATTACK data: 567981


In [13]:
important_cols = ['srcip', 'dstip','dsport','sport','proto', 'service', 'state', 'ct_ftp_cmd', 'attack_cat']
duplicates = df.duplicated(subset=important_cols)
print("Duplicate rows based on important features:", duplicates.sum())

Duplicate rows based on important features: 148599


In [14]:
for col in ['proto', 'state', 'service', 'ct_ftp_cmd']:
    print(f"{col}: {combined_df[col].nunique()} unique values")

proto: 135 unique values
state: 16 unique values
service: 13 unique values
ct_ftp_cmd: 13 unique values


In [15]:
# Make sure final_df is defined
final_df = combined_df.copy()

# Step 1: Identify the top 29 protocols
top_n = 29
top_protocols = final_df['proto'].value_counts().nlargest(top_n).index

# Step 2: Replace rare protocols with "Other"
final_df['proto'] = final_df['proto'].apply(lambda x: x if x in top_protocols else 'Other')

# Step 3: Optional - Check the result
print("Unique values in 'proto' after reduction:", final_df['proto'].nunique())
print(final_df['proto'].value_counts())


Unique values in 'proto' after reduction: 30
proto
tcp           1495074
udp            990435
unas            16202
Other           14055
arp             10064
ospf             7798
sctp             1525
icmp              524
any               411
gre               324
rsvp              274
ipv6              272
swipe             262
pim               262
sun-nd            262
mobile            262
sep               260
ipip              137
pri-enc           137
etherip           137
encap             137
aes-sp3-d         137
micp              137
sprite-rpc        137
ax.25             137
mtp               137
larp              137
eigrp             137
tcf               137
vmtp              137
Name: count, dtype: int64


In [16]:
#shape before encoding
final_df.shape

(2540047, 46)

In [17]:
# Replace less frequent protocols with 'Other'
final_df['proto'] = final_df['proto'].apply(lambda x: x if x in top_protocols else 'Other')


In [18]:
#shape before encoding
final_df.shape

(2540047, 46)

In [19]:
print(final_df['proto'].value_counts())
print("Unique protocols after grouping:", final_df['proto'].nunique())


proto
tcp           1495074
udp            990435
unas            16202
Other           14055
arp             10064
ospf             7798
sctp             1525
icmp              524
any               411
gre               324
rsvp              274
ipv6              272
swipe             262
pim               262
sun-nd            262
mobile            262
sep               260
ipip              137
pri-enc           137
etherip           137
encap             137
aes-sp3-d         137
micp              137
sprite-rpc        137
ax.25             137
mtp               137
larp              137
eigrp             137
tcf               137
vmtp              137
Name: count, dtype: int64
Unique protocols after grouping: 30


In [20]:
categorical_cols = ['proto', 'state', 'service', 'ct_ftp_cmd']


In [21]:
# Apply One-Hot Encoding
final_df = pd.get_dummies(final_df, columns=categorical_cols, drop_first=True)

# Show the new shape and sample
print("✅ New shape after encoding:", final_df.shape)
print(final_df.head())


✅ New shape after encoding: (2540047, 110)
        srcip  sport          dstip dsport       dur  sbytes  dbytes  sttl  \
0  59.166.0.0   1390  149.171.126.6     53  0.001055     132     164    31   
1  59.166.0.0  33661  149.171.126.9   1024  0.036133     528     304    31   
2  59.166.0.6   1464  149.171.126.7     53  0.001119     146     178    31   
3  59.166.0.5   3593  149.171.126.5     53  0.001209     132     164    31   
4  59.166.0.3  49664  149.171.126.0     53  0.001169     146     178    31   

   dttl  sloss  ...  ct_ftp_cmd_3  ct_ftp_cmd_4  ct_ftp_cmd_5  ct_ftp_cmd_6  \
0    29      0  ...         False         False         False         False   
1    29      0  ...         False         False         False         False   
2    29      0  ...         False         False         False         False   
3    29      0  ...         False         False         False         False   
4    29      0  ...         False         False         False         False   

   ct_ftp_cmd

In [22]:
# Convert True/False to 1/0 for the entire DataFrame
final_df = final_df.astype({col: int for col in final_df.select_dtypes(include='bool').columns})

# Check result
print("✅ Converted all boolean columns to 0 and 1.")
print(final_df.head())


✅ Converted all boolean columns to 0 and 1.
        srcip  sport          dstip dsport       dur  sbytes  dbytes  sttl  \
0  59.166.0.0   1390  149.171.126.6     53  0.001055     132     164    31   
1  59.166.0.0  33661  149.171.126.9   1024  0.036133     528     304    31   
2  59.166.0.6   1464  149.171.126.7     53  0.001119     146     178    31   
3  59.166.0.5   3593  149.171.126.5     53  0.001209     132     164    31   
4  59.166.0.3  49664  149.171.126.0     53  0.001169     146     178    31   

   dttl  sloss  ...  ct_ftp_cmd_3  ct_ftp_cmd_4  ct_ftp_cmd_5  ct_ftp_cmd_6  \
0    29      0  ...             0             0             0             0   
1    29      0  ...             0             0             0             0   
2    29      0  ...             0             0             0             0   
3    29      0  ...             0             0             0             0   
4    29      0  ...             0             0             0             0   

   ct_ftp_cm

## Selecting top 50 features

In [23]:
# Show only columns that have at least 1 NaN
null_columns = final_df.columns[final_df.isnull().any()]
print("Columns with missing values:")
print(final_df[null_columns].isnull().sum())


Columns with missing values:
Series([], dtype: float64)


In [24]:
final_df['attack_cat'] = final_df['attack_cat'].fillna('unknown')


In [25]:
print(final_df['attack_cat'].isnull().sum())  # should print 0


0


In [26]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Drop non-numeric or identifier columns (IPs and ports)
cols_to_drop = ['srcip', 'dstip', 'sport', 'dsport']
X = final_df.drop(columns=cols_to_drop + ['attack_cat'])  # 'attack_cat' is the target
y = final_df['attack_cat']

# Apply SelectKBest to select top 50 features
selector = SelectKBest(score_func=f_classif, k=50)
X_selected = selector.fit_transform(X, y)

# Get the names of selected features
selected_features = X.columns[selector.get_support()]
print("Top 50 selected features:\n", selected_features)


Top 50 selected features:
 Index(['sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'Sjit', 'Djit', 'Stime', 'Ltime', 'tcprtt',
       'synack', 'ackdat', 'ct_state_ttl', 'ct_srv_src', 'ct_srv_dst',
       'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'proto_any', 'proto_gre', 'proto_ospf', 'proto_sctp',
       'proto_tcp', 'proto_udp', 'proto_unas', 'state_CON', 'state_FIN',
       'state_INT', 'service_dns', 'service_ftp-data', 'service_http',
       'service_pop3', 'service_smtp', 'service_snmp', 'ct_ftp_cmd_ ',
       'ct_ftp_cmd_0'],
      dtype='object')


In [27]:
# Keep only selected features
X = X[selected_features]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (2032037, 50)
X_test shape: (508010, 50)
y_train shape: (2032037,)
y_test shape: (508010,)


## Modelling 

### Random Forest Classifier 

In [29]:
from sklearn.metrics import classification_report, accuracy_score
import time

# Example: Random Forest
from sklearn.ensemble import RandomForestClassifier

# 1. Initialize model
rf = RandomForestClassifier(random_state=42)

# 2. Train
start_train = time.time()
rf.fit(X_train, y_train)
end_train = time.time()

# 3. Predict
start_pred = time.time()
y_pred = rf.predict(X_test)
end_pred = time.time()

# 4. Evaluate
print("🎯 Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"🛠️ Training time: {end_train - start_train:.2f} seconds")
print(f"📤 Prediction time: {end_pred - start_pred:.2f} seconds")


🎯 Accuracy: 0.9822936556367
                precision    recall  f1-score   support

      Analysis       0.69      0.08      0.14       550
      Backdoor       0.83      0.08      0.15       477
           DoS       0.33      0.26      0.29      3315
      Exploits       0.62      0.81      0.70      8732
       Fuzzers       0.73      0.64      0.68      4784
       Generic       1.00      0.99      0.99     43110
Reconnaissance       0.93      0.78      0.85      2850
     Shellcode       0.66      0.65      0.65       320
         Worms       0.57      0.10      0.17        41
       unknown       1.00      1.00      1.00    443831

      accuracy                           0.98    508010
     macro avg       0.74      0.54      0.56    508010
  weighted avg       0.98      0.98      0.98    508010

🛠️ Training time: 613.44 seconds
📤 Prediction time: 11.42 seconds


In [30]:
import joblib


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# Reuse X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols)
])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# ✅ THIS IS REQUIRED
pipeline_rf.fit(X_train, y_train)

# ✅ Now save the trained pipeline
joblib.dump(pipeline_rf, 'pipeline_rf.pkl')
print("✅ Saved: pipeline_rf.pkl")


In [33]:
# ---- create a 5-row sample CSV for manual testing ----
import pandas as pd

# X is the feature matrix you used for training *after* SelectKBest
X = final_df[selected_features]        # make sure selected_features is your top-50 list

sample_df = X.head(5)                  # take first 5 rows (or .sample(5, random_state=42))
sample_path = "test_sample_5rows.csv"  # will be saved in current folder
sample_df.to_csv(sample_path, index=False)

print(f"✅ Saved test file: {sample_path}")
display(sample_df)                     # optional: show what you saved


✅ Saved test file: test_sample_5rows.csv


Unnamed: 0,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,Dpkts,...,state_FIN,state_INT,service_dns,service_ftp-data,service_http,service_pop3,service_smtp,service_snmp,ct_ftp_cmd_,ct_ftp_cmd_0
0,132,164,31,29,0,0,500473.9375,621800.9375,2,2,...,0,0,1,0,0,0,0,0,0,0
1,528,304,31,29,0,0,87676.08594,50480.17188,4,4,...,0,0,0,0,0,0,0,0,0,0
2,146,178,31,29,0,0,521894.5313,636282.375,2,2,...,0,0,1,0,0,0,0,0,0,0
3,132,164,31,29,0,0,436724.5625,542597.1875,2,2,...,0,0,1,0,0,0,0,0,0,0
4,146,178,31,29,0,0,499572.25,609067.5625,2,2,...,0,0,1,0,0,0,0,0,0,0


In [34]:
# Assuming your processed DataFrame is called 'final_df'
sample_df = final_df.head(5)

# Save to CSV without index column
sample_df.to_csv("manual_test_sample.csv", index=False)

print("✅ Saved: manual_test_sample.csv")


✅ Saved: manual_test_sample.csv


In [35]:
random_sample = combined_df.sample(n=5, random_state=42)
print(random_sample)


                  srcip  sport          dstip dsport proto state       dur  \
53230        59.166.0.5   6807  149.171.126.6     21   tcp   FIN  0.883703   
1823779      59.166.0.8   3083  149.171.126.8   1106   tcp   FIN  0.331832   
2479738  149.171.126.10   1043   175.45.176.0     53   udp   INT  0.000007   
923745       59.166.0.5  13382  149.171.126.2     53   udp   CON  0.001122   
1700724      59.166.0.7  34971  149.171.126.1     80   tcp   FIN  1.122783   

         sbytes  dbytes  sttl  ...  ct_state_ttl  ct_ftp_cmd  ct_srv_src  \
53230      2934    3742    31  ...             0           1           1   
1823779    5486   94690    31  ...             0                       5   
2479738     264       0    60  ...             0                      31   
923745      130     162    31  ...             0           0           1   
1700724    1580   10168    31  ...             0                       1   

        ct_srv_dst  ct_dst_ltm  ct_src_ ltm  ct_src_dport_ltm  \
53230    

In [36]:
random_sample = combined_df.sample(n=5, random_state=42)
print(random_sample)

                  srcip  sport          dstip dsport proto state       dur  \
53230        59.166.0.5   6807  149.171.126.6     21   tcp   FIN  0.883703   
1823779      59.166.0.8   3083  149.171.126.8   1106   tcp   FIN  0.331832   
2479738  149.171.126.10   1043   175.45.176.0     53   udp   INT  0.000007   
923745       59.166.0.5  13382  149.171.126.2     53   udp   CON  0.001122   
1700724      59.166.0.7  34971  149.171.126.1     80   tcp   FIN  1.122783   

         sbytes  dbytes  sttl  ...  ct_state_ttl  ct_ftp_cmd  ct_srv_src  \
53230      2934    3742    31  ...             0           1           1   
1823779    5486   94690    31  ...             0                       5   
2479738     264       0    60  ...             0                      31   
923745      130     162    31  ...             0           0           1   
1700724    1580   10168    31  ...             0                       1   

        ct_srv_dst  ct_dst_ltm  ct_src_ ltm  ct_src_dport_ltm  \
53230    

In [37]:
random_sample.to_csv("sample_5_rows.csv", index=False)


In [2]:
import pandas as pd

# Load the already saved combined dataset
combined_path = r"C:\Users\Shruti More\NIDS Project\UNSW_combined.csv"
combined_df = pd.read_csv(combined_path, low_memory=False)

# Filter attack rows only (non-normal)
attack_rows = combined_df[combined_df['attack_cat'].astype(str).str.lower().str.strip() != 'normal']

# Take 5 random attack rows
sampled_attacks = attack_rows.sample(n=5, random_state=42)

# Save to CSV
output_path = r"C:\Users\Shruti More\NIDS Project\attack_only_5_sample.csv"
sampled_attacks.to_csv(output_path, index=False)

print(f"✅ Saved 5 random attack rows to: {output_path}")


✅ Saved 5 random attack rows to: C:\Users\Shruti More\NIDS Project\attack_only_5_sample.csv


In [3]:
import pandas as pd

# Load the combined dataset
combined_path = r"C:\Users\Shruti More\NIDS Project\UNSW_combined.csv"
df = pd.read_csv(combined_path, low_memory=False)

# Clean and standardize the 'attack_cat' column
df['attack_cat'] = df['attack_cat'].astype(str).str.strip().str.lower()

# Filter for only 'dos' and 'exploits'
filtered_df = df[df['attack_cat'].isin(['dos', 'exploits'])]

# Take a random sample (change n=10 if you want more or fewer)
sampled_df = filtered_df.sample(n=10, random_state=42)

# Save the result
output_path = r"C:\Users\Shruti More\NIDS Project\dos_exploits_sample.csv"
sampled_df.to_csv(output_path, index=False)

print(f"✅ Saved DoS & Exploits sample to: {output_path}")


✅ Saved DoS & Exploits sample to: C:\Users\Shruti More\NIDS Project\dos_exploits_sample.csv
