In [25]:
import pandas as pd

train_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/train_set_fixed.csv"
test_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/test_set_fixed.csv"

# Reload datasets
train_df = pd.read_csv(train_file, encoding="utf-8")
test_df = pd.read_csv(test_file, encoding="utf-8")

print("Train Dataset Loaded! Shape:", train_df.shape)
print("Test Dataset Loaded! Shape:", test_df.shape)

Train Dataset Loaded! Shape: (1618172, 16)
Test Dataset Loaded! Shape: (404543, 16)


In [27]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Prepare data (Remove label column for training, since Isolation Forest is unsupervised)
features = [col for col in train_df.columns if col != "Label"]

# Standardize the feature values for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[features])
X_test = scaler.transform(test_df[features])  # Use the same scaler for test set

# Train Isolation Forest
iso_forest = IsolationForest(n_estimators=100, contamination="auto", random_state=42)
iso_forest.fit(X_train)

# Predict anomalies (-1 = anomaly, 1 = normal)
test_predictions = iso_forest.predict(X_test)

# Convert to binary format (to match ground truth: "Benign" → 1, "Attack" → -1)
y_test = np.where(test_df["Label"] == "Benign", 1, -1)

# Compare predictions vs. actual labels
conf_matrix = confusion_matrix(y_test, test_predictions)
class_report = classification_report(y_test, test_predictions, target_names=["Attack (-1)", "Benign (1)"])

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Confusion Matrix:
 [[  4524 133775]
 [ 36505 229739]]

Classification Report:
               precision    recall  f1-score   support

 Attack (-1)       0.11      0.03      0.05    138299
  Benign (1)       0.63      0.86      0.73    266244

    accuracy                           0.58    404543
   macro avg       0.37      0.45      0.39    404543
weighted avg       0.45      0.58      0.50    404543



In [29]:
# Check class distribution in train and test datasets
train_class_distribution = train_df["Label"].value_counts(normalize=True) * 100  # Percentage distribution
test_class_distribution = test_df["Label"].value_counts(normalize=True) * 100

# Display results
print("Train Set Distribution:\n", train_class_distribution)
print("\nTest Set Distribution:\n", test_class_distribution)

Train Set Distribution:
 Label
Benign                    65.813461
DDoS attacks-LOIC-HTTP    28.485229
Infilteration              3.117963
DoS attacks-GoldenEye      2.049473
DoS attacks-Slowloris      0.508475
Brute Force -Web           0.017921
Brute Force -XSS           0.007478
Name: proportion, dtype: float64

Test Set Distribution:
 Label
Benign                    65.813523
DDoS attacks-LOIC-HTTP    28.485229
Infilteration              3.118086
DoS attacks-GoldenEye      2.049473
DoS attacks-Slowloris      0.508475
Brute Force -Web           0.017798
Brute Force -XSS           0.007416
Name: proportion, dtype: float64


In [33]:
# Check statistical summary of features
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Bwd Pkt Len Mean,1618172.0,130.8282,176.3963,0.0,0.0,77.0,225.8571,1827.135
Flow IAT Mean,1618172.0,8449768.0,20198200.0,0.333333,7784.666667,190853.1,3003813.0,119990000.0
Fwd Pkt Len Mean,1618172.0,37.39131,56.59224,0.0,0.0,10.33333,51.0,2946.679
Flow IAT Std,1618172.0,1096791.0,4120390.0,0.0,0.0,5781.879,468101.4,84189120.0
TotLen Fwd Pkts,1618172.0,1930.431,143350.2,0.0,0.0,31.0,286.0,144391800.0
Flow Duration,1618172.0,17458860.0,33023180.0,1.0,14093.0,1412144.0,14418770.0,120000000.0
Bwd Pkts/s,1618172.0,3864.629,37479.11,0.0,0.0,2.438074,26.4573,2000000.0
Flow Pkts/s,1618172.0,23892.73,186537.7,0.016668,0.402143,5.791174,171.9986,4000000.0
Fwd IAT Std,1618172.0,1000956.0,3848971.0,0.0,0.0,9.192388,45704.44,84628980.0
Flow Byts/s,1618172.0,168015.8,3198542.0,0.0,0.0,614.0187,2399.163,1230000000.0


In [35]:
# Add predictions to the test dataset
test_df["Predicted_Label"] = test_predictions  # Assuming you stored Isolation Forest's output in 'test_predictions'

# Count misclassifications
misclassified = test_df[test_df["Label"] != test_df["Predicted_Label"]]

# Check which attack types were misclassified the most
print(misclassified["Label"].value_counts())

Label
Benign                    266244
DDoS attacks-LOIC-HTTP    115235
Infilteration              12614
DoS attacks-GoldenEye       8291
DoS attacks-Slowloris       2057
Brute Force -Web              72
Brute Force -XSS              30
Name: count, dtype: int64


In [11]:
import os

dataset_path = "/Users/akashthanneeru/Desktop/INADS_Data/Data/"
train_file = os.path.join(dataset_path, "train_set_fixed.csv")
test_file = os.path.join(dataset_path, "test_set_fixed.csv")

# Check if files exist and try reading the first line
for file in [train_file, test_file]:
    if os.path.exists(file):
        print(f" {os.path.basename(file)} exists! Checking readability...")
        with open(file, "r", encoding="utf-8") as f:
            print(f.readline())  # Print first line to verify format
    else:
        print(f" {os.path.basename(file)} does not exist!")

 train_set_fixed.csv exists! Checking readability...
Bwd Pkt Len Mean,Flow IAT Mean,Fwd Pkt Len Mean,Flow IAT Std,TotLen Fwd Pkts,Flow Duration,Bwd Pkts/s,Flow Pkts/s,Fwd IAT Std,Flow Byts/s,Fwd Pkt Len Max,Flow IAT Max,Init Fwd Win Byts,Fwd Seg Size Min,Dst Port,Label

 test_set_fixed.csv exists! Checking readability...
Bwd Pkt Len Mean,Flow IAT Mean,Fwd Pkt Len Mean,Flow IAT Std,TotLen Fwd Pkts,Flow Duration,Bwd Pkts/s,Flow Pkts/s,Fwd IAT Std,Flow Byts/s,Fwd Pkt Len Max,Flow IAT Max,Init Fwd Win Byts,Fwd Seg Size Min,Dst Port,Label



In [13]:
import os

dataset_path = "/Users/akashthanneeru/Desktop/INADS_Data/Data/"
train_file = os.path.abspath(os.path.join(dataset_path, "train_set_fixed.csv"))
test_file = os.path.abspath(os.path.join(dataset_path, "test_set_fixed.csv"))

print("Train Dataset Path:", train_file)
print("Test Dataset Path:", test_file)

print("Train File Exists:", os.path.exists(train_file))
print("Test File Exists:", os.path.exists(test_file))

Train Dataset Path: /Users/akashthanneeru/Desktop/INADS_Data/Data/train_set_fixed.csv
Test Dataset Path: /Users/akashthanneeru/Desktop/INADS_Data/Data/test_set_fixed.csv
Train File Exists: True
Test File Exists: True


In [15]:
with open("/Users/akashthanneeru/Desktop/INADS_Data/Data/train_set_fixed.csv", "r", encoding="utf-8") as f:
    print(f.readline())  # Print first line to confirm readability

with open("/Users/akashthanneeru/Desktop/INADS_Data/Data/test_set_fixed.csv", "r", encoding="utf-8") as f:
    print(f.readline())  # Print first line to confirm readability

Bwd Pkt Len Mean,Flow IAT Mean,Fwd Pkt Len Mean,Flow IAT Std,TotLen Fwd Pkts,Flow Duration,Bwd Pkts/s,Flow Pkts/s,Fwd IAT Std,Flow Byts/s,Fwd Pkt Len Max,Flow IAT Max,Init Fwd Win Byts,Fwd Seg Size Min,Dst Port,Label

Bwd Pkt Len Mean,Flow IAT Mean,Fwd Pkt Len Mean,Flow IAT Std,TotLen Fwd Pkts,Flow Duration,Bwd Pkts/s,Flow Pkts/s,Fwd IAT Std,Flow Byts/s,Fwd Pkt Len Max,Flow IAT Max,Init Fwd Win Byts,Fwd Seg Size Min,Dst Port,Label



In [19]:
import os

train_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/train_set_fixed.csv"
test_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/test_set_fixed.csv"

# Check file permissions
print("Train File Permissions:", oct(os.stat(train_file).st_mode)[-3:])
print("Test File Permissions:", oct(os.stat(test_file).st_mode)[-3:])

Train File Permissions: 644
Test File Permissions: 644


In [21]:
import os

dataset_path = "/Users/akashthanneeru/Desktop/INADS_Data/Data/"
print("Jupyter Directory Listing:")
print(os.listdir(dataset_path))

Jupyter Directory Listing:
['Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv', 'DoS_Attacks_Filtered.csv', 'Merged-Dataset-Final.csv', 'feature_extraction_final.csv', 'test_set_fixed.csv', 'Benign_Traffic.csv', 'Friday-16-02-2018_TrafficForML_CICFlowMeter.csv', 'train_set_fixed.csv', 'Friday-23-02-2018_TrafficForML_CICFlowMeter.csv']


In [23]:
import pandas as pd

train_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/train_set_fixed.csv"
test_file = "/Users/akashthanneeru/Desktop/INADS_Data/Data/test_set_fixed.csv"

# Force Pandas to read by explicitly opening the file
with open(train_file, "r", encoding="utf-8") as f:
    train_df = pd.read_csv(f)
    
with open(test_file, "r", encoding="utf-8") as f:
    test_df = pd.read_csv(f)

print("Train Dataset Shape:", train_df.shape)
print("Test Dataset Shape:", test_df.shape)

Train Dataset Shape: (1618172, 16)
Test Dataset Shape: (404543, 16)
