In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Path to the dataset CSV; adjust it to match where the dataset is mounted.
file_path = '/kaggle/input/svmdata/Tuesday-WorkingHours.pcap_ISCX.csv'  # Update this path

# Load the dataset into a DataFrame
df = pd.read_csv(file_path)

# Preview the first few rows to see what the data looks like
print(df.head())

# Summarise dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it must not be wrapped in print() --
# doing so appends a stray "None" line after the report.
df.info()



    Destination Port   Flow Duration   Total Fwd Packets  \
0                 88             640                   7   
1                 88             900                   9   
2                 88            1205                   7   
3                 88             511                   7   
4                 88             773                   9   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        4                          440   
1                        4                          600   
2                        4                         2776   
3                        4                          452   
4                        4                          612   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                           358                     220   
1                          2944                     300   
2                          2830                    1388   
3                           370                 

In [2]:

# Tally the null entries found in every column.
missing_values_count = df.isnull().sum()

# Keep only the columns whose null tally is above zero.
columns_with_missing_values = missing_values_count.loc[missing_values_count.gt(0)]

# Report the affected columns, or state that there are none.
if not columns_with_missing_values.empty:
    print("Columns with missing values and their counts:")
    print(columns_with_missing_values)
else:
    print("No missing values in the dataset.")

Columns with missing values and their counts:
Flow Bytes/s    201
dtype: int64


In [3]:
# Remove the 'Flow Bytes/s' column from the DataFrame entirely.
df = df.drop(columns=['Flow Bytes/s'])

# Re-tally null entries per column now that the column is gone.
missing_values_count_after_drop = df.isnull().sum()

# Keep only the columns that still contain nulls, if any.
columns_with_missing_values_after_drop = missing_values_count_after_drop.loc[
    missing_values_count_after_drop.gt(0)
]

# Report whatever is left, or confirm that the frame is now free of nulls.
if not columns_with_missing_values_after_drop.empty:
    print("Columns with missing values and their counts after dropping the column:")
    print(columns_with_missing_values_after_drop)
else:
    print("No missing values in the dataset after dropping the column.")


No missing values in the dataset after dropping the column.


In [4]:
# Tally NaN entries for every column.
nan_values_count = df.isna().sum()

# Keep only the columns whose NaN tally is above zero.
columns_with_nan_values = nan_values_count.loc[nan_values_count.gt(0)]

# Report the affected columns, or state that there are none.
if not columns_with_nan_values.empty:
    print("Columns with NaN values and their counts:")
    print(columns_with_nan_values)
else:
    print("No NaN values in the dataset.")


No NaN values in the dataset.


In [5]:
# Build the header with leading/trailing whitespace removed from each name,
# then install it on the DataFrame.
stripped_column_names = df.columns.str.strip()
df.columns = stripped_column_names

# Show the resulting column names for verification.
print(df.columns)


Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Packets/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
       'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length',
       'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
       'RST Flag 

In [6]:
import numpy as np

# Restrict the check to numeric columns; infinities cannot occur in the others.
numeric_df = df.select_dtypes(include=[np.number])

# Count +inf / -inf entries per column. isin() flags only the infinite values,
# so any NaN present in the frame is NOT miscounted as an infinity (replacing
# inf with NaN and then counting isna() would lump the two together).
inf_values_count = numeric_df.isin([np.inf, -np.inf]).sum()

# Keep only the columns that contain at least one infinite value.
columns_with_inf_values = inf_values_count[inf_values_count > 0]

# Report the affected columns, or state that there are none.
if columns_with_inf_values.empty:
    print("No infinite values in the dataset.")
else:
    print("Columns with infinite values and their counts:")
    print(columns_with_inf_values)


Columns with infinite values and their counts:
Flow Packets/s    264
dtype: int64


In [7]:
# Drop every column flagged as containing infinite values. errors='ignore'
# keeps the cell re-runnable: once the columns are gone, running it again is a
# no-op instead of a KeyError.
df = df.drop(columns=columns_with_inf_values.index, errors='ignore')

# Verify the columns have been dropped by displaying the updated DataFrame's columns
print("Columns after removing those with infinite values:")
print(df.columns)

# Recheck the numeric columns. isin([inf, -inf]) counts only infinite entries,
# so NaN values are never misreported as infinities.
numeric_df = df.select_dtypes(include=[np.number])
new_inf_values_count = numeric_df.isin([np.inf, -np.inf]).sum()
new_columns_with_inf_values = new_inf_values_count[new_inf_values_count > 0]

# Display the results
if new_columns_with_inf_values.empty:
    print("No infinite values in the dataset after removing the problematic columns.")
else:
    print("Columns with infinite values and their counts after rechecking:")
    print(new_columns_with_inf_values)


Columns after removing those with infinite values:
Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Mean', 'Flow IAT Std',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean',
       'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'S

In [8]:
# Frequency of each distinct value in the 'Label' column.
label_counts = df.loc[:, 'Label'].value_counts()

# Emit the heading and the frequency table, each on its own line.
print("Unique labels and their counts:", label_counts, sep="\n")



Unique labels and their counts:
Label
BENIGN         432074
FTP-Patator      7938
SSH-Patator      5897
Name: count, dtype: int64


In [9]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct label value to a numeric code.
label_encoder = LabelEncoder()

# Fit on the 'Label' column and encode it in one step, then write the codes
# back into the same column as float64.
encoded_labels = label_encoder.fit_transform(df['Label'])
df['Label'] = encoded_labels.astype('float64')

# Show which codes are now present in the column.
print("Unique encoded labels:", df['Label'].unique())


Unique encoded labels: [0. 1. 2.]


In [10]:
from sklearn.model_selection import train_test_split

# Separate the target column (y) from the remaining feature columns (X).
y = df['Label']
X = df.drop(columns=['Label'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Report the dimensions of both partitions.
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)


Training set shape (X_train, y_train): (312136, 76) (312136,)
Testing set shape (X_test, y_test): (133773, 76) (133773,)


In [13]:
# Feature columns used for both types of attacks.
common_features = [
    'Init_Win_bytes_forward',
    'Subflow Fwd Bytes',
    'Total Length of Fwd Packets',
    'ACK Flag Count',
    'Fwd PSH Flags',
    'SYN Flag Count',
    'Fwd Packets/s',
]

# Restrict the training and testing feature matrices to those columns.
X_train_common = X_train.loc[:, common_features]
X_test_common = X_test.loc[:, common_features]


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the decision tree classifier.
# NOTE: scikit-learn's DecisionTreeClassifier is an optimised CART
# implementation, not Weka's J48 (C4.5). The "j48" variable names and report
# labels are kept unchanged so existing references and outputs still line up.
# random_state is fixed (same seed as the train/test split) because the split
# search permutes features randomly; without a seed, repeated runs can produce
# slightly different trees and metrics.
j48_model = DecisionTreeClassifier(random_state=0)

# Train the model on the reduced feature set
j48_model.fit(X_train_common, y_train)

# Predict labels for the held-out rows
y_pred_j48 = j48_model.predict(X_test_common)

# Overall accuracy on the test set
accuracy_j48 = accuracy_score(y_test, y_pred_j48)
print("Accuracy (J48):", accuracy_j48)

# Per-class precision/recall/F1 and the confusion matrix; with labels this
# imbalanced, these say more than accuracy alone.
print("\nClassification Report (J48):\n", classification_report(y_test, y_pred_j48))
print("\nConfusion Matrix (J48):\n", confusion_matrix(y_test, y_pred_j48))


Accuracy (J48): 0.9994767254976714

Classification Report (J48):
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    129493
         1.0       0.99      1.00      0.99      2427
         2.0       0.99      0.99      0.99      1853

    accuracy                           1.00    133773
   macro avg       0.99      1.00      0.99    133773
weighted avg       1.00      1.00      1.00    133773


Confusion Matrix (J48):
 [[129449     24     20]
 [     5   2422      0]
 [    21      0   1832]]
