In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
plt.style.use('ggplot')

In [2]:
df = pd.read_csv('/kaggle/input/tii-ssrc-23/csv/data.csv')

**Data Exploration**

In [5]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Traffic Type,Traffic Subtype
0,192.168.1.90-192.168.1.3-53930-64738-6,192.168.1.90,53930.0,192.168.1.3,64738,6.0,01/01/1970 07:41:46 AM,52601173.0,1701.0,1793.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Audio,Audio
1,192.168.1.3-192.168.1.90-64738-37700-6,192.168.1.3,64738.0,192.168.1.90,37700,6.0,01/01/1970 07:41:46 AM,119106942.0,36.0,57.0,...,3416174.0,19996926.0,14078617.0,5001511.0,1737.400069,5003516.0,5000449.0,Benign,Audio,Audio
2,192.168.1.3-192.168.1.90-22-40854-6,192.168.1.3,22.0,192.168.1.90,40854,6.0,01/01/1970 07:41:46 AM,5589.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Audio,Audio
3,192.168.1.70-192.168.1.3-55422-64738-6,192.168.1.70,55422.0,192.168.1.3,64738,6.0,01/01/1970 07:41:47 AM,118166562.0,3932.0,4196.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Audio,Audio
4,192.168.1.90-192.168.1.3-59658-64738-17,192.168.1.90,59658.0,192.168.1.3,64738,17.0,01/01/1970 07:41:50 AM,119988385.0,25.0,6795.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Audio,Audio


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8656767 entries, 0 to 8656766
Data columns (total 86 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Flow ID                     object 
 1   Src IP                      object 
 2   Src Port                    float64
 3   Dst IP                      object 
 4   Dst Port                    int64  
 5   Protocol                    float64
 6   Timestamp                   object 
 7   Flow Duration               float64
 8   Total Fwd Packet            float64
 9   Total Bwd packets           float64
 10  Total Length of Fwd Packet  float64
 11  Total Length of Bwd Packet  float64
 12  Fwd Packet Length Max       float64
 13  Fwd Packet Length Min       float64
 14  Fwd Packet Length Mean      float64
 15  Fwd Packet Length Std       float64
 16  Bwd Packet Length Max       float64
 17  Bwd Packet Length Min       float64
 18  Bwd Packet Length Mean      float64
 19  Bwd Packet Length Std

In [5]:
df.isnull().sum()

Flow ID            0
Src IP             0
Src Port           0
Dst IP             0
Dst Port           0
                  ..
Idle Max           0
Idle Min           0
Label              0
Traffic Type       0
Traffic Subtype    0
Length: 86, dtype: int64

In [4]:
df.dropna(inplace=True)

In [5]:
df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

In [None]:
df['Label'].value_counts()

Balance the dataset

In [6]:
benign_df = df[df['Label'] == 'Benign']
malicious_df = df[df['Label'] == 'Malicious']

malicious_downsampled = malicious_df.sample(frac=0.25, random_state=42)

reduced_df = pd.concat([benign_df, malicious_downsampled])

reduced_df['Label'].value_counts()

Label
Malicious    2163866
Benign          1301
Name: count, dtype: int64

In [7]:
del df

In [8]:
df_encoded = reduced_df.join(pd.get_dummies(reduced_df['Label'], prefix='Label'))

In [9]:
df_encoded = df_encoded.drop(['Flow ID', 'Src IP', 'Traffic Type', 'Traffic Subtype', 'Dst IP','Timestamp'], axis=1)

In [10]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2165167 entries, 0 to 2813507
Data columns (total 82 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Src Port                    float64
 1   Dst Port                    int64  
 2   Protocol                    float64
 3   Flow Duration               float64
 4   Total Fwd Packet            float64
 5   Total Bwd packets           float64
 6   Total Length of Fwd Packet  float64
 7   Total Length of Bwd Packet  float64
 8   Fwd Packet Length Max       float64
 9   Fwd Packet Length Min       float64
 10  Fwd Packet Length Mean      float64
 11  Fwd Packet Length Std       float64
 12  Bwd Packet Length Max       float64
 13  Bwd Packet Length Min       float64
 14  Bwd Packet Length Mean      float64
 15  Bwd Packet Length Std       float64
 16  Flow Bytes/s                float64
 17  Flow Packets/s              float64
 18  Flow IAT Mean               float64
 19  Flow IAT Std              

In [13]:
benign_corr = df_encoded.drop(['Label', 'Label_Malicious'], axis=1).corr()['Label_Benign'][:-1]

# Calculate correlations for 'Malicious'
malicious_corr = df_encoded.drop(['Label', 'Label_Benign'], axis=1).corr()['Label_Malicious'][:-1]

# Visualization
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

# Add Bar chart for Benign correlations
fig.append_trace(
    go.Bar(
        name='Benign',
        x=benign_corr.index,
        y=benign_corr.values,
        marker=dict(color='blue')
    ),
    row=1, col=1
)

# Add Bar chart for Malicious correlations
fig.append_trace(
    go.Bar(
        name='Malicious',
        x=malicious_corr.index,
        y=malicious_corr.values,
        marker=dict(color='red')
    ),
    row=2, col=1
)

# Update layout
fig.update_layout(
    title_text="Feature Correlations with Target Labels (Reduced Malicious Samples)",
    template='ggplot2',
    height=800,
    xaxis_title="Features",
    yaxis_title="Correlation Coefficient",
    showlegend=False
)

fig.show(renderer='iframe_connected')

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Train the model
model = RandomForestClassifier()
model.fit(df_encoded.drop(['Label', 'Label_Benign','Label_Malicious'], axis=1), df_encoded['Label'])

# Get feature importances
importances = model.feature_importances_

# Convert the importances into a DataFrame
feature_importance_df = pd.DataFrame({
    'Feature': df_encoded.drop(['Label', 'Label_Benign','Label_Malicious'], axis=1).columns,
    'Importance': importances
})

In [19]:
fig = go.Figure(data=[go.Bar(x=feature_importance_df['Feature'], y=feature_importance_df['Importance'])])

# Update layout for better appearance
fig.update_layout(
    title="Feature Importances",
    xaxis_title="Features",
    yaxis_title="Importance",
    template="ggplot2"
)

# Show the plot
fig.show(renderer="iframe_connected")  # Use this renderer to ensure it displays

In [11]:
feature_columns = [
    "Total Fwd Packet",
    "Src Port",
    "Protocol",
    "Bwd Packet/Bulk Avg",
    "Fwd Seg Size Min",
    "Fwd Bytes/Bulk Avg",
    "Fwd Header Length",
    "Fwd Act Data Pkts",
    "Dst Port",
    "Total Length of Fwd Packet"
]
len(feature_columns)

10

In [12]:
X= df_encoded[feature_columns]
y= df_encoded['Label']


# Pre-Processing

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer

scaler = MinMaxScaler()
X= scaler.fit_transform(X)
X

array([[4.67862008e-03, 8.22919051e-01, 3.52941176e-01, ...,
        5.69297583e-02, 9.87838560e-01, 5.37797360e-03],
       [9.63245311e-05, 9.87838560e-01, 3.52941176e-01, ...,
        1.46336858e-03, 5.75265126e-01, 4.59119366e-05],
       [0.00000000e+00, 3.35698482e-04, 3.52941176e-01, ...,
        0.00000000e+00, 6.23392081e-01, 4.44771886e-06],
       ...,
       [2.75212946e-06, 1.07011521e-01, 3.52941176e-01, ...,
        4.72054381e-05, 0.00000000e+00, 3.58687005e-05],
       [0.00000000e+00, 4.74097810e-02, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.79343502e-06],
       [0.00000000e+00, 9.41145953e-01, 3.52941176e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.79343502e-05]])

In [15]:
X.shape

(2165167, 10)

In [14]:
y = y.replace({'Benign':0, 'Malicious':1}).astype(int)
y

  y = y.replace({'Benign':0, 'Malicious':1}).astype(int)


0          0
1          0
2          0
3          0
4          0
          ..
6862270    1
5984285    1
5297148    1
3876910    1
2813507    1
Name: Label, Length: 2165167, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [18]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

def build_model(input_shape, optimizer):
    model = Sequential()
    
    # Input and hidden layers - more gradual reduction in layer sizes
    model.add(Dense(512, activation='relu', input_shape=input_shape))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',  # Changed from sparse_categorical_crossentropy
        metrics=['accuracy']
    )
    return model

optimizer = Adam(learning_rate=0.001)
input_shape = (10,)
model = build_model(input_shape, optimizer)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='loss', patience=6, min_delta=0.0001, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=0.0001)

In [20]:
history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    batch_size=1024,
                    epochs=300,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=0)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix
import plotly.io as pio

def performance_metrics(model, X_test, y_test):
    
    preds = model.predict(X_test)
  
    preds_labels = (preds >= 0.5).astype(int)

    target_names = ['Benign', 'Malicious']
    
    print(classification_report(y_test, preds_labels, target_names=target_names), '\n')

    cf_matrix = confusion_matrix(y_test, preds_labels, normalize='all')
    fig = px.imshow(pd.DataFrame(cf_matrix, columns=target_names, index=target_names), 
          template='ggplot2', title='Confusion Matrix', aspect='auto', text_auto=True, zmin=0,
          zmax=1, labels={'0':target_names[0],'1':target_names[1]})
    pio.show(fig)
    # fig.show(renderer='browser')

In [24]:
performance_metrics(model, X_test, y_test)

[1m13533/13533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step
              precision    recall  f1-score   support

      Benign       0.97      0.93      0.95       248
   Malicious       1.00      1.00      1.00    432786

    accuracy                           1.00    433034
   macro avg       0.99      0.96      0.97    433034
weighted avg       1.00      1.00      1.00    433034
 

