In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that later
# cells can read project files through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the behavioral feature CSV from the mounted Drive folder.
# Alternative input, currently disabled:
#   /content/drive/My Drive/FinalProject/output2.csv
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/FinalProject/behavioral_output2.csv'
)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Pcap File,Label,Session Duration,Packet Count,Byte Count,HTTP Count,HTTPS Count,FTP Count,Avg Packet Inter-Arrival Time,Flow Size,TCP SYN Count,TCP ACK Count,MIME Types,URL Patterns,Burstiness,Protocol Diversity,Steady Traffic,Volume Intensity
0,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.353826,1866,537278,0,505,0,0.019482,1866,153,1609,[],[],1,0.00198,1,14779.132188
1,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.349818,3306,680312,0,510,0,0.010995,3306,394,2788,[],[],1,0.001961,1,18715.692057
2,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.456356,3095,730200,0,555,0,0.011133,3095,319,2731,[],[],1,0.001802,1,21192.026226
3,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.281515,2157,1064778,0,620,0,0.015893,2157,127,1819,[],[],1,0.001613,1,31059.829182
4,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,41.467427,20922,14689837,0,2520,0,0.001982,20922,443,7479,[],[],1,0.000397,1,354250.02376


In [5]:
label_column = 'Label'

# Use every numeric (or boolean) column except the label as a model feature.
# Selecting by dtype family instead of testing `dtype != 'object'` keeps text
# columns out even when pandas stores them with a dedicated string dtype, and
# also excludes datetime and categorical columns, which cannot be standardized.
# Column order is preserved, so feature indices stay stable.
feature_columns = (
    df.drop(columns=[label_column])
    .select_dtypes(include=['number', 'bool'])
    .columns
    .tolist()
)

# X: feature matrix (one column per selected feature); y: class label per row.
X = df[feature_columns]
y = df[label_column]

In [6]:
# Re-display the head of df. The feature/label selection cell only reads from
# df, so this shows the same rows as the earlier preview.
df.head(n=5)

Unnamed: 0,Pcap File,Label,Session Duration,Packet Count,Byte Count,HTTP Count,HTTPS Count,FTP Count,Avg Packet Inter-Arrival Time,Flow Size,TCP SYN Count,TCP ACK Count,MIME Types,URL Patterns,Burstiness,Protocol Diversity,Steady Traffic,Volume Intensity
0,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.353826,1866,537278,0,505,0,0.019482,1866,153,1609,[],[],1,0.00198,1,14779.132188
1,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.349818,3306,680312,0,510,0,0.010995,3306,394,2788,[],[],1,0.001961,1,18715.692057
2,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.456356,3095,730200,0,555,0,0.011133,3095,319,2731,[],[],1,0.001802,1,21192.026226
3,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.281515,2157,1064778,0,620,0,0.015893,2157,127,1819,[],[],1,0.001613,1,31059.829182
4,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,41.467427,20922,14689837,0,2520,0,0.001982,20922,443,7479,[],[],1,0.000397,1,354250.02376


In [7]:
# Standardize each feature column to zero mean and unit variance.
# NOTE(review): the scaler statistics are estimated from every row of X,
# including rows that are later assigned to the test partition.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [8]:
# Hold out 30% of the rows for testing, stratified by label so both partitions
# keep the same class proportions. The split is made on the unscaled features
# so that the scaler fitted below only ever sees training rows. With the same
# random_state and stratify argument the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the standardization on the training partition only and reuse those
# statistics for the test partition; fitting on all rows would leak test-set
# means/variances into the preprocessing step.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [9]:
# Train a 100-tree random forest on the training partition, with a fixed
# random_state so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [10]:
# Predict a class label for every row of the held-out test partition.
y_pred = clf.predict(X=X_test)

In [11]:
# Share of test rows whose predicted label matches the true label,
# printed as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.62%


In [12]:
# Per-class precision, recall, F1 and support on the test partition.
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Classification Report:
               precision    recall  f1-score   support

          VOD       0.97      0.99      0.98       152
     browsing       1.00      0.97      0.98       159
file download       0.99      1.00      1.00       123

     accuracy                           0.99       434
    macro avg       0.99      0.99      0.99       434
 weighted avg       0.99      0.99      0.99       434



In [13]:
# Counts of true class (rows) versus predicted class (columns) on the test
# partition.
print(
    "Confusion Matrix:",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Confusion Matrix:
[[151   0   1]
 [  5 154   0]
 [  0   0 123]]


In [14]:
# Rank the features by the forest's impurity-based importance, highest first.
# The original row index is kept, so it shows each feature's position in
# feature_columns.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                          Feature  Importance
0                Session Duration    0.214334
8                   TCP SYN Count    0.133627
2                      Byte Count    0.115369
7                       Flow Size    0.108924
9                   TCP ACK Count    0.079645
1                    Packet Count    0.076141
12                 Steady Traffic    0.071903
4                     HTTPS Count    0.056883
11             Protocol Diversity    0.049757
13               Volume Intensity    0.043069
6   Avg Packet Inter-Arrival Time    0.030210
3                      HTTP Count    0.019346
10                     Burstiness    0.000791
5                       FTP Count    0.000000
