In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# opened by ordinary filesystem paths later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the input table from the mounted Drive folder.
# An alternative input in the same folder, 'output2.csv', is currently not used.
csv_path = '/content/drive/My Drive/FinalProject/behavioral_output2.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows of the loaded table to sanity-check columns and values.
df.head()

Unnamed: 0,Pcap File,Label,Session Duration,Packet Count,Byte Count,HTTP Count,HTTPS Count,FTP Count,Avg Packet Inter-Arrival Time,Flow Size,TCP SYN Count,TCP ACK Count,MIME Types,URL Patterns,Burstiness,Protocol Diversity,Steady Traffic,Volume Intensity
0,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.353826,1866,537278,0,505,0,0.019482,1866,153,1609,[],[],1,0.0,1,14779.132188
1,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,36.349818,3306,680312,0,510,0,0.010995,3306,394,2788,[],[],1,0.0,1,18715.692057
2,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.456356,3095,730200,0,555,0,0.011133,3095,319,2731,[],[],1,0.0,1,21192.026226
3,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,34.281515,2157,1064778,0,620,0,0.015893,2157,127,1819,[],[],1,0.0,1,31059.829182
4,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,browsing,41.467427,20922,14689837,0,2520,0,0.001982,20922,443,7479,[],[],1,0.0,1,354250.02376


In [5]:
label_column = 'Label'

# Keep only numeric columns as model inputs, excluding the label itself.
# Testing for a numeric dtype (rather than "dtype is not object") also filters
# out string-, categorical- and datetime-typed columns: those are not
# object-typed, so the old check let them through, yet they cannot be
# standardized or fed to the classifier.
feature_columns = [
    col for col in df.columns
    if col != label_column and pd.api.types.is_numeric_dtype(df[col])
]

# X: feature matrix restricted to the selected columns; y: target labels.
X = df[feature_columns]
y = df[label_column]

In [6]:
# Standardize each feature column (zero mean, unit variance with the
# scaler's default settings).
# NOTE(review): the scaler is fit on all rows of X, i.e. before any
# train/test partitioning, so its statistics are computed over every sample.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Hold out 30% of the rows for evaluation, stratified on the label so class
# proportions are preserved in both partitions. The split is taken on the raw
# (unscaled) features X so the scaling statistics below never see test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply that same
# transformation to the test partition. Fitting on all rows would leak the
# test set's means and variances into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [8]:
# Train a 100-tree random forest on the training partition; the fixed seed
# keeps the fitted model reproducible across runs.
forest_params = {'n_estimators': 100, 'random_state': 42}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [9]:
# Predict a class label for every held-out sample; used by the metrics below.
y_pred = clf.predict(X_test)

In [10]:
# Fraction of held-out samples whose predicted label equals the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 96.65%


In [11]:
# Per-class precision, recall, F1 and support on the held-out set,
# printed under a heading line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
               precision    recall  f1-score   support

          VOD       0.96      0.97      0.96        67
     browsing       0.97      0.97      0.97       152
file download       0.96      0.94      0.95        50

     accuracy                           0.97       269
    macro avg       0.96      0.96      0.96       269
 weighted avg       0.97      0.97      0.97       269



In [12]:
# Confusion matrix on the held-out set: rows are true classes, columns are
# predicted classes, both in sorted label order.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Confusion Matrix:
[[ 65   1   1]
 [  3 148   1]
 [  0   3  47]]


In [13]:
# Pair each feature name with the forest's impurity-based importance score
# and list them from most to least important.
importances = clf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

                          Feature  Importance
4                     HTTPS Count    0.160590
0                Session Duration    0.147702
8                   TCP SYN Count    0.139675
9                   TCP ACK Count    0.123501
13               Volume Intensity    0.077951
1                    Packet Count    0.074758
6   Avg Packet Inter-Arrival Time    0.062861
12                 Steady Traffic    0.062464
2                      Byte Count    0.061867
7                       Flow Size    0.049360
3                      HTTP Count    0.018441
11             Protocol Diversity    0.017110
10                     Burstiness    0.003719
5                       FTP Count    0.000000
