In [1]:
# Colab only: uncomment the two lines below to mount Google Drive before running
# the rest of the notebook; the CSV loaded later is read from under /content/drive.
# NOTE(review): the recorded output of this cell ("Mounted at /content/drive")
# was presumably produced while these lines were still enabled.
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import ast

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Read the per-capture feature table from the mounted Google Drive folder.
csv_path = '/content/drive/My Drive/FinalProject/output.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Pcap File,First Packet Sizes,Total Packets,Total Bytes,Bits per Peak,Mean Packet Size,Variance Packet Size,Skewness Packet Size,Mean Inter-arrival Time,Variance Inter-arrival Time,Skewness Inter-arrival Time,Bandwidth (bytes/sec),Packets per Second,Flow Duration (sec),Source IPs,Destination IPs,Protocols,TTL Values,Label
0,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[1292, 1292, 1292, 1292, 54]",1866,537278,40928,287.930332,307273.492195,3.575812,0.019493,0.00545,7.21948,14779.132188,51.328848,36.353826,75,80,"SSL,MDNS,TLSv1.2,TCP,IGMPv2,DNS,TLSv1,TLSv1.3,...","48,54,36,240,231,233,39,226,98,228,40,215,47,4...",browsing
1,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[66, 66, 66, 66, 66]",3306,680312,46072,205.781004,191073.617382,4.704461,0.010998,0.006896,19.229919,18715.692057,90.949561,36.349818,79,84,"SSDP,SSL,TLSv1.2,TCP,DNS,TLSv1,TLSv1.3,QUIC","48,237,54,36,240,231,233,39,226,228,215,47,46,...",browsing
2,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[58, 54, 56, 83, 58]",3095,730200,45088,235.928918,238234.239936,4.162539,0.011137,0.002771,8.916954,21192.026226,89.823776,34.456356,72,74,"SSDP,SSL,TLSv1.2,TCP,IGMPv2,DNS,TLSv1,TLSv1.3,...","48,54,240,231,233,39,35,228,223,43,33,38,235,1...",browsing
3,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[1292, 1292, 1292, 1292, 66]",2157,1064778,115632,493.638387,605882.641902,5.398308,0.015901,0.011349,17.268042,31059.829182,62.920206,34.281515,44,44,"SSL,TCP,TLSv1.2,DNS,TLSv1,UDP,TLSv1.3,QUIC","36,240,233,39,226,228,46,43,33,236,38,128,56,2...",browsing
4,Data\browsing\bbc.com\delay --time 200ms\2024-...,"[54, 54, 54, 54, 54]",1556,547284,41432,351.724936,366038.629114,2.971549,0.02527,0.011533,7.66376,13927.406545,39.597439,39.295471,64,67,"SSDP,SSL,TLSv1.2,TCP,DNS,TLSv1,TLSv1.3,QUIC","237,54,36,240,231,233,39,35,47,43,33,225,38,12...",browsing


**Data Preprocessing**

In [5]:
# --- Drop columns that must not be used as features --------------------------
# 'Pcap File' is the capture path; in the preview rows it contains the
# traffic-class folder name, so it cannot be a model input.
df = df.drop(columns=['Pcap File'])
# 'Unnamed: 0' looks like the row index saved with the CSV (0, 1, 2, ... in the
# preview). The rows are ordered by capture folder, so keeping it would let the
# models read the label off the row position. errors='ignore' keeps this cell
# working if the CSV is re-exported without the index column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# --- Turn string-encoded columns into numbers --------------------------------
# Protocols / TTL Values are comma-separated strings; use the count of entries.
df['Protocols'] = df['Protocols'].apply(lambda x: len(x.split(',')))  # Number of protocols used
df['TTL Values'] = df['TTL Values'].apply(lambda x: len(x.split(',')))  # Number of TTL values observed

# 'First Packet Sizes' is a list literal stored as text, e.g. "[66, 66, 66]".
# ast.literal_eval parses literals only, so arbitrary code in the CSV is never
# executed (unlike eval). The feature is the mean of the listed sizes.
df['First Packet Sizes'] = df['First Packet Sizes'].apply(lambda x: np.mean(ast.literal_eval(x)))

# --- Encode the target --------------------------------------------------------
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

# X: features, y: labels
X = df.drop(columns=['Label'])
y = df['Label']

# --- Split first, then scale --------------------------------------------------
# Split the dataset (80% train, 20% test) before fitting the scaler so that no
# statistics of the held-out rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using the mean/variance of the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset scaled with the training-set statistics; kept under its
# original name for cells that still reference X_scaled.
X_scaled = scaler.transform(X)

**Model Building**

In [6]:
# Level-0 learners of the stack: two tree ensembles (seeded for repeatability)
# and a 5-nearest-neighbour classifier.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)
base_models = [('rf', rf_clf), ('gb', gb_clf), ('knn', knn_clf)]

# Level-1 learner that combines the base models' predictions.
meta_model = LogisticRegression()

# The meta-model is trained on 5-fold cross-validated base-model predictions.
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

**Model Training**

In [7]:
# Fit the base models and the meta-model on the training split.
stacking_clf.fit(X=X_train, y=y_train)

**Model Evaluation**

In [8]:
# Score the fitted stack on the held-out split.
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class precision/recall/F1, labelled with the original class names.
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.9848

Classification Report:
              precision    recall  f1-score   support

         VOD       1.00      0.95      0.98        22
    browsing       0.98      1.00      0.99        44

    accuracy                           0.98        66
   macro avg       0.99      0.98      0.98        66
weighted avg       0.99      0.98      0.98        66



**Cross-validation on the dataset**

In [9]:
# Cross-validate on the unscaled features with the scaler inside the pipeline:
# cross_val_score clones and re-fits the whole pipeline per fold, so each
# validation fold is standardized with statistics from its training folds only
# (scaling the full dataset up front leaks validation-fold statistics).
cv_pipeline = make_pipeline(StandardScaler(), stacking_clf)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Cross-validation Accuracy: 0.9848 ± 0.0136
