In [27]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from this notebook.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [29]:
# Read the feature CSV from the mounted Drive into a DataFrame.
DATA_CSV_PATH = '/content/drive/My Drive/FinalProject/output.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [30]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Pcap File,First Packet Sizes,Total Packets,Total Bytes,Bits per Peak,Mean Packet Size,Variance Packet Size,Skewness Packet Size,Mean Inter-arrival Time,Variance Inter-arrival Time,Skewness Inter-arrival Time,Bandwidth (bytes/sec),Packets per Second,Flow Duration (sec),Source IPs,Destination IPs,Protocols,TTL Values,Label
0,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[1292, 1292, 1292, 1292, 54]",1866,537278,40928,287.930332,307273.492195,3.575812,0.019493,0.00545,7.21948,14779.132188,51.328848,36.353826,75,80,"SSL,MDNS,TLSv1.2,TCP,IGMPv2,DNS,TLSv1,TLSv1.3,...","48,54,36,240,231,233,39,226,98,228,40,215,47,4...",browsing
1,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[66, 66, 66, 66, 66]",3306,680312,46072,205.781004,191073.617382,4.704461,0.010998,0.006896,19.229919,18715.692057,90.949561,36.349818,79,84,"SSDP,SSL,TLSv1.2,TCP,DNS,TLSv1,TLSv1.3,QUIC","48,237,54,36,240,231,233,39,226,228,215,47,46,...",browsing
2,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[58, 54, 56, 83, 58]",3095,730200,45088,235.928918,238234.239936,4.162539,0.011137,0.002771,8.916954,21192.026226,89.823776,34.456356,72,74,"SSDP,SSL,TLSv1.2,TCP,IGMPv2,DNS,TLSv1,TLSv1.3,...","48,54,240,231,233,39,35,228,223,43,33,38,235,1...",browsing
3,Data\browsing\bbc.com\corrupt --rate 5%\2024-0...,"[1292, 1292, 1292, 1292, 66]",2157,1064778,115632,493.638387,605882.641902,5.398308,0.015901,0.011349,17.268042,31059.829182,62.920206,34.281515,44,44,"SSL,TCP,TLSv1.2,DNS,TLSv1,UDP,TLSv1.3,QUIC","36,240,233,39,226,228,46,43,33,236,38,128,56,2...",browsing
4,Data\browsing\bbc.com\delay --time 200ms\2024-...,"[54, 54, 54, 54, 54]",1556,547284,41432,351.724936,366038.629114,2.971549,0.02527,0.011533,7.66376,13927.406545,39.597439,39.295471,64,67,"SSDP,SSL,TLSv1.2,TCP,DNS,TLSv1,TLSv1.3,QUIC","237,54,36,240,231,233,39,35,47,43,33,225,38,12...",browsing


In [31]:
# Build the model inputs from the loaded frame WITHOUT overwriting `df`, so this
# cell is safe to re-run. Re-assigning `df` made a second execution fail with a
# KeyError on the already-dropped columns, and would have re-fitted the encoder
# on integer codes, losing the original class names.
NON_FEATURE_COLUMNS = ["Pcap File", "Protocols", "TTL Values", "First Packet Sizes"]

label_encoder = LabelEncoder()
df_clean = (
    df.drop(columns=NON_FEATURE_COLUMNS)  # non-numeric and irrelevant columns
    .dropna()  # discard rows that contain any null value
    # Replace the string labels with integer codes; the encoder is fitted only
    # on the rows that survived dropna(), exactly as before.
    .assign(Label=lambda frame: label_encoder.fit_transform(frame["Label"]))
)

# Splitting features and target
X = df_clean.drop(columns=["Label"])  # Features
y = df_clean["Label"]  # Target (integer-encoded)

In [32]:
# Sanity check: report the dimensions of the feature matrix and target vector.
for _caption, _data in (("X shape:", X), ("y shape:", y)):
    print(_caption, _data.shape)

X shape: (1444, 14)
y shape: (1444,)


In [33]:
# One shared seed keeps both the split and the forest reproducible.
SEED = 42
TEST_FRACTION = 0.3  # share of rows held out for evaluation (adjustable)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SEED,
)

# Random forest with 100 trees; it is fitted in a later cell.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=SEED)



In [34]:
# Diagnostics on the training features: missing-value counts and dtypes.
_na_mask = X_train.isna()

# Null count for every column.
print(_na_mask.sum())

# Null counts for the two packet-size moment columns, shown on their own.
for _column in ('Variance Packet Size', 'Skewness Packet Size'):
    print(X_train[[_column]].isna().sum())

# Rows that contain at least one null value.
print(X_train[_na_mask.any(axis=1)])

# Data type of each feature in X_train.
print(X_train.dtypes)



Total Packets                  0
Total Bytes                    0
Bits per Peak                  0
Mean Packet Size               0
Variance Packet Size           0
Skewness Packet Size           0
Mean Inter-arrival Time        0
Variance Inter-arrival Time    0
Skewness Inter-arrival Time    0
Bandwidth (bytes/sec)          0
Packets per Second             0
Flow Duration (sec)            0
Source IPs                     0
Destination IPs                0
dtype: int64
Variance Packet Size    0
dtype: int64
Skewness Packet Size    0
dtype: int64
Empty DataFrame
Columns: [Total Packets, Total Bytes, Bits per Peak, Mean Packet Size, Variance Packet Size, Skewness Packet Size, Mean Inter-arrival Time, Variance Inter-arrival Time, Skewness Inter-arrival Time, Bandwidth (bytes/sec), Packets per Second, Flow Duration (sec), Source IPs, Destination IPs]
Index: []
Total Packets                    int64
Total Bytes                      int64
Bits per Peak                    int64
Mean Packet S

In [35]:
# Train the random forest on the training partition.
rf_classifier.fit(X=X_train, y=y_train)

In [36]:
# Predicted (integer-encoded) labels for the held-out rows.
y_pred = rf_classifier.predict(X=X_test)

In [37]:
# Overall accuracy: fraction of test rows whose prediction matches the truth.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9816


In [38]:
# Detailed classification report (per-class precision / recall / F1).
# LabelEncoder assigns codes 0..n_classes-1 in the order of `classes_`, so the
# codes are passed explicitly as `labels`. This keeps every name in
# `target_names` paired with its own code, and avoids a length-mismatch error
# if some class is absent from both y_test and y_pred.
class_ids = list(range(len(label_encoder.classes_)))

print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
)


Classification Report:
               precision    recall  f1-score   support

          VOD       0.96      0.99      0.98       156
     browsing       0.99      0.97      0.98       153
file download       1.00      0.98      0.99       125

     accuracy                           0.98       434
    macro avg       0.98      0.98      0.98       434
 weighted avg       0.98      0.98      0.98       434



In [39]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# Passing the encoder's codes as `labels` fixes the matrix to
# n_classes x n_classes in the order of `label_encoder.classes_`, even if a
# class is missing from both y_test and y_pred (otherwise the matrix shrinks
# and its rows/columns no longer line up with the class names).
class_ids = list(range(len(label_encoder.classes_)))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

Confusion Matrix:
[[155   1   0]
 [  5 148   0]
 [  1   1 123]]
