# Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score


# Load Dataset

In [4]:
# STEP 2: Load and Preprocess Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Load CSV, skip header
dataset = pd.read_csv('GCseq25.csv', sep=',', skiprows=1)

# Extract label (SNI) and features (packet size sequence)
y = dataset.iloc[:, 0]             # Column 0 = SNI (label)
X = dataset.iloc[:, 1:]            # Columns 1-100 = packet size values

# Normalize features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Keep only the top N most frequent classes
N = 25  # You can adjust this
top_classes = pd.Series(y).value_counts().nlargest(N).index

# Filter rows that belong to the top N classes
filtered_indices = y.isin(top_classes)
X_top = X_scaled[filtered_indices]
y_top = y[filtered_indices]

# Encode labels (string → integer → one-hot)
label_encoder = LabelEncoder()
y_top_encoded = label_encoder.fit_transform(y_top)
y_top_categorical = to_categorical(y_top_encoded)

# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_top, y_top_categorical, test_size=0.2, random_state=42, stratify=y_top_encoded
)

# Logging
print(f"Filtered to top {N} classes")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"Number of classes: {y_top_categorical.shape[1]}")

# Optional: print class names
print("\nClasses included (Top N SNIs):")
for idx, class_name in enumerate(label_encoder.classes_):
    print(f"Class {idx}: {class_name}")


Filtered to top 25 classes
X_train shape: (5897, 100)
y_train shape: (5897, 25)
Number of classes: 25

Classes included (Top N SNIs):
Class 0: ads.yahoo.com
Class 1: ae.akamai.net
Class 2: assets.adobedtm.com
Class 3: beacon.krxd.net
Class 4: c.betrad.com
Class 5: cdn.nhadatso.com
Class 6: clients.google.com
Class 7: d.adroll.com
Class 8: dt.adsafeprotected.com
Class 9: facebook.com
Class 10: fls.doubleclick.net
Class 11: google.com
Class 12: google.fr
Class 13: gstatic.com
Class 14: ib.adnxs.com
Class 15: l.betrad.com
Class 16: mc.yandex.ru
Class 17: nexus.ensighten.com
Class 18: p.rfihub.com
Class 19: pixel.quantserve.com
Class 20: s.adroll.com
Class 21: secure.adnxs.com
Class 22: ssl.gstatic.com
Class 23: staticxx.facebook.com
Class 24: tags.tiqcdn.com


In [5]:
y_train_rf = np.argmax(y_train, axis=1)
y_test_rf = np.argmax(y_test, axis=1)


# Train Model

Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train_rf)
rf_proba = rf_model.predict_proba(X_test)  # shape: (samples, num_classes)


In [7]:
# Reshape input to (samples, timesteps, features)
# Treat each sample as 100 time steps with 1 feature per step
X_train_seq = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_seq = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

print("X_train_seq shape:", X_train_seq.shape)


X_train_seq shape: (5897, 100, 1)


Deep Learning (LSTM)

In [8]:
# Fitting the model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout

cnn_lstm = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(100, 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

cnn_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

cnn_lstm.fit(X_train_seq, y_train, validation_split=0.1, epochs=20, batch_size=128)
cnn_proba = cnn_lstm.predict(X_test_seq)  # shape: (samples, num_classes)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 113ms/step - accuracy: 0.1368 - loss: 3.1313 - val_accuracy: 0.1898 - val_loss: 2.9145
Epoch 2/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 78ms/step - accuracy: 0.1885 - loss: 2.9620 - val_accuracy: 0.1898 - val_loss: 2.8697
Epoch 3/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 107ms/step - accuracy: 0.2039 - loss: 2.8524 - val_accuracy: 0.2542 - val_loss: 2.5764
Epoch 4/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 85ms/step - accuracy: 0.2914 - loss: 2.5138 - val_accuracy: 0.2881 - val_loss: 2.2195
Epoch 5/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 80ms/step - accuracy: 0.3528 - loss: 2.1904 - val_accuracy: 0.4305 - val_loss: 1.9594
Epoch 6/20
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 106ms/step - accuracy: 0.4031 - loss: 2.0127 - val_accuracy: 0.4729 - val_loss: 1.8338
Epoch 7/20
[1m42/42[0m [32m

In [9]:
# Ensure both predictions are aligned in shape
assert rf_proba.shape == cnn_proba.shape

# Average predicted probabilities
ensemble_proba = (rf_proba + cnn_proba) / 2
ensemble_pred = np.argmax(ensemble_proba, axis=1)


In [10]:

from sklearn.metrics import accuracy_score, classification_report

ensemble_accuracy = accuracy_score(y_test_rf, ensemble_pred)
print(f"\n🧠 Hybrid RF + CNN-LSTM Ensemble Accuracy: {ensemble_accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test_rf, ensemble_pred, target_names=label_encoder.classes_))



🧠 Hybrid RF + CNN-LSTM Ensemble Accuracy: 90.71%

Classification Report:
                        precision    recall  f1-score   support

         ads.yahoo.com       0.86      0.94      0.90        34
         ae.akamai.net       0.93      0.82      0.87        33
   assets.adobedtm.com       0.73      0.59      0.66        32
       beacon.krxd.net       1.00      0.91      0.95        32
          c.betrad.com       0.84      0.94      0.88        65
      cdn.nhadatso.com       0.95      1.00      0.97        37
    clients.google.com       0.86      0.97      0.91        37
          d.adroll.com       0.93      0.96      0.95        85
dt.adsafeprotected.com       0.92      0.97      0.94        34
          facebook.com       0.97      0.89      0.93        66
   fls.doubleclick.net       0.98      0.98      0.98        56
            google.com       0.91      0.94      0.93        33
             google.fr       1.00      0.97      0.98       177
           gstatic.com       