<a href="https://colab.research.google.com/github/Raka7317/set_project_work/blob/main/3rd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scikit-learn river tensorflow


Collecting river
  Downloading river-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading river-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, river
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not

In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score

from river.drift import PageHinkley

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
DATA_PATH = "cleaned_dataset.csv"

df = pd.read_csv(DATA_PATH)
df = df[["url", "labels"]].dropna()

print("Dataset size:", len(df))
print(df["labels"].value_counts())

Dataset size: 18275
labels
0.0    16941
1.0     1334
Name: count, dtype: int64


In [None]:
def url_entropy(url):
    probs = [url.count(c) / len(url) for c in set(url)]
    return -sum(p * math.log2(p) for p in probs)

def extract_numeric_features(url):
    return [
        len(url),
        url.count('.'),
        sum(c.isdigit() for c in url),
        sum(c in "-@=?%" for c in url),
        url_entropy(url)
    ]


In [None]:
hash_vectorizer = HashingVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    n_features=2**14,
    alternate_sign=False
)

def extract_features(urls):
    numeric = np.array([extract_numeric_features(u) for u in urls])
    hashed = hash_vectorizer.transform(urls)
    return np.hstack([numeric, hashed.toarray()])


In [None]:
MAX_VOCAB = 5000
MAX_LEN = 100
EMBED_DIM = 64

tokenizer = Tokenizer(char_level=True, lower=True, oov_token="<UNK>")
tokenizer.fit_on_texts(df["url"])

sequences = tokenizer.texts_to_sequences(df["url"])
X_seq = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")
y_seq = df["labels"].values

X_train_seq, X_val_seq, y_train_seq, y_val_seq = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42
)

In [None]:
input_layer = Input(shape=(MAX_LEN,))
embed = Embedding(MAX_VOCAB, EMBED_DIM)(input_layer)
bilstm = Bidirectional(LSTM(64))(embed)
drop = Dropout(0.3)(bilstm)
output = Dense(1, activation="sigmoid")(drop)

deep_model = Model(input_layer, output)
deep_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

deep_model.summary()


In [None]:
deep_model.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=5,
    batch_size=128,
    verbose=1
)

deep_model.save("bilstm_model.h5")


Epoch 1/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 191ms/step - accuracy: 0.8927 - loss: 0.3098 - val_accuracy: 0.9415 - val_loss: 0.1162
Epoch 2/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 173ms/step - accuracy: 0.9756 - loss: 0.0659 - val_accuracy: 0.9923 - val_loss: 0.0246
Epoch 3/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 169ms/step - accuracy: 0.9924 - loss: 0.0221 - val_accuracy: 0.9973 - val_loss: 0.0118
Epoch 4/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 168ms/step - accuracy: 0.9953 - loss: 0.0161 - val_accuracy: 0.9975 - val_loss: 0.0129
Epoch 5/5
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 182ms/step - accuracy: 0.9961 - loss: 0.0154 - val_accuracy: 0.9937 - val_loss: 0.0211




In [None]:
def deep_model_score(urls):
    seqs = tokenizer.texts_to_sequences(urls)
    seqs = pad_sequences(seqs, maxlen=MAX_LEN, padding="post", truncating="post")
    return deep_model.predict(seqs, verbose=0).flatten()


In [None]:
online_model = SGDClassifier(
    loss="log_loss",
    penalty="l1",
    learning_rate="optimal"
)

INIT_SIZE = 500
X_init = extract_features(df["url"][:INIT_SIZE])
y_init = df["labels"][:INIT_SIZE]

online_model.partial_fit(X_init, y_init, classes=[0,1])

In [None]:
legit_urls = df[df["labels"] == 0]["url"][:500]
X_legit = extract_features(legit_urls)

anomaly_model = IsolationForest(
    n_estimators=100,
    contamination=0.1,
    random_state=42
)

anomaly_model.fit(X_legit)


In [None]:
drift_detector = PageHinkley()


In [None]:
def fuse_scores(online_pred, deep_score, anomaly_score,
                w1=0.5, w2=0.3, w3=0.2):

    anomaly_flag = 1 if anomaly_score < -0.1 else 0
    score = w1*online_pred + w2*deep_score + w3*anomaly_flag
    return 1 if score >= 0.5 else 0


In [None]:
y_true_all = []
y_pred_all = []

for i in range(INIT_SIZE, len(df)):
    url = df["url"].iloc[i]
    y_true = df["labels"].iloc[i]

    X = extract_features([url])

    online_pred = online_model.predict(X)[0]
    deep_score = deep_model_score([url])[0]
    anomaly_score = anomaly_model.decision_function(X)[0]

    final_pred = fuse_scores(online_pred, deep_score, anomaly_score)

    y_true_all.append(y_true)
    y_pred_all.append(final_pred)

    online_model.partial_fit(X, [y_true])

    error = int(final_pred != y_true)
    drift_detector.update(error)

    if drift_detector.drift_detected:
        print(f"⚠️ Concept drift detected at index {i}")


⚠️ Concept drift detected at index 6837
⚠️ Concept drift detected at index 16907


In [None]:
print("\nFINAL STREAM RESULTS")
print("Accuracy :", accuracy_score(y_true_all, y_pred_all))
print("Precision:", precision_score(y_true_all, y_pred_all))
print("Recall   :", recall_score(y_true_all, y_pred_all))



FINAL STREAM RESULTS
Accuracy : 0.8739240506329113
Precision: 0.1328125
Recall   : 0.13066871637202152
