<a href="https://colab.research.google.com/github/Raka7317/set_project_work/blob/main/2nd_realtime_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
pip install torch pandas numpy scikit-learn




In [3]:
# =====================================================
# 1. IMPORT LIBRARIES
# =====================================================
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

device = "cpu"

# =====================================================
# 2. LOAD DATASET (LIMIT TO 50,000)
# =====================================================
CSV_PATH = "cleaned_dataset.csv"   # ♥● CHANGE THIS

df = pd.read_csv(CSV_PATH)

URL_COL = "url"      # ♥● change if needed
LABEL_COL = "labels"  # ♥● change if needed

df = df[[URL_COL, LABEL_COL]].dropna()

MAX_SAMPLES = 50000
if len(df) > MAX_SAMPLES:
    df = df.sample(n=MAX_SAMPLES, random_state=42).reset_index(drop=True)

print("Dataset loaded")
print("Total samples used:", len(df))

urls = df[URL_COL].astype(str).tolist()
labels = torch.tensor(df[LABEL_COL].values, dtype=torch.float32).unsqueeze(1)

# =====================================================
# 3. TRAIN–TEST SPLIT
# =====================================================
urls_train, urls_test, y_train, y_test = train_test_split(
    urls,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print("Train size:", len(urls_train))
print("Test size :", len(urls_test))

# =====================================================
# 4. CHARACTER TOKENIZER
# =====================================================
MAX_LEN = 128

char2idx = {"<PAD>": 0}
for url in urls_train:
    for ch in url:
        if ch not in char2idx:
            char2idx[ch] = len(char2idx)

def encode_url(url):
    seq = [char2idx.get(ch, 0) for ch in url[:MAX_LEN]]
    return seq + [0] * (MAX_LEN - len(seq))

X_train = torch.tensor([encode_url(u) for u in urls_train], dtype=torch.long)
X_test  = torch.tensor([encode_url(u) for u in urls_test], dtype=torch.long)

VOCAB_SIZE = len(char2idx)
print("Vocabulary size:", VOCAB_SIZE)

# =====================================================
# 5. TCN + ATTENTION MODEL
# =====================================================
class Attention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc = nn.Linear(dim, 1)

    def forward(self, x):
        weights = torch.softmax(self.fc(x), dim=1)
        return (weights * x).sum(dim=1)

class TCNWithAttention(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 32, padding_idx=0)
        self.conv1 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=3, padding=2, dilation=2)
        self.attn = Attention(64)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.transpose(1, 2)
        x = self.attn(x)
        return self.fc(x)

tcn = TCNWithAttention(VOCAB_SIZE).to(device)

# =====================================================
# 6. TRAIN TCN MODEL
# =====================================================
optimizer = torch.optim.Adam(tcn.parameters(), lr=0.001)
loss_fn = nn.BCEWithLogitsLoss()

print("\nTraining TCN model...")
for epoch in range(5):
    optimizer.zero_grad()
    logits = tcn(X_train)
    loss = loss_fn(logits, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

# =====================================================
# 7. GET TCN SCORES
# =====================================================
tcn.eval()
with torch.no_grad():
    train_tcn_scores = torch.sigmoid(tcn(X_train)).squeeze()
    test_tcn_scores  = torch.sigmoid(tcn(X_test)).squeeze()

# =====================================================
# 8. TOKEN-BASED FEATURE
# =====================================================
SUSPICIOUS_TOKENS = [
    "login", "verify", "secure",
    "update", "account", "signin",
    "confirm", "password"
]

def token_score(url):
    url = url.lower()
    count = sum(tok in url for tok in SUSPICIOUS_TOKENS)
    return min(count / 3, 1.0)

train_token_scores = torch.tensor([token_score(u) for u in urls_train])
test_token_scores  = torch.tensor([token_score(u) for u in urls_test])

# =====================================================
# 9. HYBRID FUSION MLP
# =====================================================
class HybridMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

fusion = HybridMLP().to(device)

# =====================================================
# 10. TRAIN HYBRID MODEL
# =====================================================
fusion_optimizer = torch.optim.Adam(fusion.parameters(), lr=0.01)
fusion_loss = nn.BCELoss()

train_features = torch.stack(
    [train_tcn_scores, train_token_scores], dim=1
)

print("\nTraining Hybrid Fusion model...")
for epoch in range(5):
    fusion_optimizer.zero_grad()
    preds = fusion(train_features)
    loss = fusion_loss(preds, y_train)
    loss.backward()
    fusion_optimizer.step()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

# =====================================================
# 11. EVALUATION
# =====================================================
fusion.eval()
with torch.no_grad():
    test_features = torch.stack(
        [test_tcn_scores, test_token_scores], dim=1
    )
    final_preds = fusion(test_features).squeeze()

y_pred = (final_preds > 0.5).int().numpy()
y_true = y_test.int().numpy()

print("\n===== FINAL RESULTS (50,000 SAMPLES) =====")
print("Accuracy :", accuracy_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred))
print("Recall   :", recall_score(y_true, y_pred))


Dataset loaded
Total samples used: 50000
Train size: 40000
Test size : 10000
Vocabulary size: 100

Training TCN model...
Epoch 1 | Loss: 0.7372
Epoch 2 | Loss: 0.7151
Epoch 3 | Loss: 0.6936
Epoch 4 | Loss: 0.6723
Epoch 5 | Loss: 0.6507

Training Hybrid Fusion model...
Epoch 1 | Loss: 0.5781
Epoch 2 | Loss: 0.5687
Epoch 3 | Loss: 0.5595
Epoch 4 | Loss: 0.5505
Epoch 5 | Loss: 0.5417

===== FINAL RESULTS (50,000 SAMPLES) =====
Accuracy : 0.9249
Precision: 0.0
Recall   : 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
