In [2]:
import pandas as pd
import numpy as np
import joblib
import re
from urllib.parse import urlparse
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# =====================
# DATA PREPROCESSING
# =====================
def calculate_entropy(string):
    """Calculate Shannon entropy of URL string"""
    prob = [float(string.count(c)) / len(string) for c in set(string)]
    return -sum(p * np.log2(p) for p in prob)

def extract_features(url):
    """Extract URL security features"""
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path

    features = {
        'url_length': len(url),
        'domain_length': len(domain),
        'num_digits': sum(c.isdigit() for c in url),
        'special_chars': sum(url.count(c) for c in ['@', '?', '=', '.', '-', '_', '/']),
        'has_https': 1 if parsed.scheme == 'https' else 0,
        'num_subdomains': domain.count('.') - 1 if domain.count('.') > 1 else 0,
        'path_length': len(path),
        'num_params': path.count('?') + path.count('&'),
        'has_port': 1 if ':' in domain else 0,
        'is_ip': 1 if re.match(r'\d+\.\d+\.\d+\.\d+', domain) else 0,
        'file_extension': 1 if '.' in path.split('/')[-1] else 0,
        'entropy': calculate_entropy(url),
        'num_redirects': url.count('//') - 1,
        'has_phish_keywords': 1 if any(kw in url.lower() for kw in ['login', 'verify', 'secure', 'account']) else 0
    }
    return features

# =====================
# LOAD AND PREPARE DATA
# =====================
try:
    df = pd.read_csv("malicious_phish.csv")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: File 'malicious_phish.csv' not found in current directory")
    exit()

print("Columns in dataset:", df.columns.tolist())
url_column = 'url'
label_column = 'type'

print("Extracting features from URLs...")
features = df[url_column].apply(extract_features).apply(pd.Series)
X = features.values.astype(np.float32)
y = df[label_column].values

le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, 'label_encoder.pkl')

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

X_train_tensor = torch.tensor(X_train).float().to(device)
y_train_tensor = torch.tensor(y_train).long().to(device)
X_test_tensor = torch.tensor(X_test).float().to(device)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

# =====================
# MODEL DEFINITION
# =====================
class ForestNet(nn.Module):
    def __init__(self, input_size, num_classes, num_trees=300):
        super().__init__()
        self.trees = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_size, 64),
                nn.ReLU(),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Linear(32, num_classes)
            ) for _ in range(num_trees)
        ])

    def forward(self, x):
        outputs = [tree(x) for tree in self.trees]
        return torch.stack(outputs).mean(dim=0)

model = ForestNet(
    input_size=X_train.shape[1],
    num_classes=len(le.classes_),
    num_trees=300
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

# =====================
# TRAINING LOOP
# =====================
model.train()
for epoch in range(50):
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# =====================
# EVALUATION
# =====================
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    y_pred = test_outputs.argmax(dim=1).cpu().numpy()

print("\n=== Model Evaluation ===")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# =====================
# SAVE MODEL (weights only)
# =====================
torch.save(model.state_dict(), 'gpu_forest_model.pth')
print("✅ Model weights saved as 'gpu_forest_model.pth'")


Using device: cuda
Dataset loaded successfully.
Columns in dataset: ['url', 'type']
Extracting features from URLs...
Epoch 1, Loss: 0.4473
Epoch 11, Loss: 0.2543
Epoch 21, Loss: 0.2295
Epoch 31, Loss: 0.2599
Epoch 41, Loss: 0.2664

=== Model Evaluation ===
              precision    recall  f1-score   support

      benign       0.90      0.98      0.94     85621
  defacement       0.91      0.97      0.94     19292
     malware       0.95      0.84      0.89      6504
    phishing       0.86      0.48      0.62     18822

    accuracy                           0.90    130239
   macro avg       0.90      0.82      0.85    130239
weighted avg       0.90      0.90      0.89    130239

✅ Model weights saved as 'gpu_forest_model.pth'
