In [None]:


import pandas as pd

# Read the URL dataset; rows the CSV parser cannot handle are skipped
# instead of aborting the load.
df = pd.read_csv(
    '/kaggle/input/url-classification/data.csv',
    on_bad_lines='skip',
)

# Preview the first rows of the loaded frame
print(df.head())



In [None]:
# Encode the target for binary classification: 'bad' -> 1, anything else -> 0.
# Values that are already 1 stay 1, so re-running this cell is harmless; the
# previous comparison against 'bad' alone turned every label into 0 on a
# second execution because the column then held integers, not strings.
df['label'] = df['label'].map(lambda value: 1 if value in ('bad', 1) else 0).astype(int)

In [None]:
import re
import math

def calculate_entropy(url):
    """Return the Shannon entropy (in bits) of the characters in ``url``.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    share of the total length. An empty input has no characters to sum over
    and yields ``0``.
    """
    total = len(url)

    # One counter per distinct character, in order of first appearance
    occurrences = dict.fromkeys(url, 0)
    for symbol in url:
        occurrences[symbol] += 1

    # Accumulate -p * log2(p) over the distinct characters
    result = 0
    for seen in occurrences.values():
        share = seen / total
        result = result - share * math.log2(share)

    return result

def tokenize_url(url):
    """Return the Unicode code point of every character in ``url`` as a list."""
    return list(map(ord, url))


def extract_features(df):
    """Add lexical URL features to ``df`` as new columns and return ``df``.

    The frame is modified in place (columns are added or overwritten) and the
    same object is returned. The ``'url'`` column itself is left untouched.

    Columns written:
        url_length          -- number of characters in the URL
        special_char_count  -- occurrences of ``?``, ``&`` or ``=``
        subdomain_count     -- number of ``.``-separated parts minus 2
        has_ip              -- 1 if a dotted-quad pattern appears, else 0
        entropy             -- value of ``calculate_entropy`` for the URL
        has_suspicious_name -- 1 if any keyword occurs as a substring, else 0
        tokenized_url       -- value of ``tokenize_url`` for the URL

    Parameters:
        df: DataFrame with a ``'url'`` column.

    Raises:
        KeyError: if ``df`` has no ``'url'`` column.
    """
    # Missing or non-string entries (e.g. NaN from an empty CSV field) would
    # make len(), re and character iteration raise TypeError; treat missing
    # values as empty strings and stringify everything else.
    urls = df['url'].fillna('').astype(str)

    # Compiled once instead of being re-parsed for every row
    special_char_pattern = re.compile(r'[?&=]')
    ipv4_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

    # Keywords checked by plain substring search (exact duplicates removed;
    # the result of the check is unchanged).
    # NOTE(review): the match is case-sensitive, so e.g. 'Login' is not
    # detected -- confirm whether URLs should be lower-cased first.
    suspicious_names = (
        'confirm', 'account', 'secure', 'banking', 'login', 'signin',
        'update', 'password', 'verify', 'free', 'game', 'win', 'click',
        'prize', 'cash', 'money', 'offer', 'discount', 'deal', 'sale',
        'cheap', 'best', 'top', 'amazing', 'new', 'hot', 'popular',
        'trending', 'sensational', 'latest', 'exclusive', 'limited',
        'urgent', 'important', 'breaking', 'alert', 'warning', 'emergency',
        'crisis', 'critical', 'fatal', 'vital', 'immediate', 'essential',
        'key', 'necessary', 'required', 'compulsory', 'mandatory',
        'obligatory', 'pressing', 'acute', 'burning', 'paramount',
        'preeminent', 'top-priority', 'high-priority', 'crucial',
    )

    # URL length
    df['url_length'] = urls.apply(len)

    # Count special characters in URL
    df['special_char_count'] = urls.apply(lambda x: len(special_char_pattern.findall(x)))

    # Dots anywhere in the URL are counted (path and query included), and a
    # URL without any dot yields -1.
    df['subdomain_count'] = urls.apply(lambda x: len(x.split('.')) - 2)

    # Check if an IP address is present in the URL
    df['has_ip'] = urls.apply(lambda x: 1 if ipv4_pattern.search(x) else 0)

    # Calculate entropy of the URL
    df['entropy'] = urls.apply(calculate_entropy)

    # Check if the URL contains a suspicious name
    df['has_suspicious_name'] = urls.apply(
        lambda x: 1 if any(name in x for name in suspicious_names) else 0
    )

    # Apply tokenization
    df['tokenized_url'] = urls.apply(tokenize_url)

    return df

# Add the engineered feature columns (and the tokenized URL) to df
df = extract_features(df)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Pad sequences manually
def pad_sequences_torch(sequences, max_len):
    """Stack variable-length sequences into a float32 tensor of width ``max_len``.

    Sequences longer than ``max_len`` are cut off at the end; shorter ones are
    filled with zeros on the right. The result has one row per sequence.
    """
    batch = torch.zeros((len(sequences), max_len), dtype=torch.float32)
    for row, tokens in enumerate(sequences):
        clipped = tokens[:max_len]
        batch[row, :len(clipped)] = torch.tensor(clipped, dtype=torch.float32)
    return batch

# URLs are truncated / zero-padded to this many characters; kept small for
# memory efficiency.
max_len = 75
X_rnn = pad_sequences_torch(df['tokenized_url'], max_len)

# Hand-crafted features fed to the model next to the character sequence.
# NOTE(review): 'url_length' is computed by extract_features but is not
# included here -- confirm that this is intentional.
X_additional = df[['special_char_count', 'subdomain_count', 'has_ip', 'entropy', 'has_suspicious_name']].values

# Encode labels
y = torch.tensor(df['label'].values, dtype=torch.float32)

# Train-test split
X_rnn_train, X_rnn_test, X_additional_train, X_additional_test, y_train, y_test = train_test_split(
    X_rnn, X_additional, y, test_size=0.2, random_state=42
)

# Create TensorDataset that includes both RNN inputs and additional features.
# torch.as_tensor converts arrays and passes existing float32 tensors through
# unchanged; calling torch.tensor() on something that is already a tensor
# makes a needless copy and triggers PyTorch's copy-construct UserWarning.
train_data = TensorDataset(
    torch.as_tensor(X_rnn_train, dtype=torch.float32),
    torch.as_tensor(X_additional_train, dtype=torch.float32),
    torch.as_tensor(y_train, dtype=torch.float32),
)
test_data = TensorDataset(
    torch.as_tensor(X_rnn_test, dtype=torch.float32),
    torch.as_tensor(X_additional_test, dtype=torch.float32),
    torch.as_tensor(y_test, dtype=torch.float32),
)

# Create DataLoader for batching
batch_size = 64  # Adjust based on memory constraints
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Define the RNN model with additional features
class RNNClassifier(nn.Module):
    """LSTM sequence encoder whose last-step output is joined with extra features.

    The LSTM output at the final time step is concatenated with the
    additional feature vector, passed through a 128-unit ReLU layer and then
    through a linear layer followed by a sigmoid, so outputs lie in (0, 1).
    The final time step is used as-is, whatever the input holds there.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, additional_feature_dim):
        super().__init__()
        fused_dim = hidden_dim + additional_feature_dim  # LSTM output + extra features
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(fused_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, rnn_input, additional_features):
        """Return sigmoid outputs for a batch-first sequence batch plus its extra features."""
        sequence_out, _state = self.rnn(rnn_input)

        # Keep only the output of the final time step
        final_step = sequence_out[:, -1, :]

        # Join the sequence summary with the additional features per sample
        fused = torch.cat([final_step, additional_features], dim=1)

        hidden = self.fc1(fused).relu()
        return self.fc2(hidden).sigmoid()

# Hyperparameters
input_dim = 1  # One scalar per time step (the character's code point)
hidden_dim = 64
output_dim = 1  # Binary classification
additional_feature_dim = X_additional_train.shape[1]

# The train/test split is already seeded (random_state=42); seed PyTorch's
# global RNG as well so weight initialisation and batch shuffling are
# repeatable from a fresh kernel.
torch.manual_seed(42)

# Model, Loss, and Optimizer
model = RNNClassifier(input_dim, hidden_dim, output_dim, additional_feature_dim)
criterion = nn.BCELoss()  # The model already applies a sigmoid to its output
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model with batching
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_rnn_batch, X_additional_batch, y_batch in train_loader:
        optimizer.zero_grad()

        # Reshape the RNN input to add the input dimension (for LSTM)
        X_rnn_batch = X_rnn_batch.unsqueeze(-1)  # (batch, seq) -> (batch, seq, 1)
        y_batch = y_batch.unsqueeze(-1)  # (batch,) -> (batch, 1) to match the model output

        # Forward pass
        outputs = model(X_rnn_batch, X_additional_batch)
        loss = criterion(outputs, y_batch)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Evaluate the model with batching
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_rnn_batch, X_additional_batch, y_batch in test_loader:
        X_rnn_batch = X_rnn_batch.unsqueeze(-1)  # Add input dimension for RNN
        y_pred = model(X_rnn_batch, X_additional_batch)
        y_pred = (y_pred > 0.5).int()  # Threshold the sigmoid output at 0.5

        # squeeze(-1) removes only the output dimension, so a batch holding a
        # single sample keeps its batch axis.
        correct += (y_pred.squeeze(-1) == y_batch).sum().item()
        total += y_batch.size(0)

# An empty test loader gives NaN instead of a ZeroDivisionError
accuracy = correct / total if total else float('nan')
print(f'Accuracy: {accuracy * 100:.2f}%')



In [None]:
# Save only the model's state dictionary (not the model object itself) to
# 'model.pth' in the current working directory.
torch.save(model.state_dict(), 'model.pth')