# URL Classification with Random Forest
This notebook implements URL classification (malicious URL detection) using the Random Forest algorithm.
It includes comprehensive feature engineering and model evaluation.

## 1. Import Libraries

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from urllib.parse import urlparse
from tld import get_tld
import os.path

# Machine Learning libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix
import joblib

# Suppress warnings
import warnings
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Read the URL dataset from disk into a DataFrame
df = pd.read_csv('../artifacts/dataset/malicious_phish.csv')

# Report its dimensions and column names, then display the first rows
print(f"Dataset shape: {df.shape}")
print(f"\nDataset columns: {list(df.columns)}")
print(f"\nDataset head:")
df.head()

In [None]:
# Column dtypes and non-null counts
print("Data Info:")
df.info()

# Number of missing values in each column
print("\nNull values:")
print(df.isna().sum())

# How many rows fall into each value of the 'type' column
print("\nTarget distribution:")
print(df['type'].value_counts())

## 3. Feature Engineering
All feature engineering functions are preserved from the original notebook.

### 3.1 IP Address Detection

In [None]:
# Use of IP or not in domain
def having_ip_address(url):
    """Return 1 if ``url`` contains an IP-address literal, otherwise 0.

    Three independent forms are recognised:

    * dotted-decimal IPv4 immediately followed by ``/`` (e.g. ``192.168.0.1/``)
    * dotted-hexadecimal IPv4 immediately followed by ``/`` (e.g. ``0x7f.0x0.0x0.0x1/``)
    * a full eight-group IPv6 address (e.g. ``2001:0db8:85a3:0000:0000:8a2e:0370:7334``)

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        1 when any of the three forms is found anywhere in ``url``, else 0.
    """
    match = re.search(
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])/)|'  # IPv4
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})/)|'  # IPv4 in hexadecimal
        # The trailing '|' above is essential: without it the hexadecimal and
        # IPv6 patterns are concatenated into one alternative, so neither form
        # can match on its own.
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',  # IPv6
        url)

    return 1 if match else 0

# Store the 0/1 result of having_ip_address for every URL
df['use_of_ip'] = df['url'].apply(having_ip_address)

### 3.2 Abnormal URL

In [None]:
def abnormal_url(url):
    """Return 1 if the hostname parsed from ``url`` occurs in ``url``, otherwise 0.

    The comparison is a plain, case-insensitive substring test:

    * the hostname is never interpreted as a regular expression, so characters
      such as ``(`` or ``+`` in a malformed host cannot raise ``re.error``;
    * ``urlparse`` lowercases the hostname, so both sides are lowercased before
      comparing;
    * when no hostname can be parsed (for example a URL without a scheme) the
      result is 0 rather than a search for the literal text ``'None'``.

    Parameters
    ----------
    url : str
        The URL text to inspect.

    Returns
    -------
    int
        1 when a hostname is parsed and found in ``url``, else 0.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        return 0
    return 1 if hostname.lower() in url.lower() else 0

# Store the 0/1 result of abnormal_url for every URL
df['abnormal_url'] = df['url'].apply(abnormal_url)

### 3.3 Google Index (Simplified for performance)

In [None]:
# Simplified google_index: every row is set to 1 for performance.
# In production, you would use the actual Google Search API instead.
# NOTE: because the value is constant it carries no information, and it is
# not among the columns selected for the feature matrix X in section 5.
df['google_index'] = 1

### 3.4 Count Features

In [None]:
# Count dot (.)
def count_dot(url):
    """Return the number of '.' characters in ``url``."""
    return url.count('.')

# Number of '.' characters per URL
df['count.'] = df['url'].apply(count_dot)

# Count www
def count_www(url):
    """Return the number of non-overlapping occurrences of 'www' in ``url``.

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        Occurrence count of the substring ``'www'``.
    """
    # The count is computed once; an earlier duplicate call whose result was
    # discarded has been removed.
    return url.count('www')

# Number of 'www' occurrences per URL
df['count-www'] = df['url'].apply(count_www)

# Count @
def count_atrate(url):
    """Return the number of '@' characters in ``url``."""
    at_signs = url.count('@')
    return at_signs

# Number of '@' characters per URL
df['count@'] = df['url'].apply(count_atrate)

# Count directories
def no_of_dir(url):
    """Return the number of '/' characters in the path component of ``url``.

    The path is whatever ``urlparse`` reports as ``.path``.
    """
    return urlparse(url).path.count('/')

# Number of '/' characters in each URL's path
df['count_dir'] = df['url'].apply(no_of_dir)

# Count embedded domains
def no_of_embed(url):
    """Return the number of non-overlapping '//' sequences in the path of ``url``.

    The path is whatever ``urlparse`` reports as ``.path``.
    """
    return urlparse(url).path.count('//')

# Number of '//' sequences in each URL's path
df['count_embed_domain'] = df['url'].apply(no_of_embed)

### 3.5 Suspicious Words Detection

In [None]:
def suspicious_words(url):
    """Return 1 if ``url`` contains any keyword from a fixed watch-list, otherwise 0.

    The keywords are matched as substrings and case-insensitively, so
    ``paypal``, ``PayPal`` and ``PAYPAL`` are all detected. (A case-sensitive
    search would only ever match the exact spelling ``PayPal`` and would miss
    upper-case variants of the other keywords.)

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        1 when at least one keyword is present, else 0.
    """
    match = re.search('PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr',
                      url, flags=re.IGNORECASE)
    return 1 if match else 0
    
# Store the 0/1 result of suspicious_words for every URL
df['sus_url'] = df['url'].apply(suspicious_words)

### 3.6 URL Shortening Service Detection

In [None]:
# Known URL-shortener domains (lower-case, de-duplicated).
_SHORTENER_DOMAINS = (
    'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co', 'tinyurl.com',
    'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me', 'ff.im', 'tiny.cc', 'url4.eu',
    'twit.ac', 'su.pr', 'twurl.nl', 'snipurl.com', 'short.to', 'budurl.com', 'ping.fm',
    'post.ly', 'just.as', 'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com',
    'short.ie', 'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 'lnkd.in',
    'db.tt', 'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im', 'q.gs', 'po.st', 'bc.vc',
    'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us', 'u.bb', 'yourls.org',
    'prettylinkpro.com', 'scrnch.me', 'filoops.info', 'vzturl.com', 'qr.net', '1url.com',
    'tweez.me', 'v.gd', 'link.zip.net',
)

# A domain only counts when it sits on label boundaries: preceded by the start
# of the string or one of '/', '.', '@', and followed by the end of the string
# or one of '/', ':', '?', '#'. This stops 't.co' from matching inside
# 'microsoft.com' or 'x.co' inside 'dropbox.com'.
_SHORTENER_PATTERN = re.compile(
    r'(?:^|[/.@])(?:'
    + '|'.join(re.escape(domain) for domain in _SHORTENER_DOMAINS)
    + r')(?=[/:?#]|$)',
    re.IGNORECASE,
)


def shortening_service(url):
    """Return 1 if ``url`` references a known URL-shortening domain, otherwise 0.

    Matching is case-insensitive and boundary-aware (see ``_SHORTENER_PATTERN``),
    so a shortener name that merely appears inside a longer domain is not
    flagged. The bare keyword ``tinyurl`` is covered by its ``tinyurl.com``
    domain entry.

    Parameters
    ----------
    url : str
        The URL text to scan.

    Returns
    -------
    int
        1 when a shortener domain is found, else 0.
    """
    return 1 if _SHORTENER_PATTERN.search(url) else 0
    
# Store the 0/1 result of shortening_service for every URL
df['short_url'] = df['url'].apply(shortening_service)

### 3.7 Protocol Counts

In [None]:
# Count https
def count_https(url):
    """Return the number of non-overlapping occurrences of 'https' in ``url``."""
    https_hits = url.count('https')
    return https_hits

# Number of 'https' occurrences per URL
df['count_https'] = df['url'].apply(count_https)

# Count http
def count_http(url):
    """Return the number of non-overlapping occurrences of 'http' in ``url``.

    Every 'https' also contains 'http', so those occurrences are included.
    """
    http_hits = url.count('http')
    return http_hits

# Number of 'http' occurrences per URL
df['count_http'] = df['url'].apply(count_http)

### 3.8 Special Character Counts

In [None]:
# Count %
def count_per(url):
    """Return the number of '%' characters in ``url``."""
    percent_signs = url.count('%')
    return percent_signs

# Number of '%' characters per URL
df['count%'] = df['url'].apply(count_per)

# Count ?
def count_ques(url):
    """Return the number of '?' characters in ``url``."""
    question_marks = url.count('?')
    return question_marks

# Number of '?' characters per URL
df['count?'] = df['url'].apply(count_ques)

# Count -
def count_hyphen(url):
    """Return the number of '-' characters in ``url``."""
    hyphens = url.count('-')
    return hyphens

# Number of '-' characters per URL
df['count-'] = df['url'].apply(count_hyphen)

# Count =
def count_equal(url):
    """Return the number of '=' characters in ``url``."""
    equals_signs = url.count('=')
    return equals_signs

# Number of '=' characters per URL
df['count='] = df['url'].apply(count_equal)

### 3.9 Length Features

In [None]:
# URL length
def url_length(url):
    """Return the character length of ``url`` after converting it with ``str()``."""
    as_text = str(url)
    return len(as_text)

# Character length of each URL
df['url_length'] = df['url'].apply(url_length)

# Hostname length
def hostname_length(url):
    """Return the length of the network-location (``netloc``) part of ``url``.

    ``netloc`` is taken from ``urlparse`` as-is, so any user-info or port it
    contains is counted too.
    """
    network_location = urlparse(url).netloc
    return len(network_location)

# Length of each URL's netloc component
df['hostname_length'] = df['url'].apply(hostname_length)

### 3.10 Domain Features

In [None]:
# First directory length
def fd_length(url):
    """Return the length of the first directory segment in the path of ``url``.

    The path (``urlparse(url).path``) is split on '/'; the element at index 1,
    i.e. the text between the first and second '/', is the first directory.

    Parameters
    ----------
    url : str
        The URL text to inspect.

    Returns
    -------
    int
        Length of that segment, or 0 when the path contains no '/'.
    """
    segments = urlparse(url).path.split('/')
    # An explicit length check replaces a bare ``except:`` so that unrelated
    # errors are no longer silently turned into 0.
    if len(segments) < 2:
        return 0
    return len(segments[1])

# Length of the first directory segment of each URL's path
df['fd_length'] = df['url'].apply(fd_length)

# TLD length
def tld_length(tld):
    """Return the length of ``tld``, or -1 when it has no length.

    Parameters
    ----------
    tld : str or None
        A top-level-domain string, or a value without a length such as ``None``.

    Returns
    -------
    int
        ``len(tld)``, or -1 if ``len`` raises ``TypeError``.
    """
    try:
        return len(tld)
    except TypeError:
        # Only the "object has no len()" case maps to -1; any other error
        # propagates instead of being swallowed by a bare ``except:``.
        return -1

# Extract the TLD of every URL into a temporary column; fail_silently=True is
# passed so get_tld does not raise on URLs it cannot handle.
# NOTE(review): get_tld is called without fix_protocol; presumably scheme-less
# URLs then yield None (and therefore tld_length -1) -- confirm against the
# tld package documentation.
df['tld'] = df['url'].apply(lambda address: get_tld(address, fail_silently=True))

# Keep only the length of the TLD, then discard the temporary column
df['tld_length'] = df['tld'].apply(tld_length)
df = df.drop(columns=['tld'])

### 3.11 Character Counts

In [None]:
# Count digits
def digit_count(url):
    """Return how many characters of ``url`` satisfy ``str.isnumeric()``."""
    return sum(1 for character in url if character.isnumeric())

# Number of numeric characters per URL
df['count_digits'] = df['url'].apply(digit_count)

# Count letters
def letter_count(url):
    """Return how many characters of ``url`` satisfy ``str.isalpha()``."""
    return sum(1 for character in url if character.isalpha())

# Number of alphabetic characters per URL
df['count_letters'] = df['url'].apply(letter_count)

## 4. Label Encoding

In [None]:
# Encode the class labels in 'type' as integer codes
label_encoder = LabelEncoder()
df['type_code'] = label_encoder.fit_transform(df['type'])

# Show which integer code was assigned to each class
print("Label mapping:")
for code, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name}: {code}")

# Class counts expressed in the encoded form
print("\nTarget distribution after encoding:")
print(df['type_code'].value_counts())

## 5. Feature and Target Creation

In [None]:
# Feature matrix X: the engineered columns, in the order the model will see them
X = df[[
    'use_of_ip',
    'abnormal_url',
    'count.',
    'count-www',
    'count@',
    'count_dir',
    'count_embed_domain',
    'short_url',
    'count%',
    'count?',
    'count-',
    'count=',
    'url_length',
    'count_https',
    'count_http',
    'hostname_length',
    'sus_url',
    'fd_length',
    'tld_length',
    'count_digits',
    'count_letters',
]]

# Target vector Y: the integer-encoded class
Y = df['type_code']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {Y.shape}")
print(f"\nFeatures: {list(X.columns)}")

## 6. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; shuffling is seeded for reproducibility.
# NOTE(review): the split is not stratified; consider stratify=Y if the classes
# are imbalanced -- confirm against the target distribution printed earlier.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Sizes of the two partitions
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Class counts within each partition
print(f"\nTraining target distribution:")
print(Y_train.value_counts())
print(f"\nTest target distribution:")
print(Y_test.value_counts())

## 7. Random Forest Model

In [None]:
# Random Forest with 100 trees, no depth limit, a fixed seed, and all CPU cores (n_jobs=-1)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)

# Fit on the training partition
print("Training Random Forest model...")
rf_model.fit(X_train, Y_train)
print("Training complete!")

## 8. Model Evaluation

In [None]:
# Predict on both partitions so train and test accuracy can be compared
Y_pred_train = rf_model.predict(X_train)
Y_pred_test = rf_model.predict(X_test)

# Accuracy expressed as a percentage
train_accuracy = 100 * accuracy_score(Y_train, Y_pred_train)
test_accuracy = 100 * accuracy_score(Y_test, Y_pred_test)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Per-class precision / recall / F1 on the test partition
print("\n" + "=" * 50)
print("Classification Report (Test Set)")
print("=" * 50)
print(
    classification_report(
        Y_test,
        Y_pred_test,
        target_names=label_encoder.classes_,
    )
)

## 9. Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted classes on the test partition
cm = confusion_matrix(Y_test, Y_pred_test)

# Heat-map rendering, with the original class names on both axes
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title('Confusion Matrix - Random Forest', fontsize=16)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.show()

# Raw counts, for readers who want the numbers as text
print("Confusion Matrix:")
print(cm)

## 10. Feature Importance Analysis

In [None]:
# Pair each feature name with its importance score and rank from most to least important
feature_importances = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    }
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(12, 8))
plt.barh(
    feature_importances['feature'].iloc[:15],
    feature_importances['importance'].iloc[:15],
)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 15 Feature Importances - Random Forest', fontsize=14)
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importances.head(10))

## 11. Model Export

In [None]:
# Save the trained model
model_filename = 'best_rf_model.pkl'
joblib.dump(rf_model, model_filename)
print(f"Model saved as: {model_filename}")

# Save the label encoder so predicted codes can be mapped back to class names
encoder_filename = 'label_encoder_rf.pkl'
joblib.dump(label_encoder, encoder_filename)
print(f"Label encoder saved as: {encoder_filename}")

# Save model performance metrics.
# Hyperparameters are read from the estimator itself rather than repeated as
# literals, so the recorded values cannot drift from what was actually trained.
metrics = {
    'model': 'Random Forest',
    'train_accuracy': train_accuracy,
    'test_accuracy': test_accuracy,
    'n_estimators': rf_model.n_estimators,
    'max_depth': str(rf_model.max_depth),  # str() keeps None as the text 'None' in the CSV
    'random_state': rf_model.random_state,
}

# Single source of truth for the metrics file name (used for writing and reporting)
metrics_filename = 'rf_model_metrics.csv'
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv(metrics_filename, index=False)
print(f"Metrics saved to: {metrics_filename}")

## 12. Summary

In [None]:
# Final summary. Tree count and depth are read from the fitted estimator so the
# report always reflects the model that was actually trained.
max_depth_label = "None (unlimited)" if rf_model.max_depth is None else rf_model.max_depth

print("="*60)
print("RANDOM FOREST MODEL SUMMARY")
print("="*60)
print(f"Algorithm: Random Forest Classifier")
print(f"Number of Trees: {rf_model.n_estimators}")
print(f"Max Depth: {max_depth_label}")
print(f"Number of Features: {X.shape[1]}")
print(f"Training Samples: {X_train.shape[0]}")
print(f"Test Samples: {X_test.shape[0]}")
print(f"\nPerformance:")
print(f"  - Training Accuracy: {train_accuracy:.2f}%")
print(f"  - Test Accuracy: {test_accuracy:.2f}%")
print(f"\nModel saved as: {model_filename}")
print("="*60)