# Model Prediction

In [56]:
import joblib
import pandas as pd
import numpy as np
import pickle
from googlesearch import search

from urllib.parse import urlparse
import re

# import mlflow.sklearn

# Directory holding the exported model artifacts (relative path); the
# "Load Model" cell appends "/best_svm_model.pkl" to it.
model_path = "../model/code/exports/models"

### Load Model

In [32]:
import joblib
import pandas as pd
import numpy as np

# Deserialize the exported SVM and print a summary of its fitted state.
# NOTE: joblib.load unpickles the file, so only load artifacts from a
# trusted source.
svm_file = f"{model_path}/best_svm_model.pkl"
model = joblib.load(svm_file)

# One print call, one line per fitted attribute.
print(
    f"========== Model ==========",
    f"Type: {type(model)}",
    f"Class: {model.__class__.__name__}",
    f"Features expected: {model.n_features_in_}",
    f"Classes: {model.classes_}",
    f"Number of classes: {len(model.classes_)}",
    f"Support vectors shape: {model.support_vectors_.shape}",
    f"Number of support vectors per class: {model.n_support_}",
    sep="\n",
)

# Dump every constructor hyper-parameter as "name: value".
print(f"\n========== Parameter ==========")
params = model.get_params()
for name, setting in params.items():
    print(f"{name}: {setting}")

Type: <class 'sklearn.svm._classes.SVC'>
Class: SVC
Features expected: 21
Classes: [0 1 2 3]
Number of classes: 4
Support vectors shape: (80117, 21)
Number of support vectors per class: [28128 10846  7482 33661]

C: 1.0
break_ties: False
cache_size: 200
class_weight: None
coef0: 0.0
decision_function_shape: ovr
degree: 3
gamma: scale
kernel: rbf
max_iter: -1
probability: True
random_state: 42
shrinking: True
tol: 0.001
verbose: False

Dummy data shape: (1, 21)
Sample of dummy data: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]



In [37]:
# Show the key SVC hyper-parameters of the loaded model, one per line.
print(
    f"Kernel: {model.kernel}",
    f"Gamma: {model.gamma}",
    f"C (regularization): {model.C}",
    f"Probability estimation: {model.probability}",
    f"Decision function shape: {model.decision_function_shape}",
    sep="\n",
)

Kernel: rbf
Gamma: scale
C (regularization): 1.0
Probability estimation: True
Decision function shape: ovr


### Available Methods

In [42]:
# Collect every public (non-underscore) name exposed by the model object;
# dir() returns attributes as well as methods.
methods = []
for attr_name in dir(model):
    if attr_name.startswith('_'):
        continue
    methods.append(attr_name)

# Print the list on one line (iterate over `methods` to print one per line).
print(methods)

['C', 'break_ties', 'cache_size', 'class_weight', 'class_weight_', 'classes_', 'coef0', 'coef_', 'decision_function', 'decision_function_shape', 'degree', 'dual_coef_', 'epsilon', 'fit', 'fit_status_', 'gamma', 'get_metadata_routing', 'get_params', 'intercept_', 'kernel', 'max_iter', 'n_features_in_', 'n_iter_', 'n_support_', 'nu', 'predict', 'predict_log_proba', 'predict_proba', 'probA_', 'probB_', 'probability', 'random_state', 'score', 'set_fit_request', 'set_params', 'set_score_request', 'shape_fit_', 'shrinking', 'support_', 'support_vectors_', 'tol', 'unused_param', 'verbose']


### Feature Extraction

In [12]:
import re
from urllib.parse import urlparse
from googlesearch import search
import numpy as np

class FeatureExtraction:
    """Compute lexical features of a single URL.

    The URL is parsed once in ``__init__``. Each feature method returns one
    integer; ``get_features`` gathers the 21 features named in
    ``FEATURE_ORDER`` into a dict and ``get_feature_array`` turns them into a
    ``(1, 21)`` float32 row.
    """

    # Single source of truth for the column order of the feature vector.
    # NOTE(review): presumably this must match the column order the model was
    # trained with - confirm against the training pipeline.
    FEATURE_ORDER = (
        'use_of_ip', 'abnormal_url', 'count_.', 'count_www', 'count_@',
        'count_dir', 'count_embed_domain', 'short_url', 'count%', 'count?',
        'count-', 'count=', 'url_length', 'count_https', 'count_http',
        'hostname_length', 'sus_url', 'fd_length', 'tld_length',
        'count_digits', 'count_letters',
    )

    # Three alternatives. The previous pattern lacked the '|' between the
    # hexadecimal IPv4 and the IPv6 alternative, so those two were fused into
    # one pattern that real URLs never match.
    _IP_PATTERN = re.compile(
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])/)|'  # IPv4
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.'
        r'(0x[0-9a-fA-F]{1,2})/)|'  # IPv4 in hexadecimal
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'  # IPv6
    )

    _SUSPICIOUS_PATTERN = re.compile(
        r'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr',
        re.IGNORECASE,
    )

    _SHORTENER_PATTERN = re.compile(
        r'bit\.ly|goo\.gl|shorte\.st|x\.co|ow\.ly|t\.co|tinyurl|is\.gd|'
        r'cli\.gs|yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|'
        r'snipurl\.com|short\.to|BudURL\.com|ping\.fm|post\.ly|'
        r'qr\.ae|adf\.ly|bitly\.com|cur\.lv|ity\.im|q\.gs|po\.st|'
        r'bc\.vc|j\.mp|cutt\.us|u\.bb|v\.gd|tr\.im|link\.zip\.net'
    )

    def __init__(self, url: str):
        """Parse *url* and cache the pieces the feature methods read."""
        self.url = url
        self.parsed = urlparse(url)
        # hostname is None when the URL has no network location (for example
        # when the scheme is missing); normalise that to "".
        self.hostname = self.parsed.hostname or ""
        # Text after the last dot of the hostname, or "" if it has no dot.
        self.tld = self.hostname.split('.')[-1] if '.' in self.hostname else ""

    def having_ip_address(self) -> int:
        """Return 1 if the URL contains an IP-address literal, else 0.

        Recognised forms: dotted-decimal IPv4 followed by '/', hexadecimal
        IPv4 (``0x..`` octets) followed by '/', and a full eight-group IPv6
        address.

        NOTE(review): hexadecimal IPv4 and IPv6 URLs now yield 1 where the
        earlier fused pattern yielded 0 - confirm this matches how the
        training features were produced.
        """
        return 1 if self._IP_PATTERN.search(self.url) else 0

    def abnormal_url(self) -> int:
        """Return 1 if the parsed hostname occurs verbatim in the URL, else 0.

        An empty hostname yields 0 (an empty pattern would otherwise match
        every URL). The hostname is compared as plain text rather than as a
        regular expression, so metacharacters in it cannot raise ``re.error``.
        The comparison is case-sensitive; ``urlparse`` reports the hostname
        lower-cased, so a URL with an upper-case host yields 0.
        """
        if not self.hostname:
            return 0
        return 1 if self.hostname in self.url else 0

    def google_index(self) -> int:
        """Return 1 if a web search for the URL yields at least one result.

        Best effort: any exception raised by the search is treated as "not
        indexed" and gives 0. Not part of ``get_features``.
        """
        try:
            results = search(self.url, num_results=5)
            # Pull the first hit explicitly: a lazy generator object is always
            # truthy, so testing the return value itself proves nothing.
            return 1 if next(iter(results), None) is not None else 0
        except Exception:
            return 0

    def count_dot(self) -> int:
        """Return the number of '.' characters in the URL."""
        return self.url.count('.')

    def count_www(self) -> int:
        """Return the number of 'www' substrings in the URL."""
        return self.url.count("www")

    def count_atrate(self) -> int:
        """Return the number of '@' characters in the URL."""
        return self.url.count('@')

    def no_of_dir(self) -> int:
        """Return the number of '/' characters in the URL path."""
        return self.parsed.path.count('/')

    def no_of_embed(self) -> int:
        """Return the number of '//' occurrences in the URL path."""
        return self.parsed.path.count('//')

    def suspicious_words(self) -> int:
        """Return 1 if the URL contains a suspicious keyword (case-insensitive)."""
        return 1 if self._SUSPICIOUS_PATTERN.search(self.url) else 0

    def shortening_service(self) -> int:
        """Return 1 if the URL contains a known URL-shortener domain, else 0.

        The match is an unanchored, case-sensitive substring search over the
        whole URL.
        """
        return 1 if self._SHORTENER_PATTERN.search(self.url) else 0

    def count_https(self) -> int:
        """Return the number of 'https' substrings in the URL."""
        return self.url.count("https")

    def count_http(self) -> int:
        """Return the number of 'http' substrings (each 'https' counts too)."""
        return self.url.count("http")

    def count_per(self) -> int:
        """Return the number of '%' characters in the URL."""
        return self.url.count('%')

    def count_ques(self) -> int:
        """Return the number of '?' characters in the URL."""
        return self.url.count('?')

    def count_hyphen(self) -> int:
        """Return the number of '-' characters in the URL."""
        return self.url.count('-')

    def count_equal(self) -> int:
        """Return the number of '=' characters in the URL."""
        return self.url.count('=')

    def url_length(self) -> int:
        """Return the length of the full URL string."""
        return len(self.url)

    def hostname_length(self) -> int:
        """Return the length of the parsed ``netloc`` (not just the hostname)."""
        return len(self.parsed.netloc)

    def fd_length(self) -> int:
        """Return the length of the first path segment, or 0 if there is none."""
        segments = self.parsed.path.split('/')
        return len(segments[1]) if len(segments) > 1 else 0

    def tld_length(self) -> int:
        """Return the length of the cached ``tld`` (0 when it is empty)."""
        return len(self.tld)

    def digit_count(self) -> int:
        """Return how many characters of the URL are digits."""
        return sum(c.isdigit() for c in self.url)

    def letter_count(self) -> int:
        """Return how many characters of the URL are alphabetic."""
        return sum(c.isalpha() for c in self.url)

    def get_features(self) -> dict:
        """
        Returns a dictionary of features matching the desired list for the model.

        Keys are exactly the names in ``FEATURE_ORDER``, in that order.
        """
        features = {
            "use_of_ip": self.having_ip_address(),
            "abnormal_url": self.abnormal_url(),
            "count_.": self.count_dot(),
            "count_www": self.count_www(),
            "count_@": self.count_atrate(),
            "count_dir": self.no_of_dir(),
            "count_embed_domain": self.no_of_embed(),
            "short_url": self.shortening_service(),
            "count%": self.count_per(),
            "count?": self.count_ques(),
            "count-": self.count_hyphen(),
            "count=": self.count_equal(),
            "url_length": self.url_length(),
            "count_https": self.count_https(),
            "count_http": self.count_http(),
            "hostname_length": self.hostname_length(),
            "sus_url": self.suspicious_words(),
            "fd_length": self.fd_length(),
            "tld_length": self.tld_length(),
            "count_digits": self.digit_count(),
            "count_letters": self.letter_count(),
        }
        return features

    def get_feature_array(self) -> np.ndarray:
        """Return the features as a float32 array of shape ``(1, 21)``.

        Values are laid out in ``FEATURE_ORDER``.
        """
        feature_dict = self.get_features()
        feature_values = [feature_dict[key] for key in self.FEATURE_ORDER]
        return np.array(feature_values, dtype=np.float32).reshape(1, -1)

    def __repr__(self):
        return f"FeatureExtraction(url={self.url})"

In [14]:
# Demo: extract features for one sample URL and show each representation.
url = "http://example.com/login?user=test"
features = FeatureExtraction(url)

# Prints, in order: the repr, the feature dict, the (1, 21) feature row.
for view in (features, features.get_features(), features.get_feature_array()):
    print(view)

FeatureExtraction(url=http://example.com/login?user=test)
{'use_of_ip': 0, 'abnormal_url': 1, 'count_.': 1, 'count_www': 0, 'count_@': 0, 'count_dir': 1, 'count_embed_domain': 0, 'short_url': 0, 'count%': 0, 'count?': 1, 'count-': 0, 'count=': 1, 'url_length': 34, 'count_https': 0, 'count_http': 1, 'hostname_length': 11, 'sus_url': 1, 'fd_length': 5, 'tld_length': 3, 'count_digits': 0, 'count_letters': 27}
[[ 0.  1.  1.  0.  0.  1.  0.  0.  0.  1.  0.  1. 34.  0.  1. 11.  1.  5.
   3.  0. 27.]]


### Prediction

In [33]:
print(f"\n========== Test Predict ==========")

# Build a single all-zero sample whose width is the feature count the loaded
# model reports, to use as a smoke-test input.
n_features = model.n_features_in_
dummy_data = np.zeros(shape=(1, n_features))

print(
    f"Dummy data shape: {dummy_data.shape}",
    f"Sample of dummy data: {dummy_data}\n",
    sep="\n",
)



Dummy data shape: (1, 21)
Sample of dummy data: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]



In [50]:

# Display names for class indices 0-3.
# (A binary variant, {0: False, 1: True} for "isMalicious", was considered
# earlier and is not used.)
label_mapping = {0: "Benign", 1: "Defacement", 2: "Malware", 3: "Phishing"}

# Smoke test: classify the dummy sample and show the raw class index plus its
# display name. Any failure is reported instead of aborting the cell.
# For per-class scores, model.predict_proba(dummy_data) and
# model.decision_function(dummy_data) each return one row of values that can
# be paired with model.classes_.
try:
    prediction = model.predict(dummy_data)[0]
    feature_label = label_mapping.get(prediction)

    print(f"Test prediction: {prediction}")
    print(f"Predicable: {feature_label}")
except Exception as err:
    print(f"Prediction error: {err}")
    print(f"Error type: {type(err)}")

Test prediction: 0
Predicable: Benign
