# Usage Model

In [157]:
import joblib
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from urllib.parse import urlparse
from googlesearch import search
from tld import get_tld

# ML
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Model
MODEL_PATH = "../code/exports/models"

TARGET_MODEL = "svm_flag_model.pkl"
SCALER_FILE = "svm_flag_scaler.pkl"

# Lebel
LEBEL_PATH = "../code/exports/labels"
TARGET_LEBEL = "label_encoder_svm_flag.pkl"

model = joblib.load(f"{MODEL_PATH}/{TARGET_MODEL}")
scaler = joblib.load(f"{MODEL_PATH}/{SCALER_FILE}")
lebel = joblib.load(f"{LEBEL_PATH}/{TARGET_LEBEL}")

print(model)
print(scaler)
print(lebel)

# Data
DATA_PATH = "../artifacts/dataset_cleanup/data_cleaned.csv"

dataset = pd.read_csv(f"{DATA_PATH}")

print(dataset[dataset['isMalicious'] == False].head())
print(dataset[dataset['isMalicious'] == True].head())

SVC(probability=True, random_state=42)
StandardScaler()
LabelEncoder()
                                         url type  isMalicious
0                     https://dr-carte.info/  NaN        False
1                 https://www.dr-carte.info/  NaN        False
2  https://teal-usability-494962.framer.app/  NaN        False
3                    http://iokjdh.pages.dev  NaN        False
4                  https://iokjdh.pages.dev/  NaN        False
                                                     url type  isMalicious
7766                     https://material-badge.surge.sh  NaN         True
46667                     https://js.joomoom.cc/riben.js  NaN         True
47794  https://tsskpk.com/wp-content/plugins/wp-confi...  NaN         True
51550                       https://lin01.bid/sw/w_11.js  NaN         True
52272                   http://2.35.160.247:50157/bin.sh  NaN         True


## Input

In [158]:
# url_input = "www.google.com"
# url_input = "http://43.156.0.130/v3/signin/identifier"
url_input = " https://material-badge.surge.sh"

df = pd.DataFrame([])

## Preprocessing

### 3.1 IP Address Detection

In [159]:
# Use of IP or not in domain
def having_ip_address(url):
    match = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|'  # IPv4
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)' # IPv4 in hexadecimal
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)  # Ipv6
    
    if match:
        return 1
    else:
        return 0

df = pd.DataFrame([{"url": url_input}])

df['use_of_ip'] = df['url'].apply(having_ip_address)

print(df)

                                url  use_of_ip
0   https://material-badge.surge.sh          0


In [160]:
df[df['use_of_ip'] == 1]

Unnamed: 0,url,use_of_ip


### 3.2 Abnormal URL

In [161]:
def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:      
        return 1
    else:
        return 0

df['abnormal_url'] = df['url'].apply(lambda i: abnormal_url(i))
print(df['abnormal_url'].value_counts())
print(df)

abnormal_url
1    1
Name: count, dtype: int64
                                url  use_of_ip  abnormal_url
0   https://material-badge.surge.sh          0             1


### 3.3 Google Index

In [162]:
def google_index(url):
    
    site = search(url, 5)
    return 1 if site else 0

df['google_index'] = df['url'].apply(lambda i: google_index(i))

In [163]:
print(df['url'][0])

for j in search(df['url'][0], num_results=10):
    print(j)

 https://material-badge.surge.sh


In [164]:
df['google_index'].value_counts()
print(df['google_index'].value_counts())
print(df)

google_index
1    1
Name: count, dtype: int64
                                url  use_of_ip  abnormal_url  google_index
0   https://material-badge.surge.sh          0             1             1


### 3.4 Count Features

In [165]:
# Count dot (.)
def count_dot(url):
    count_dot = url.count('.')
    return count_dot

df['count_.'] = df['url'].apply(lambda i: count_dot(i))

# Count www
def count_www(url):
    url.count('www')
    return url.count('www')

df['count_www'] = df['url'].apply(lambda i: count_www(i))

# Count @
def count_atrate(url):
    return url.count('@')

df['count_@'] = df['url'].apply(lambda i: count_atrate(i))

# Count directories
def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')

df['count_dir'] = df['url'].apply(lambda i: no_of_dir(i))

# Count embedded domains
def no_of_embed(url):
    urldir = urlparse(url).path
    return urldir.count('//')

df['count_embed_domain'] = df['url'].apply(lambda i: no_of_embed(i))



### 3.5 Suspicious Words Detection

In [166]:
def suspicious_words(url):
    match = re.search('PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr',
                      url)
    if match:
        return 1
    else:
        return 0
    
df['sus_url'] = df['url'].apply(lambda i: suspicious_words(i))

In [167]:
df['sus_url'].value_counts()
print(df['sus_url'].value_counts())
print(df)

sus_url
0    1
Name: count, dtype: int64
                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  
0        2          0        0          0                   0        0  


### 3.6 URL Shortening Service Detection

In [168]:
def shortening_service(url):
    match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      r'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0
    
df['short_url'] = df['url'].apply(lambda i: shortening_service(i))

In [169]:
df['short_url'].value_counts()
print(df['short_url'].value_counts())
print(df)

short_url
0    1
Name: count, dtype: int64
                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  \
0        2          0        0          0                   0        0   

   short_url  
0          0  


### 3.7 Protocol Counts

In [170]:
# Count https
def count_https(url):
    return url.count('https')

df['count_https'] = df['url'].apply(lambda i: count_https(i))

# Count http
def count_http(url):
    return url.count('http')

df['count_http'] = df['url'].apply(lambda i: count_http(i))

In [171]:
df['count_http'].value_counts()
print(df['count_http'].value_counts())
print(df)

count_http
1    1
Name: count, dtype: int64
                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  \
0        2          0        0          0                   0        0   

   short_url  count_https  count_http  
0          0            1           1  


### 3.8 Special Character Counts

In [172]:
# Count %
def count_per(url):
    return url.count('%')

df['count%'] = df['url'].apply(lambda i: count_per(i))

# Count ?
def count_ques(url):
    return url.count('?')

df['count?'] = df['url'].apply(lambda i: count_ques(i))

# Count -
def count_hyphen(url):
    return url.count('-')

df['count-'] = df['url'].apply(lambda i: count_hyphen(i))

# Count =
def count_equal(url):
    return url.count('=')

df['count='] = df['url'].apply(lambda i: count_equal(i))

In [173]:
print(df)

                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  \
0        2          0        0          0                   0        0   

   short_url  count_https  count_http  count%  count?  count-  count=  
0          0            1           1       0       0       1       0  


### 3.9 Length Features

In [174]:
# URL length
def url_length(url):
    return len(str(url))

df['url_length'] = df['url'].apply(lambda i: url_length(i))

# Hostname length
def hostname_length(url):
    return len(urlparse(url).netloc)

df['hostname_length'] = df['url'].apply(lambda i: hostname_length(i))

In [175]:
print(df)

                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  \
0        2          0        0          0                   0        0   

   short_url  count_https  count_http  count%  count?  count-  count=  \
0          0            1           1       0       0       1       0   

   url_length  hostname_length  
0          32               23  


### 3.10 Domain Features

In [176]:
# First directory length
def fd_length(url):
    urlpath = urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except:
        return 0

df['fd_length'] = df['url'].apply(lambda i: fd_length(i))

# TLD length
def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

df['tld'] = df['url'].apply(lambda i: get_tld(i, fail_silently=True))
df['tld_length'] = df['tld'].apply(lambda i: tld_length(i))
df = df.drop("tld", axis=1)

In [177]:
print(df)

                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  ...  \
0        2          0        0          0                   0        0  ...   

   count_https  count_http  count%  count?  count-  count=  url_length  \
0            1           1       0       0       1       0          32   

   hostname_length  fd_length  tld_length  
0               23          0           2  

[1 rows x 21 columns]


### 3.11 Character Counts

In [178]:
# Count digits
def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits

df['count_digits'] = df['url'].apply(lambda i: digit_count(i))

# Count letters
def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters = letters + 1
    return letters

df['count_letters'] = df['url'].apply(lambda i: letter_count(i))

In [179]:
print(df.head())

                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  ...  \
0        2          0        0          0                   0        0  ...   

   count%  count?  count-  count=  url_length  hostname_length  fd_length  \
0       0       0       1       0          32               23          0   

   tld_length  count_digits  count_letters  
0           2             0             25  

[1 rows x 23 columns]


## Scale features

**Example error**

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- google_index
- url

In [180]:
print(df)

                                url  use_of_ip  abnormal_url  google_index  \
0   https://material-badge.surge.sh          0             1             1   

   count_.  count_www  count_@  count_dir  count_embed_domain  sus_url  ...  \
0        2          0        0          0                   0        0  ...   

   count%  count?  count-  count=  url_length  hostname_length  fd_length  \
0       0       0       1       0          32               23          0   

   tld_length  count_digits  count_letters  
0           2             0             25  

[1 rows x 23 columns]


In [181]:
df_to_scale = df[scaler.feature_names_in_]

while True:
    try:
        scaled_data = scaler.transform(df_to_scale)
        break
    except ValueError as e:
        msg = str(e)

        if "Feature names unseen at fit time" in msg:
            start = msg.find("Feature names unseen at fit time:") + len("Feature names unseen at fit time:")
            end = msg.find("\n", start)
            unseen_cols = msg[start:end].strip().replace("-", "").split()
            unseen_cols = [c.strip() for c in unseen_cols if c.strip()]
            print("Removing unused columns:", unseen_cols)
            df_to_scale = df_to_scale.drop(columns=unseen_cols, errors='ignore')
        else:
            raise e

print(scaled_data)


[[-0.18181818  1.25723711 -0.20169335 -0.44073589 -0.03875891 -1.43864441
  -0.04569117 -0.25819889 -0.08920958 -0.49589195 -0.14732529 -0.38491076
  -0.60885264  2.90423502  1.20405461  1.55701551 -0.24983394 -0.77383053
   0.65910167 -0.5537983  -0.56834967]]


## Predict

### Validate check

In [182]:
try:
  check_is_fitted(model)
except Exception as e:
  print(e)

  # model.fit(X_train, y_train)

In [183]:
print(model)

numeric_pred = model.predict(scaled_data)
print(numeric_pred)

SVC(probability=True, random_state=42)
[0]


In [184]:
label_pred = lebel.inverse_transform(numeric_pred)
print(label_pred)

[False]
