In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import re
from urllib.parse import urlparse
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
# 1. Загрузка и подготовка данных
data = pd.read_csv("Malicious-URLs.csv")  # Предполагаем колонки: url, label

# 2. Преобразование меток в числовой формат
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [36]:
#Удаление дубликатов
data = data.drop_duplicates(subset="url").dropna()

In [41]:
# 3. Функция для токенизации URL
def tokenize_url(url):
    try:
        # Удаление протокола и разделение URL на части
        parsed = urlparse(url)
        path = parsed.path
        query = parsed.query
        tokens = []
        
        # Разделение домена на части
        if parsed.netloc:
            tokens += re.split(r'[\.\-]', parsed.netloc)
            
        # Разделение пути и параметров
        tokens += re.split(r'[/&=?]', path + " " + query)
        
        # Фильтрация и очистка токенов
        tokens = [t.lower().strip() for t in tokens if t.strip() != '']
        return ' '.join(tokens)
    
    except:
        return ''


In [42]:
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [43]:
# 4. Применение токенизации
data['tokens'] = data['url'].apply(tokenize_url)


In [11]:
# 5. TF-IDF векторизация
tfidf = TfidfVectorizer(
    max_features=15000,  # Ограничение количества фичей
    ngram_range=(1, 15),  # Учитываем 1-граммы и 2-граммы
    stop_words='english'
)
X = tfidf.fit_transform(data['tokens'])
y = data['label']
data['label']

0        3
1        0
2        0
3        1
4        1
        ..
14995    0
14996    0
14997    0
14998    0
14999    0
Name: label, Length: 14933, dtype: int32

In [12]:
X

<14933x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 181023 stored elements in Compressed Sparse Row format>

In [13]:
# 6. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,
    random_state=42
)

In [14]:
# 7. Обучение XGBoost
model = XGBClassifier(
    objective='multi:softprob',
    num_class=4,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    n_estimators=200,
    random_state=42
)

model.fit(X_train, y_train)

In [15]:
# 8. Оценка модели
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nROC-AUC (OvR):", roc_auc_score(
    y_test, 
    y_proba, 
    multi_class='ovr',
    average='macro'
))


Classification Report:
              precision    recall  f1-score   support

      benign       0.93      0.99      0.96      2199
  defacement       0.96      0.90      0.93       545
     malware       0.95      0.67      0.79        61
    phishing       0.85      0.39      0.53       182

    accuracy                           0.93      2987
   macro avg       0.92      0.74      0.80      2987
weighted avg       0.93      0.93      0.93      2987


ROC-AUC (OvR): 0.966459995733152


In [18]:
# 9. Функция для предсказания
def predict_url_type(url):
    try:
        # Токенизация
        tokens = tokenize_url(url)
        
        # Векторизация
        X_new = tfidf.transform([tokens])
        
        # Предсказание
        proba = model.predict_proba(X_new)[0]
        pred_class = le.inverse_transform([np.argmax(proba)])[0]
        
        return {
            "url": url,
            "predicted_class": pred_class,
            "probabilities": dict(zip(le.classes_, proba.round(3)))
        }
    except Exception as e:
        return {"error": str(e)}

# 10. Пример использования
test_urls = [
    "http://example.com/login.php?user=admin",
    "http://youtube.net.ua",
    "http://phishingsite.com/verify-account",
    "http://100linksdvgpn.avafedors.freedomain.thehost.com.ua",
    "https://github.com/Sanyapushka8/URL/blob/main/URL6.ipynb",
    "https://web.telegram.org/k/",
    "https://kaggle.com"
]

for url in test_urls:
    print(predict_url_type(url))
    print("-" * 50)

{'url': 'http://example.com/login.php?user=admin', 'predicted_class': 'phishing', 'probabilities': {'benign': 0.375, 'defacement': 0.007, 'malware': 0.002, 'phishing': 0.616}}
--------------------------------------------------
{'url': 'http://youtube.net.ua', 'predicted_class': 'benign', 'probabilities': {'benign': 0.824, 'defacement': 0.034, 'malware': 0.026, 'phishing': 0.115}}
--------------------------------------------------
{'url': 'http://phishingsite.com/verify-account', 'predicted_class': 'phishing', 'probabilities': {'benign': 0.356, 'defacement': 0.007, 'malware': 0.004, 'phishing': 0.633}}
--------------------------------------------------
{'url': 'http://100linksdvgpn.avafedors.freedomain.thehost.com.ua', 'predicted_class': 'defacement', 'probabilities': {'benign': 0.418, 'defacement': 0.449, 'malware': 0.01, 'phishing': 0.123}}
--------------------------------------------------
{'url': 'https://github.com/Sanyapushka8/URL/blob/main/URL6.ipynb', 'predicted_class': 'benign'

In [28]:
import joblib
#save model
joblib.dump(XGBClassifier, 'url_classifier.joblib') 

['url_classifier.joblib']

In [33]:
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenize_url, file) 
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(le, file)