In [4]:
import ipaddress
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tldextract
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier, plot_importance
warnings.filterwarnings("ignore")

In [5]:
# Read the raw URL dataset from disk
data = pd.read_csv("Malicious-URLs.csv")

# Keep one row per URL first, then discard rows that still contain missing values
data = data.drop_duplicates(subset="url")
data = data.dropna()

# Show how many samples each class has
print("Распределение классов:", data["label"].value_counts(), sep="\n")

Распределение классов:
label
benign        10991
defacement     2727
phishing        912
malware         303
Name: count, dtype: int64


In [6]:
def _host_is_ip(ext):
    """Return True when the extracted host is an IP address rather than a domain name."""
    # Brackets are stripped in case the extractor hands back a bracketed IPv6 literal.
    host = ext.domain.strip("[]")
    try:
        ipaddress.ip_address(host)
        return True
    except ValueError:
        # A bare integer host with no public suffix (e.g. http://3232235777/)
        # is a decimal-encoded IPv4 address; a numeric label under a real
        # suffix (e.g. 163.com) is an ordinary domain and must not be flagged.
        return not ext.suffix and host.isdigit()


def extract_features(url):
    """Build the lexical, domain and keyword features for a single URL.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    dict or None
        Mapping with the keys ``url_length``, ``num_special_chars``,
        ``subdomain_count``, ``domain_length``, ``has_ip`` and
        ``keyword_count``. ``None`` is returned (after printing the error)
        when any step raises, so callers can filter out unusable rows.
    """
    features = {}
    try:
        # Lexical features
        features["url_length"] = len(url)
        features["num_special_chars"] = sum(url.count(c) for c in ['@', '%', '//', '?', '='])

        # Domain analysis
        ext = tldextract.extract(url)
        features["subdomain_count"] = len(ext.subdomain.split('.')) if ext.subdomain else 0
        features["domain_length"] = len(ext.domain)
        # Previously any all-digit label counted as an IP, which mislabelled
        # numeric domain names; the host is now actually parsed as an address.
        features["has_ip"] = 1 if _host_is_ip(ext) else 0

        # Keyword features: number of suspicious substrings present in the URL
        keywords = ['login', 'admin', 'exe', 'php', 'config', 'secure']
        features["keyword_count"] = sum(1 for kw in keywords if kw in url)
        return features
    except Exception as e:
        # Best-effort: report the bad URL and let the caller drop it.
        print(f"Ошибка обработки URL {url}: {e}")
        return None

# Run the extractor on every URL; rows where it failed come back as None
features = data["url"].apply(extract_features)
valid_indices = features[features.notna()].index
data = data.loc[valid_indices]
features = features.loc[valid_indices]

# One column per feature, one row per surviving URL (same order as `data`)
features_df = pd.DataFrame(features.tolist())

# Binary target: 0 = benign, 1 = any malicious category.
# The raw `label` column holds several string classes, while every later step
# (scale_pos_weight, predict_proba[:, 1], ROC-AUC, threshold search) assumes a
# 0/1 target, so the labels are collapsed here once.
labels = (data["label"] != "benign").astype(int).values

In [17]:
# Balanced per-class weights, keyed by the class values found in `labels`
classes = np.unique(labels)
class_weights = compute_class_weight("balanced", classes=classes, y=labels)
class_weights_dict = dict(zip(classes, class_weights))

# scale_pos_weight for XGBoost: weight of the positive class relative to the
# negative one. The classes are taken from `classes` (sorted, so for a 0/1
# target this is exactly weight[1] / weight[0]) instead of hard-coded keys,
# which raised KeyError whenever the labels were not already 0/1.
if len(classes) == 2:
    negative_class, positive_class = classes
    scale_pos_weight = class_weights_dict[positive_class] / class_weights_dict[negative_class]
else:
    # The ratio is only defined for two classes; fall back to the neutral value.
    scale_pos_weight = 1.0

TypeError: 'dict' object is not callable

In [8]:
# Hold out 20% of the rows for evaluation, preserving class proportions;
# the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, stratify=labels, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(features_df, labels, **split_options)

In [9]:
# VotingClassifier label-encodes y (0..k-1, in sorted class order) before it
# fits its members, so a class-weight dict handed to those members must be
# keyed by the encoded index. `class_weights` is aligned with
# np.unique(labels), i.e. the same order, so enumerate() yields the right keys.
# For a 0/1 target this is identical to `class_weights_dict`.
encoded_class_weights = dict(enumerate(class_weights))

# Base models
xgb_model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    random_state=42
)

rf_model = RandomForestClassifier(
    class_weight=encoded_class_weights,
    n_estimators=100,
    max_depth=10,
    random_state=42
)

# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0 (default 0) - confirm whether bagging is intended here.
lgbm_model = LGBMClassifier(
    class_weight=encoded_class_weights,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    random_state=42
)

# Soft-voting ensemble: averages the members' predicted class probabilities
ensemble = VotingClassifier(
    estimators=[
        ("xgb", xgb_model),
        ("rf", rf_model),
        ("lgbm", lgbm_model)
    ],
    voting="soft"
)

# Fit the ensemble on the training split
ensemble.fit(X_train, y_train)

NameError: name 'scale_pos_weight' is not defined

In [10]:
# Predictions on the hold-out split
y_pred = ensemble.predict(X_test)
y_proba = ensemble.predict_proba(X_test)[:, 1]  # probability of class 1

# Metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"\nROC-AUC: {roc_auc_score(y_test, y_proba):.2f}")

# Feature importance, using XGBoost as the example model.
# `xgb_model` itself is fitted here, separately from the ensemble, for this analysis.
xgb_model.fit(X_train, y_train)

# Create the axes explicitly and pass them in: without `ax`, plot_importance
# opens its own figure, leaving a blank 10x6 figure behind. The previous
# `xgb.plot_importance` call also failed because `xgb` was never imported.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(xgb_model, ax=ax, importance_type="gain")
ax.set_title("Важность признаков (XGBoost)")
plt.show()

NameError: name 'ensemble' is not defined

In [11]:
def predict_url(url: str, model, threshold: float = 0.5) -> str:
    """Classify a single URL with a fitted probabilistic model.

    Parameters
    ----------
    url : str
        URL to classify.
    model
        Object exposing ``predict_proba``; column 1 of its output is compared
        against ``threshold``.
    threshold : float, default 0.5
        The verdict is "malicious" only when the score is strictly greater.

    Returns
    -------
    str
        The malicious / benign verdict, a fixed message when no features could
        be extracted, or an error message carrying the exception text.
    """
    try:
        extracted = extract_features(url)
        if extracted:
            # Single-row frame, columns in the extractor's key order
            row = pd.DataFrame([extracted])
            score = model.predict_proba(row)[0][1]
            if score > threshold:
                return "Вредоносный"
            return "Доброкачественный"
        return "Ошибка извлечения признаков."
    except Exception as exc:
        # Any failure is reported as text rather than raised
        return f"Ошибка: {exc!s}"

# Usage example: the first URL is expected to come out malicious (with a
# lowered threshold), the second benign (default threshold).
for sample_url, call_options in (
    ("http://phishing.com", {"threshold": 0.4}),
    ("https://google.com", {}),
):
    print(predict_url(sample_url, ensemble, **call_options))

NameError: name 'ensemble' is not defined

In [12]:
# Pick a decision threshold from the precision-recall curve
TARGET_PRECISION = 0.9

precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# `precisions` has one more entry than `thresholds`: the final point
# (precision 1, recall 0) has no threshold. Drop it before indexing, otherwise
# the lookup runs past the end of `thresholds` when only that point qualifies.
threshold_precisions = precisions[:-1]
meets_target = threshold_precisions >= TARGET_PRECISION

if meets_target.any():
    # First (lowest) threshold that reaches the target precision
    best_threshold = thresholds[np.argmax(meets_target)]
    print(f"\nОптимальный порог: {best_threshold:.2f}")
else:
    # Target unreachable: fall back to the most precise threshold available
    best_threshold = thresholds[np.argmax(threshold_precisions)]
    print(
        f"\nТочность {TARGET_PRECISION:.0%} недостижима; "
        f"выбран порог с максимальной точностью: {best_threshold:.2f}"
    )

NameError: name 'y_proba' is not defined