In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 1. Load the data
raw_urls = pd.read_csv("Malicious-URLs.csv")
# Keep only the first row for every distinct URL, then discard rows with any missing value.
unique_urls = raw_urls.drop_duplicates(subset="url")
data = unique_urls.dropna()
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,phishing
1,1,mp3raid.com/music/krizz_kaliko.html,benign
2,2,bopsecrets.org/rexroth/cr/1.htm,benign
3,3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...,...
14995,14995,fortdefianceind.com/,benign
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,benign
14997,14997,burbankairport.com/parking/buses-trains.html,benign
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,benign


In [3]:
# 2. Encode the labels
# Fit the encoder on the label column and replace that column with the resulting integer codes.
# NOTE: the column is overwritten in place, so re-running this cell alone would fit the
# encoder on the integer codes instead of the original class names.
le = LabelEncoder()
encoded = le.fit_transform(data['label'])
data['label'] = encoded
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [4]:
# 3. Tokenization parameters
# Upper bound on the number of tokens (used as the tokenizer's num_words and the embedding's input_dim).
MAX_WORDS = 10_000
# Length every encoded URL is padded or truncated to.
MAX_LEN = 150


In [5]:
# 4. Character-level tokenization of the URLs
tokenizer_options = {
    "num_words": MAX_WORDS,
    "filters": '',       # empty filter string, so no characters are removed
    "lower": True,       # URLs are lower-cased before tokenizing
    "char_level": True,  # every character is treated as a token
}
tokenizer = Tokenizer(**tokenizer_options)
# Build the character vocabulary from the URL column.
tokenizer.fit_on_texts(data['url'])
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [6]:
# 5. Convert the text into sequences
# Map every URL to a list of token indices using the fitted tokenizer.
url_texts = data['url']
sequences = tokenizer.texts_to_sequences(url_texts)
# Bring every sequence to MAX_LEN entries so the inputs form one rectangular array.
X = pad_sequences(sequences, maxlen=MAX_LEN)
# One-hot encode the integer class codes to match the softmax output of the model.
label_codes = data['label']
y = tf.keras.utils.to_categorical(label_codes)

In [7]:
# 6. Train/test split
# Hold out 20% of the rows for testing. The split is stratified on the encoded label column
# and uses a fixed random_state so it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# 7. Build the LSTM model
# Embedding -> two stacked LSTM layers (the first returns the full sequence so the second
# can consume it) -> dense hidden layer -> softmax over the label classes.
num_classes = len(le.classes_)
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [17]:
# 8. Train the model
# Automatic class weighting: 'balanced' weights are inversely proportional to class frequency,
# so errors on the rare classes count more in the loss than errors on the majority class.
label_codes = data['label']
label_values = np.unique(label_codes)
balanced_weights = compute_class_weight('balanced', classes=label_values, y=label_codes)
# Key the mapping by the actual class values and convert to plain Python numbers,
# giving the {class_index: weight} dictionary that model.fit accepts.
class_weights = {
    int(label_value): float(weight)
    for label_value, weight in zip(label_values, balanced_weights)
}

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=128,
    validation_split=0.2,  # hold out 20% of the training rows for validation
    class_weight=class_weights,
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [18]:
# 9. Evaluate the model
y_pred = model.predict(X_test)
# Targets are one-hot rows and predictions are per-class scores; argmax turns both into class indices.
true_classes = np.argmax(y_test, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)
report = classification_report(
    true_classes,
    predicted_classes,
    target_names=le.classes_,
)
print(report)


              precision    recall  f1-score   support

      benign       0.98      0.92      0.95      2199
  defacement       0.86      0.84      0.85       545
     malware       0.24      0.72      0.36        61
    phishing       0.86      0.92      0.89       182

    accuracy                           0.90      2987
   macro avg       0.73      0.85      0.76      2987
weighted avg       0.93      0.90      0.91      2987



In [22]:
# 10. Prediction helper
def predict_url_type(url):
    """Classify a single URL with the trained model.

    Args:
        url: URL text to classify.

    Returns:
        dict: on success, a mapping with the keys "url" (the input), "predicted_class"
        (the label with the highest predicted probability) and "probabilities"
        (label -> probability rounded to three decimals). If any exception is raised
        along the way, ``{"error": <exception message>}`` is returned instead.
    """
    try:
        # Encode the URL the same way as the training data: tokenize, then pad to MAX_LEN.
        encoded = tokenizer.texts_to_sequences([url])
        model_input = pad_sequences(encoded, maxlen=MAX_LEN)

        # The batch contains a single row, so take its only prediction.
        class_probabilities = model.predict(model_input)[0]
        best_index = np.argmax(class_probabilities)
        predicted_label = le.inverse_transform([best_index])[0]
        rounded = np.round(class_probabilities, 3)
        result = {
            "url": url,
            "predicted_class": predicted_label,
            "probabilities": dict(zip(le.classes_, rounded)),
        }
    except Exception as e:
        # Best effort: report the failure to the caller instead of raising.
        return {"error": str(e)}
    return result

# Example usage
sample_urls = (
    "http://100linksdvgpn.avafedors.freedomain.thehost.com.ua",
    "https://kaggle.com",
    'https://web.telegram.org',
)
for sample_url in sample_urls:
    print(predict_url_type(sample_url))

{'url': 'http://100linksdvgpn.avafedors.freedomain.thehost.com.ua', 'predicted_class': 'malware', 'probabilities': {'benign': 0.143, 'defacement': 0.165, 'malware': 0.658, 'phishing': 0.034}}
{'url': 'https://kaggle.com', 'predicted_class': 'benign', 'probabilities': {'benign': 0.844, 'defacement': 0.059, 'malware': 0.09, 'phishing': 0.007}}
{'url': 'https://web.telegram.org', 'predicted_class': 'benign', 'probabilities': {'benign': 0.505, 'defacement': 0.139, 'malware': 0.347, 'phishing': 0.008}}


In [13]:
import joblib

# Save the trained model to disk.
# NOTE(review): Keras provides model.save() for storing models — confirm that a joblib dump
# reloads correctly with the TensorFlow version in use.
joblib.dump(model, 'url_classifier_LSTM.joblib')

['url_classifier_LSTM.joblib']

In [14]:
import pickle

# Write the fitted tokenizer and label encoder to pickle files.
artifacts = (
    ('tokenizer.pkl', tokenizer),
    ('label_encoder.pkl', le),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)