In [13]:
!pip list

Package                      Version
---------------------------- --------------------
absl-py                      2.2.2
aiofiles                     24.1.0
aiogram                      3.20.0.post0
aiohappyeyeballs             2.6.1
aiohttp                      3.11.18
aiosignal                    1.3.2
annotated-types              0.7.0
anyio                        4.9.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.5
async-timeout                5.0.1
attrs                        25.3.0
babel                        2.17.0
beautifulsoup4               4.13.3
bleach                       6.2.0
cachetools                   5.5.2
certifi                      2025.1.31
cffi                         1.17.1
charset-normalizer           3.4.1
click                        8.1.8
colorama                     0.4.6
comm              


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# 1. Загрузка данных
data = pd.read_csv("Malicious-URLs.csv") 
#Удаление дубликатов
data = data.drop_duplicates(subset="url").dropna()
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,phishing
1,1,mp3raid.com/music/krizz_kaliko.html,benign
2,2,bopsecrets.org/rexroth/cr/1.htm,benign
3,3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...,...
14995,14995,fortdefianceind.com/,benign
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,benign
14997,14997,burbankairport.com/parking/buses-trains.html,benign
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,benign


In [2]:
# 2. Преобразование меток
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [3]:
# 3. Параметры токенизации
MAX_WORDS = 10000  # Максимальное количество токенов
MAX_LEN = 150      # Максимальная длина последовательности


In [4]:
# 4. Токенизация URL на уровне символов
tokenizer = Tokenizer(
    num_words=MAX_WORDS, 
    filters='', 
    lower=True, 
    char_level=True  # Токенизация по символам
)
tokenizer.fit_on_texts(data['url'])
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [5]:
# 5. Преобразование текста в последовательности
sequences = tokenizer.texts_to_sequences(data['url'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = tf.keras.utils.to_categorical(data['label'])
data

Unnamed: 0,count,url,label
0,0,br-icloud.com.br,3
1,1,mp3raid.com/music/krizz_kaliko.html,0
2,2,bopsecrets.org/rexroth/cr/1.htm,0
3,3,http://www.garage-pirenne.be/index.php?option=...,1
4,4,http://adventure-nicaragua.net/index.php?optio...,1
...,...,...,...
14995,14995,fortdefianceind.com/,0
14996,14996,bookrags.com/lens/go.php?u=Hannah_Webster_Foster,0
14997,14997,burbankairport.com/parking/buses-trains.html,0
14998,14998,http://otomoto.pl/oferta/subaru-outback-legacy...,0


In [6]:
# 6. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=data['label'],
    random_state=42
)

In [7]:
# 7. Создание модели LSTM
model = Sequential()
model.add(Embedding(
    input_dim=MAX_WORDS, 
    output_dim=128, 
    input_length=MAX_LEN
))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)




In [25]:
# 8. Обучение модели
from sklearn.utils.class_weight import compute_class_weight
history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=128,
    validation_split=0.2,
    class_weight=dict(zip(  # Автовзвешивание классов
        range(len(le.classes_)), 
        compute_class_weight('balanced', classes=np.unique(data['label']), y=data['label'])
    ))
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [26]:
# 9. Оценка модели
y_pred = model.predict(X_test)
print(classification_report(
    np.argmax(y_test, axis=1), 
    np.argmax(y_pred, axis=1),
    target_names=le.classes_
))


              precision    recall  f1-score   support

      benign       0.99      0.86      0.92      2199
  defacement       0.72      0.83      0.77       545
     malware       0.25      0.64      0.36        61
    phishing       0.61      0.94      0.74       182

    accuracy                           0.86      2987
   macro avg       0.64      0.82      0.70      2987
weighted avg       0.90      0.86      0.87      2987



In [34]:
# 10. Функция для предсказания
def predict_url_type(url):
    try:
        # Токенизация
        sequence = tokenizer.texts_to_sequences([url])
        padded = pad_sequences(sequence, maxlen=MAX_LEN)
        
        # Предсказание
        proba = model.predict(padded)[0]
        return {
            "url": url,
            "predicted_class": le.inverse_transform([np.argmax(proba)])[0],
            "probabilities": dict(zip(le.classes_, np.round(proba, 3)))
        }
    except Exception as e:
        return {"error": str(e)}

# Пример использования
print(predict_url_type("http://100linksdvgpn.avafedors.freedomain.thehost.com.ua"))
print(predict_url_type("https://kaggle.com"))
print(predict_url_type('https://web.telegram.org'))

{'url': 'http://100linksdvgpn.avafedors.freedomain.thehost.com.ua', 'predicted_class': 'phishing', 'probabilities': {'benign': 0.039, 'defacement': 0.093, 'malware': 0.342, 'phishing': 0.527}}
{'url': 'https://kaggle.com', 'predicted_class': 'benign', 'probabilities': {'benign': 0.927, 'defacement': 0.063, 'malware': 0.006, 'phishing': 0.004}}
{'url': 'https://web.telegram.org', 'predicted_class': 'benign', 'probabilities': {'benign': 0.851, 'defacement': 0.112, 'malware': 0.031, 'phishing': 0.006}}


In [15]:
model.save('url_classifier_LSTM.keras')
model.compiled_metrics == None

  return saving_lib.save_model(model, filepath)


False

In [10]:
from keras.models import load_model

m1 = load_model('url_classifier_LSTM.keras')

In [14]:
with open('tokenizer.pkl', 'wb') as out:
    pickle.dump(tokenizer, out)

In [16]:
with open('LabelEncoder.pkl', 'wb') as out:
    pickle.dump(le, out)