In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Input files: the main labelled dataset and a separate validation file.
TRAIN_CSV, VALID_CSV = 'model_data.csv', 'valid_data.csv'

data = pd.read_csv(TRAIN_CSV)
# Every value of the validation 'url' column is passed through str().
val_data = pd.read_csv(VALID_CSV).assign(url=lambda frame: frame['url'].apply(str))

In [None]:
# Share of each label in the main dataset (captions: label 0 = normal, label 1 = phishing).
label_counts = data["Label"].value_counts()
n_rows = len(data)
print(f'정상 url 비율 = {round(label_counts[0]/n_rows * 100,3)}%')
print(f'피싱 url 비율 = {round(label_counts[1]/n_rows * 100,3)}%')

In [None]:
# Share of each label in the validation file (captions: label 0 = normal, label 1 = phishing).
val_label_counts = val_data["Label"].value_counts()
n_val_rows = len(val_data)
print(f'정상 url 비율 = {round(val_label_counts[0]/n_val_rows * 100,3)}%')
print(f'피싱 url 비율 = {round(val_label_counts[1]/n_val_rows * 100,3)}%')

In [None]:
# Independent variable ('url' column) and dependent variable ('Label' column) of the main dataset.
X_data, y_data = data['url'], data['Label']

# Validation data: shuffle all rows with a fixed seed and renumber the index from 0.
shuffled_val_data = val_data.sample(frac=1, random_state=0).reset_index(drop=True)
X_val, y_val = shuffled_val_data['url'], shuffled_val_data['Label']

In [None]:
# Seeded 80/20 train/test split of the main dataset, stratified on the label.
split_options = dict(test_size=0.2, random_state=0, stratify=y_data)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, **split_options)

In [None]:
# Report the shape of each (features, labels) pair.
for _caption, _features, _labels in (
    ("훈련 데이터 크기:", X_train, y_train),
    ("검증 데이터 크기:", X_val, y_val),
    ("테스트 데이터 크기:", X_test, y_test),
):
    print(_caption, _features.shape, _labels.shape)

In [None]:
# Percentage of label 0 and label 1 in the train, test and validation targets.
for _banner, _labels in (
    ('--------훈련 데이터의 비율-----------', y_train),
    ('--------테스트 데이터의 비율-----------', y_test),
    ('--------검증 데이터의 비율-----------', y_val),
):
    _counts = _labels.value_counts()
    _total = len(_labels)
    print(_banner)
    print(f'정상 url = {round(_counts[0]/_total * 100,3)}%')
    print(f'스팸 url = {round(_counts[1]/_total * 100,3)}%')

In [None]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

# Split a URL into its component tokens.
def tokenize_url(url):
    """Break a URL into scheme, host labels and path segments.

    Only the scheme, the network location and the path are used; the query
    string and the fragment are not tokenized.

    Args:
        url: The URL as a ``str``.

    Returns:
        list[str]: The non-empty tokens, in the order scheme, dot-separated
        network-location parts, slash-separated path parts.
    """
    try:
        parsed_url = urlparse(url)
        scheme, netloc, path = parsed_url.scheme, parsed_url.netloc, parsed_url.path
    except ValueError:
        # urlparse rejects some malformed authorities (for example an
        # unbalanced '[' in the host). Such URLs must not abort a whole
        # tokenization pass, so fall back to a plain manual split.
        head, separator, remainder = url.partition('//')
        if not separator:
            head, remainder = '', url
        scheme = head.rstrip(':').lower()
        # Drop the query string and fragment, as on the normal path.
        remainder = remainder.split('?', 1)[0].split('#', 1)[0]
        netloc, _, path = remainder.partition('/')

    # Combine the components into one list.
    url_parts = [scheme, *netloc.split('.'), *path.split('/')]

    # Remove empty strings (missing scheme, leading/trailing/double slashes).
    return [part for part in url_parts if part]

# Turn every training URL into a single space-separated string of its tokens.
X_train_list = [' '.join(tokenize_url(url)) for url in X_train]

# Create the tokenizer and fit it on the training strings.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_list)

# Save the fitted tokenizer to disk.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# Convert each string into its sequence of integer indices.
X_train_encoded = tokenizer.texts_to_sequences(X_train_list)

print("토큰화된 URL 구성 요소:", X_train_encoded[:5])

In [None]:
# Token -> integer index mapping learned by the tokenizer.
word_to_index = tokenizer.word_index

# Displaying the whole mapping would flood the cell output for a large
# vocabulary, so show its size and only the first few entries.
print('vocabulary entries:', len(word_to_index))
dict(list(word_to_index.items())[:20])

In [None]:
threshold = 2  # tokens that occur fewer times than this are counted as rare

# tokenizer.word_counts maps each token to its number of occurrences.
token_counts = tokenizer.word_counts
rare_counts = [count for count in token_counts.values() if count < threshold]

total_cnt = len(word_to_index)           # number of tokens in the vocabulary
rare_cnt = len(rare_counts)              # number of tokens below the threshold
total_freq = sum(token_counts.values())  # total occurrences of all tokens
rare_freq = sum(rare_counts)             # total occurrences of the rare tokens

print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합(vocabulary)에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# One more than the number of known tokens (Keras word indices start at 1; index 0 is used for padding).
vocab_size = 1 + len(word_to_index)
print('단어 집합의 크기: {}'.format(vocab_size))

In [None]:
# Length (number of indices) of every encoded training URL.
encoded_lengths = [len(sample) for sample in X_train_encoded]
print('url 최대 길이 : %d' % max(encoded_lengths))
print('url 평균 길이 : %f' % (sum(encoded_lengths)/len(encoded_lengths)))

In [None]:
# Every encoded URL is padded/truncated to this fixed length before it reaches the model.
max_len = 560
X_train_padded = pad_sequences(
    X_train_encoded,
    maxlen=max_len,
)
print("훈련 데이터의 크기(shape):", X_train_padded.shape)

In [None]:
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Hyperparameters.
embedding_dim = 32
dropout_ratio = 0.3
lstm_units = 64   # number of LSTM units
# Not used by the LSTM model below; they belong to an alternative Conv1D
# architecture and are kept so that later code referring to them still works.
num_filters = 32
kernel_size = 5

from tensorflow.keras.layers import LSTM

# Embedding -> LSTM -> Dropout -> sigmoid output for binary classification.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(lstm_units, return_sequences=False))
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))
# Compile once (the original compiled the same model twice with identical arguments).
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop when val_loss has not improved for 3 epochs. restore_best_weights=True
# makes `model` keep the weights of the best monitored epoch instead of the
# last one, because `model` (not the checkpoint file) is used for evaluation.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3, restore_best_weights=True)
# Additionally save the epoch with the highest validation accuracy to disk.
mc = ModelCheckpoint('best_model.keras', monitor = 'val_acc', mode='max', verbose=1, save_best_only=True)

model.summary()

# validation_split=0.2 holds out part of the training arrays for validation.
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[es, mc])

Epoch 1/5
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - acc: 0.9294 - loss: 0.1849
Epoch 1: val_acc improved from -inf to 0.97750, saving model to best_model.keras
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m412s[0m 141ms/step - acc: 0.9294 - loss: 0.1849 - val_acc: 0.9775 - val_loss: 0.0709
Epoch 2/5
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - acc: 0.9906 - loss: 0.0298
Epoch 2: val_acc improved from 0.97750 to 0.98239, saving model to best_model.keras
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 139ms/step - acc: 0.9906 - loss: 0.0298 - val_acc: 0.9824 - val_loss: 0.0621
Epoch 3/5
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - acc: 0.9984 - loss: 0.0064
Epoch 3: val_acc did not improve from 0.98239
[1m2900/2900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 140ms/step - acc: 0.9984 - loss: 0.0064 - val_acc: 0.9766 - val_loss: 0

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# 0. Encode the test URLs with the same pipeline as the training URLs
#    (tokenize -> join -> fitted tokenizer -> pad to max_len). Without this
#    step X_test_padded is undefined on a fresh kernel.
X_test_list = [' '.join(tokenize_url(url)) for url in X_test]
X_test_encoded = tokenizer.texts_to_sequences(X_test_list)
X_test_padded = pad_sequences(X_test_encoded, maxlen=max_len)

# 1. Predict on the test data (probabilities)
y_pred_proba = model.predict(X_test_padded).flatten()  # predicted probabilities

# 2. Binary decisions (threshold 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# 3. Compute the metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)  # ROC AUC is computed from the probabilities

# 4. Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_url(url, tokenizer, max_len=560):
    """Convert one URL into a padded index sequence.

    Args:
        url: The URL to encode.
        tokenizer: Object providing ``texts_to_sequences``.
        max_len: Target sequence length passed to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` for the single encoded URL.
    """
    # Space-separated token string for this URL.
    url_text = ' '.join(tokenize_url(url))

    # One-element batch of integer index sequences.
    sequences = tokenizer.texts_to_sequences([url_text])

    # Pad/truncate to the fixed length.
    return pad_sequences(sequences, maxlen=max_len)

In [None]:
def tokenize_url(url):
    """Break a URL into scheme, host labels and path segments.

    Only the scheme, the network location and the path are used; the query
    string and the fragment are not tokenized.

    Args:
        url: The URL as a ``str``.

    Returns:
        list[str]: The non-empty tokens, in the order scheme, dot-separated
        network-location parts, slash-separated path parts.
    """
    from urllib.parse import urlparse
    try:
        parsed_url = urlparse(url)
        scheme, netloc, path = parsed_url.scheme, parsed_url.netloc, parsed_url.path
    except ValueError:
        # urlparse rejects some malformed authorities (for example an
        # unbalanced '[' in the host). A single bad URL must not crash
        # tokenization, so fall back to a plain manual split.
        head, separator, remainder = url.partition('//')
        if not separator:
            head, remainder = '', url
        scheme = head.rstrip(':').lower()
        # Drop the query string and fragment, as on the normal path.
        remainder = remainder.split('?', 1)[0].split('#', 1)[0]
        netloc, _, path = remainder.partition('/')

    # Split the components and build one list.
    url_parts = [scheme, *netloc.split('.'), *path.split('/')]

    # Remove empty strings (missing scheme, leading/trailing/double slashes).
    return [part for part in url_parts if part]

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from scipy.sparse import hstack

# Vectorize the raw URL text with TF-IDF (vocabulary fitted on the training URLs only).
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the LightGBM model.
lgbm_params = dict(
    max_depth=5,
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    random_state=0,
)
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train_tfidf, y_train)

# Predict on the test data: probability of class 1, then a 0/1 decision at threshold 0.5.
lgbm_proba = lgbm_model.predict_proba(X_test_tfidf)[:, 1]
lgbm_pred = (lgbm_proba >= 0.5).astype(int)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import numpy as np

# 1. LSTM predicted probabilities on the test set
lstm_proba = model.predict(X_test_padded).flatten()

# 2. LightGBM predicted probabilities (class 1) on the test set
lgbm_proba = lgbm_model.predict_proba(X_test_tfidf)[:, 1]

# 3. Build the stacking features: column 0 = LightGBM, column 1 = LSTM
# NOTE(review): the training meta-features are predictions of the base models
# on the very data they were fitted on, so they are optimistic. Consider
# out-of-fold predictions or a held-out set for fitting the meta model.
stacked_train = np.column_stack((lgbm_model.predict_proba(X_train_tfidf)[:, 1],
                                 model.predict(X_train_padded).flatten()))
stacked_test = np.column_stack((lgbm_proba, lstm_proba))

# 4. Train the XGBoost meta model
# The original passed `g=0`, which is not an XGBoost parameter; an explicit
# seed is used instead, matching the other seeded models in this notebook.
meta_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)
meta_model.fit(stacked_train, y_train)

# 5. Meta model predictions
stacked_pred = meta_model.predict(stacked_test)
stacked_proba = meta_model.predict_proba(stacked_test)[:, 1]

# 6. Evaluation
accuracy = accuracy_score(y_test, stacked_pred)
f1 = f1_score(y_test, stacked_pred)
precision = precision_score(y_test, stacked_pred)
recall = recall_score(y_test, stacked_pred)
roc_auc = roc_auc_score(y_test, stacked_proba)

print(f"XGBoost Stacking - Accuracy: {accuracy:.4f}")
print(f"XGBoost Stacking - F1 Score: {f1:.4f}")
print(f"XGBoost Stacking - Precision: {precision:.4f}")
print(f"XGBoost Stacking - Recall: {recall:.4f}")
print(f"XGBoost Stacking - ROC AUC: {roc_auc:.4f}")

[1m1813/1813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 32ms/step
[1m7250/7250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 33ms/step
XGBoost Stacking - Accuracy: 0.9865
XGBoost Stacking - F1 Score: 0.9859
XGBoost Stacking - Precision: 0.9934
XGBoost Stacking - Recall: 0.9786
XGBoost Stacking - ROC AUC: 0.9976
