In [1]:
# Google Drive 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import pandas as pd
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from scipy.sparse import save_npz, load_npz
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [5]:


# URL 전처리 함수
def preprocess_url(url):
    # URL 파싱
    parsed_url = urlparse(url)
    protocol = parsed_url.scheme
    domain = parsed_url.netloc
    path = parsed_url.path
    parameters = parsed_url.params
    fragment = parsed_url.fragment

    # 토큰화
    tokenized_protocol = word_tokenize(protocol)
    tokenized_domain = word_tokenize(domain)
    tokenized_path = word_tokenize(path)
    tokenized_parameters = word_tokenize(parameters)
    tokenized_fragment = word_tokenize(fragment)

    # 정규화
    normalized_domain = domain.lower()
    normalized_domain = re.sub(r'[^a-zA-Z]', '', normalized_domain)  # 특수 문자 및 숫자 제거
    if normalized_domain.startswith('www'):  # 접두사 "www." 제거
        normalized_domain = normalized_domain[4:]

    # 피처 추출
    domain_length = len(normalized_domain)
    protocol_type = protocol if protocol else "Unknown"

    # 결과 반환
    return ' '.join(tokenized_protocol + tokenized_domain + tokenized_path +
                    tokenized_parameters + tokenized_fragment + [normalized_domain, str(domain_length), protocol_type])


# CSV 파일 읽기
data = pd.read_csv('/content/drive/MyDrive/URL_Data.csv')

# 필요한 열 선택 및 중복 제거
data = data[['v1', 'v2']]
data['v1'] = data['v1'].replace(['ham', 'spam'], [0, 1])
data.drop_duplicates(subset=['v2'], inplace=True)
print('중복 제거 후의 전체 샘플 수:', len(data))

# 전처리 수행
preprocessed_data = []
for url in data['v2']:
    preprocessed_url = preprocess_url(url)
    preprocessed_data.append(preprocessed_url)

# 레이블 가져오기
labels = data['v1']

# 단어 카운트 벡터 생성
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# CountVectorizer 저장
with open('/content/drive/MyDrive/vectorizer.joblib', 'wb') as f:
    pickle.dump(vectorizer, f)

# 전처리된 데이터와 레이블 저장
save_npz('/content/drive/MyDrive/X.npz', X)  # 희소 행렬을 .npz 형식으로 저장
np.save('/content/drive/MyDrive/labels.npy', labels.values)  # 레이블을 .npy 형식으로 저장



중복 제거 후의 전체 샘플 수: 47640


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
from scipy.sparse import load_npz

# 희소 행렬 데이터와 레이블 로드
X = load_npz('/content/drive/MyDrive/X.npz')
labels = np.load('/content/drive/MyDrive/labels.npy', allow_pickle=True)

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# RandomForest 분류기 인스턴스화 및 모델 학습
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
predictions = rf_classifier.predict(X_test)

# 모델 성능 평가
print("정확도:", accuracy_score(y_test, predictions))
print("\n분류 리포트:\n", classification_report(y_test, predictions))

# 모델 저장
with open('/content/drive/MyDrive/rf_classifier.joblib', 'wb') as f:
    pickle.dump(rf_classifier, f)

정확도: 0.9495172124265323

분류 리포트:
               precision    recall  f1-score   support

           0       0.91      0.99      0.95      4542
           1       0.99      0.91      0.95      4986

    accuracy                           0.95      9528
   macro avg       0.95      0.95      0.95      9528
weighted avg       0.95      0.95      0.95      9528

