1. URL 관련 특성 추출 함수들

In [37]:
import re
import math
import socket
import string
from urllib.parse import urlparse
from collections import Counter

def url_len(url):
    return len(url)

def url_num_hyphens_dom(url):
    return urlparse(url).netloc.count('-')

def url_num_dom_token(url):
    return len(urlparse(url).netloc.split('.'))

def url_path_len(url):
    return len(urlparse(url).path)

def url_filename_len(url):
    return len(urlparse(url).path.split('/')[-1])

def url_longest_dom_token_len(url):
    tokens = urlparse(url).netloc.split('.')
    return max(len(t) for t in tokens)

def url_average_dom_token_len(url):
    tokens = urlparse(url).netloc.split('.')
    return sum(len(t) for t in tokens) / len(tokens)

def url_tld(url):
    tokens = urlparse(url).netloc.split('.')
    return tokens[-1] if len(tokens) > 1 else ''

def url_domain_len(url):
    return len(urlparse(url).netloc)

def url_hostname_len(url):
    return len(urlparse(url).hostname or '')

def url_num_dots(url):
    return url.count('.')

def url_num_underscores(url):
    return url.count('_')

def url_num_equals(url):
    return url.count('=')

def url_num_slashes(url):
    return url.count('/')

def url_num_dash(url):
    return url.count('-')

def url_num_semicolon(url):
    return url.count(';')

def url_num_at(url):
    return url.count('@')

def url_num_percent(url):
    return url.count('%')

def url_num_plus(url):
    return url.count('+')

def url_query_len(url):
    return len(urlparse(url).query)

def url_num_query_para(url):
    return len(urlparse(url).query.split('&')) if urlparse(url).query else 0

def url_ip_present(url):
    try:
        host = urlparse(url).netloc
        socket.inet_aton(host)
        return 1
    except:
        return 0

def url_entropy(url):
    prob = [n / len(url) for n in Counter(url).values()]
    return -sum(p * math.log2(p) for p in prob)

def url_count_consonants(url):
    consonants = set("bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ")
    return sum(1 for c in url if c in consonants)

def url_num_digits(url):
    return sum(c.isdigit() for c in url)

def url_chinese_present(url):
    """URL에 중국어(한자) 문자가 포함되어 있는지 확인"""
    for char in url:
        if '\u4e00' <= char <= '\u9fff':  # 한자 유니코드 범위
            return 1
    return 0

def url_port(url):
    return urlparse(url).port or 80  # 기본 포트 80


✅ 2. HTML 태그 관련 특성 추출

In [38]:
from bs4 import BeautifulSoup

def html_num_tags(html, tag):
    soup = BeautifulSoup(html, 'html.parser')
    return len(soup.find_all(tag))


In [39]:
def extract_url_features(url):
    return {
        "url_len": url_len(url),
        "url_num_hyphens_dom": url_num_hyphens_dom(url),
        "url_num_dom_token": url_num_dom_token(url),
        "url_path_len": url_path_len(url),
        "url_filename_len": url_filename_len(url),
        "url_longest_dom_token_len": url_longest_dom_token_len(url),
        "url_average_dom_token_len": url_average_dom_token_len(url),
        "url_tld": url_tld(url),
        "url_domain_len": url_domain_len(url),
        "url_hostname_len": url_hostname_len(url),
        "url_num_dots": url_num_dots(url),
        "url_num_underscores": url_num_underscores(url),
        "url_num_equals": url_num_equals(url),
        "url_num_slashes": url_num_slashes(url),
        "url_num_dash": url_num_dash(url),
        "url_num_semicolon": url_num_semicolon(url),
        "url_num_at": url_num_at(url),
        "url_num_percent": url_num_percent(url),
        "url_num_plus": url_num_plus(url),
        "url_query_len": url_query_len(url),
        "url_num_query_para": url_num_query_para(url),
        "url_ip_present": url_ip_present(url),
        "url_entropy": url_entropy(url),
        "url_count_consonants": url_count_consonants(url),
        "url_num_digits": url_num_digits(url),
        "url_chinese_present": url_chinese_present(url),
        "url_port": url_port(url)
    }


✅ HTML 태그 특성 추출 함수

In [49]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()  # tqdm 설정 (진행상황 보기)

# 📌 1. HTML 태그 특성 추출 함수
def extract_html_features(html):
    soup = BeautifulSoup(str(html), 'html.parser')
    
    tags_to_count = [
        'iframe', 'script', 'embed', 'object', 'div', 'head', 'body',
        'form', 'a', 'small', 'span', 'input', 'applet', 'img', 'video', 'audio'
    ]
    
    features = {}
    for tag in tags_to_count:
        features[f"html_num_tags('{tag}')"] = len(soup.find_all(tag))
    
    return features

# 📌 2. CSV 파일 불러오기
df = pd.read_csv('Feature Website.csv',encoding='latin1')

# 컬럼명 확인
print("컬럼 목록:", df.columns.tolist())

# 📌 3. HTML 특성 추출
html_feature_df = df['html_code'].progress_apply(lambda x: pd.Series(extract_html_features(x)))

# 📌 4. label 컬럼 (정답)만 따로 떼어내기
label_col = 'repu'  # 실제 정답 컬럼 이름에 맞게 수정 필요
if label_col not in df.columns:
    raise ValueError(f"정답 컬럼명 '{label_col}'이 존재하지 않습니다. 실제 이름을 확인해 주세요.")

label_df = df[[label_col]]

# 📌 5. 최종 결과 결합
result_df = pd.concat([html_feature_df, label_df], axis=1)

# 📌 6. 저장
result_df.to_csv('Feature Website HTML Processed.csv', index=False)
print("✅ 처리 완료: Feature Website HTML Processed.csv 저장됨")




컬럼 목록: ['html_code', 'repu']


100%|██████████| 40/40 [00:00<00:00, 134.93it/s]

✅ 처리 완료: Feature Website HTML Processed.csv 저장됨





In [56]:
import pandas as pd

# 📌 1. 필요한 컬럼 목록 정의
required_columns = [
    'url_len', 'url_num_hyphens_dom', 'url_num_dom_token',
    'url_path_len', 'url_filename_len', 'url_longest_dom_token_len',
    'url_average_dom_token_len', 'url_tld', 'url_domain_len',
    'url_hostname_len', 'url_num_dots', 'url_num_underscores',
    'url_num_equals', 'url_num_slashes', 'url_num_dash',
    'url_num_semicolon', 'url_num_at', 'url_num_percent', 'url_num_plus',
    'url_query_len', 'url_num_query_para', 'url_ip_present', 'url_entropy',
    'url_count_consonants', 'url_num_digits', 'url_chinese_present',
    'url_port', "html_num_tags('iframe')", "html_num_tags('script')",
    "html_num_tags('embed')", "html_num_tags('object')",
    "html_num_tags('div')", "html_num_tags('head')",
    "html_num_tags('body')", "html_num_tags('form')", "html_num_tags('a')",
    "html_num_tags('small')", "html_num_tags('span')",
    "html_num_tags('input')", "html_num_tags('applet')",
    "html_num_tags('img')", "html_num_tags('video')",
    "html_num_tags('audio')"
]

# 📌 2. 파일 불러오기
df = pd.read_csv('Feature Website HTML Processed.csv')

# 'repu' 컬럼을 복사해 'Result_v1' 컬럼 생성 (복사이므로 'repu'는 그대로 유지)
df['Result_v1'] = df['repu']

# 'Result_v1' 컬럼을 맨 오른쪽으로 이동
col = df.pop('Result_v1')
df['Result_v1'] = col



# 📌 3. 없는 컬럼 찾아서 0으로 채워 넣기
for col in required_columns:
    if col not in df.columns:
        df[col] = 0
        print(f"⚠️ 누락된 컬럼 추가됨: {col}")

# 📌 4. 결과 저장 (선택사항)
# df.to_csv('Feature Website Final.csv', index=False)

# 📌 5. 확인
print("✅ 컬럼 채우기 완료. 총 컬럼 수:", len(df.columns))
df


⚠️ 누락된 컬럼 추가됨: url_len
⚠️ 누락된 컬럼 추가됨: url_num_hyphens_dom
⚠️ 누락된 컬럼 추가됨: url_num_dom_token
⚠️ 누락된 컬럼 추가됨: url_path_len
⚠️ 누락된 컬럼 추가됨: url_filename_len
⚠️ 누락된 컬럼 추가됨: url_longest_dom_token_len
⚠️ 누락된 컬럼 추가됨: url_average_dom_token_len
⚠️ 누락된 컬럼 추가됨: url_tld
⚠️ 누락된 컬럼 추가됨: url_domain_len
⚠️ 누락된 컬럼 추가됨: url_hostname_len
⚠️ 누락된 컬럼 추가됨: url_num_dots
⚠️ 누락된 컬럼 추가됨: url_num_underscores
⚠️ 누락된 컬럼 추가됨: url_num_equals
⚠️ 누락된 컬럼 추가됨: url_num_slashes
⚠️ 누락된 컬럼 추가됨: url_num_dash
⚠️ 누락된 컬럼 추가됨: url_num_semicolon
⚠️ 누락된 컬럼 추가됨: url_num_at
⚠️ 누락된 컬럼 추가됨: url_num_percent
⚠️ 누락된 컬럼 추가됨: url_num_plus
⚠️ 누락된 컬럼 추가됨: url_query_len
⚠️ 누락된 컬럼 추가됨: url_num_query_para
⚠️ 누락된 컬럼 추가됨: url_ip_present
⚠️ 누락된 컬럼 추가됨: url_entropy
⚠️ 누락된 컬럼 추가됨: url_count_consonants
⚠️ 누락된 컬럼 추가됨: url_num_digits
⚠️ 누락된 컬럼 추가됨: url_chinese_present
⚠️ 누락된 컬럼 추가됨: url_port
✅ 컬럼 채우기 완료. 총 컬럼 수: 45


Unnamed: 0,html_num_tags('iframe'),html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('small'),...,url_num_percent,url_num_plus,url_query_len,url_num_query_para,url_ip_present,url_entropy,url_count_consonants,url_num_digits,url_chinese_present,url_port
0,0,3,0,0,22,1,1,1,3,0,...,0,0,0,0,0,0,0,0,0,0
1,0,5,0,0,48,1,1,1,31,0,...,0,0,0,0,0,0,0,0,0,0
2,0,20,0,0,19,1,1,0,15,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,23,1,1,2,35,0,...,0,0,0,0,0,0,0,0,0,0
5,0,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,19,0,0,6,1,1,0,42,0,...,0,0,0,0,0,0,0,0,0,0
7,0,10,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,25,1,1,2,198,0,...,0,0,0,0,0,0,0,0,0,0
9,0,5,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
df.columns

Index(['html_num_tags('iframe')', 'html_num_tags('script')',
       'html_num_tags('embed')', 'html_num_tags('object')',
       'html_num_tags('div')', 'html_num_tags('head')',
       'html_num_tags('body')', 'html_num_tags('form')', 'html_num_tags('a')',
       'html_num_tags('small')', 'html_num_tags('span')',
       'html_num_tags('input')', 'html_num_tags('applet')',
       'html_num_tags('img')', 'html_num_tags('video')',
       'html_num_tags('audio')', 'Result_v1', 'url_len', 'url_num_hyphens_dom',
       'url_num_dom_token', 'url_path_len', 'url_filename_len',
       'url_longest_dom_token_len', 'url_average_dom_token_len', 'url_tld',
       'url_domain_len', 'url_hostname_len', 'url_num_dots',
       'url_num_underscores', 'url_num_equals', 'url_num_slashes',
       'url_num_dash', 'url_num_semicolon', 'url_num_at', 'url_num_percent',
       'url_num_plus', 'url_query_len', 'url_num_query_para', 'url_ip_present',
       'url_entropy', 'url_count_consonants', 'url_num_digits',
