In [1]:
import pandas as pd

In [2]:
# Load only the first 10,000 rows (nrows) of the merged CSV for a quick look
# at its columns before processing the full file.
df = pd.read_csv('../data/merged_output.csv', nrows=10_000)

In [3]:
# Preview the first rows of the loaded slice.
df.head()

Unnamed: 0.1,Unnamed: 0,url,type,original_url,url_type,url_len,pri_domain,letters_count,digits_count,special_chars_count,...,root_domain,original_url.1,final_url,redirect_chain,redirect_count,head_elements,timeout,has_meta_refresh,meta_refresh_url,duration_sec
0,0,br-icloud.com.br,phishing,br-icloud.com.br,2,16,br-icloud.com.br,13,0,3,...,br-icloud,http://br-icloud.com.br,http://br-icloud.com.br/,,0,,False,False,,4.55
1,1,mp3raid.com/music/krizz_kaliko.html,benign,mp3raid.com/music/krizz_kaliko.html,0,35,mp3raid.com,29,1,5,...,mp3raid,http://mp3raid.com/music/krizz_kaliko.html,http://mp3raid.com/music/krizz_kaliko.html,,0,,False,False,,2.78
2,2,bopsecrets.org/rexroth/cr/1.htm,benign,bopsecrets.org/rexroth/cr/1.htm,0,31,bopsecrets.org,25,1,5,...,bopsecrets,http://bopsecrets.org/rexroth/cr/1.htm,https://bopsecrets.org/rexroth/cr/1.htm,"[{""url"": ""http://bopsecrets.org/rexroth/cr/1.h...",1,"[{'tag': 'meta', 'attributes': {'http-equiv': ...",False,False,,6.41
3,3,http://garage-pirenne.be/index.php?option=com_...,defacement,http://www.garage-pirenne.be/index.php?option=...,1,77,garage-pirenne.be,60,7,17,...,garage-pirenne,http://www.garage-pirenne.be/index.php?option=...,http://www.garage-pirenne.be/index.php?option=...,,0,,False,False,,4.71
4,4,http://adventure-nicaragua.net/index.php?optio...,defacement,http://adventure-nicaragua.net/index.php?optio...,1,228,adventure-nicaragua.net,199,22,14,...,adventure-nicaragua,http://adventure-nicaragua.net/index.php?optio...,http://adventure-nicaragua.net/index.php?optio...,,0,,False,True,http://adventure-nicaragua.net/index.php?optio...,4.51


In [4]:
# Print the column names, non-null counts and dtypes of the loaded slice.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           10000 non-null  int64  
 1   url                  10000 non-null  object 
 2   type                 10000 non-null  object 
 3   original_url         10000 non-null  object 
 4   url_type             10000 non-null  int64  
 5   url_len              10000 non-null  int64  
 6   pri_domain           10000 non-null  object 
 7   letters_count        10000 non-null  int64  
 8   digits_count         10000 non-null  int64  
 9   special_chars_count  10000 non-null  int64  
 10  shortened            10000 non-null  int64  
 11  is_domain_in_url     10000 non-null  int64  
 12  secure_http          10000 non-null  int64  
 13  have_ip              10000 non-null  int64  
 14  url_region           10000 non-null  object 
 15  root_domain          9995 non-null   

In [5]:
# List the distinct values in the `type` column; these are the labels the
# balanced sampling below filters on.
df['type'].unique()

array(['phishing', 'benign', 'defacement', 'malware'], dtype=object)

In [6]:
# Build a class-balanced sample: up to `target_count` rows for each label,
# streaming the CSV in chunks so the whole file is never loaded at once.
#
# NOTE: this is a "first rows found" sample, not a uniform sample over the
# whole file. Collection for a label stops as soon as its quota is filled,
# and reading stops once every quota is filled, so later rows are never seen.

# Target number of rows per label
target_count = 10_000

# Per-label list of sampled chunk fragments
collected = {
    'phishing': [],
    'benign': [],
    'defacement': [],
    'malware': []
}

# Number of rows gathered so far for each label
counts = {key: 0 for key in collected}

# Read the file chunk by chunk
chunksize = 10_000
for chunk in pd.read_csv('../data/merged_output.csv', chunksize=chunksize):
    for label in collected:
        remaining = target_count - counts[label]
        if remaining <= 0:
            continue  # quota already filled; skip the filter for this label

        # Rows of this chunk that carry the current label
        subset = chunk[chunk['type'] == label]
        if subset.empty:
            continue

        # Take at most `remaining` rows at random. The seed is fixed, so the
        # same input file always yields the same sample.
        sampled = subset.sample(n=min(remaining, len(subset)), random_state=42)
        collected[label].append(sampled)
        counts[label] += len(sampled)

    # Stop reading as soon as every label has reached its quota
    if all(count >= target_count for count in counts.values()):
        break

# Report labels that ended up short so an unbalanced result is not silent
short = {label: n for label, n in counts.items() if n < target_count}
if short:
    print(f"warning: fewer than {target_count} rows collected for: {short}")

# Combine everything in label order. Flattening the per-label lists into one
# concat avoids pd.concat's "No objects to concatenate" error when a single
# label collected no rows at all.
frames = [frame for label in collected for frame in collected[label]]
if not frames:
    raise ValueError("no rows matched any of the labels: " + ", ".join(collected))
balanced_df = pd.concat(frames, ignore_index=True)

# Check the resulting class distribution
print(balanced_df['type'].value_counts())


type
phishing      10000
benign        10000
defacement    10000
malware       10000
Name: count, dtype: int64


In [10]:
# Write the balanced sample to disk; index=False leaves the DataFrame's row
# index out of the CSV.
balanced_df.to_csv('../data/balanced_random_40k_test2.csv', index=False)

In [8]:
# Preview the first rows of the balanced sample.
balanced_df.head()

Unnamed: 0.1,Unnamed: 0,url,type,original_url,url_type,url_len,pri_domain,letters_count,digits_count,special_chars_count,...,root_domain,original_url.1,final_url,redirect_chain,redirect_count,head_elements,timeout,has_meta_refresh,meta_refresh_url,duration_sec
0,1591,smolmaw5.beget.tech,phishing,smolmaw5.beget.tech,2,19,smolmaw5.beget.tech,16,1,2,...,beget,http://smolmaw5.beget.tech,http://smolmaw5.beget.tech/,,0,"[{'tag': 'meta', 'attributes': {'name': 'viewp...",False,False,,6.67
1,3772,helps-instagram.com,phishing,helps-instagram.com,2,19,helps-instagram.com,17,0,2,...,helps-instagram,http://helps-instagram.com,,,0,,False,False,,3.44
2,1007,contatocliente-sac.com.br,phishing,contatocliente-sac.com.br,2,25,contatocliente-sac.com.br,22,0,3,...,contatocliente-sac,http://contatocliente-sac.com.br,,,0,,False,False,,4.27
3,9761,domowe.star-kom.eu,phishing,domowe.star-kom.eu,2,18,domowe.star-kom.eu,15,0,3,...,star-kom,http://domowe.star-kom.eu,,,0,,False,False,,2.15
4,4559,yellow-directory-canada.com,phishing,yellow-directory-canada.com,2,27,yellow-directory-canada.com,24,0,3,...,yellow-directory-canada,http://yellow-directory-canada.com,,,0,,False,False,,2.43
