In [51]:
# BUILD_LIB=1 pip3 install ssdeep

import pandas as pd
import numpy as np
import tldextract

# Read the hostname list into a one-column frame. Passing header=0 together
# with names= tells pandas to consume the file's first line as a header row
# and replace it with the single column name 'domain'.
df = pd.read_csv(
    'domains.txt',
    sep=' ',
    header=0,
    names=['domain'],
)

In [52]:
# Split every hostname once with tldextract. Both derived columns below read
# from these parsed results, so no temporary 'extracted' column is needed.
extracted = df['domain'].apply(tldextract.extract)

# NOTE: the 'suffix' column holds the *registered domain* (e.g. apple.com),
# while 'tld' holds the public suffix (e.g. com). The column names are kept
# as-is because later cells and the exported files refer to them.
df = (
    df.assign(
        # Empty registered domains become NaN so dropna() below removes them.
        # The replaced Series is assigned back explicitly: calling
        # replace(..., inplace=True) on df['suffix'] is a chained in-place
        # assignment, which pandas warns about and which does not update the
        # frame under Copy-on-Write.
        suffix=extracted.apply(lambda parts: parts.registered_domain).replace('', np.nan),
        tld=extracted.apply(lambda parts: parts.suffix),
    )
    .dropna(subset=['suffix'])
)

df.head(20)

Unnamed: 0,domain,suffix,tld
0,0-courier.push.apple.com,apple.com,com
1,0-courier.sandbox.push.apple.com,apple.com,com
2,0-courier2.push.apple.com,apple.com,com
3,0-eu-west-1-awesomeads-166321764.eu-west-1.elb...,amazonaws.com,com
4,0-gravatar-com.cdn.ampproject.org,ampproject.org,org
5,0-hubs.iosdm.net,iosdm.net,net
6,0-i2--prod-birminghammail-co-uk-0.cdn.ampproje...,ampproject.org,org
7,0-i2--prod-chroniclelive-co-uk-0.cdn.ampprojec...,ampproject.org,org
8,0-i2--prod-dailyrecord-co-uk-0.cdn.ampproject.org,ampproject.org,org
9,0-i2--prod-dailystar-co-uk-0.cdn.ampproject.org,ampproject.org,org


In [53]:
# Number of rows left once hostnames without a registered domain are dropped.
df.shape[0]

51219

In [54]:
# Tunable parameters for the undersampling steps below.
cutoff = 50              # how many of the most frequent 'suffix' values get thinned out
drop_ratio = 0.9         # share of rows removed from each of those suffixes
max_per_suffix = 1000    # upper bound on rows kept per suffix afterwards
final_take = 1_000_000   # requested size of the final random sample

In [55]:
# Thin out the `cutoff` most frequent values of the 'suffix' column: for each
# of them, a random `drop_ratio` share of its rows is removed.
top_suffixes = df['suffix'].value_counts().nlargest(cutoff).index

rows_to_drop = []
for suf in top_suffixes:
    suffix_rows = df[df['suffix'] == suf]  # filter once, reuse below
    num_rows = len(suffix_rows)
    n_samples = int(num_rows * drop_ratio)
    # Skip suffixes where the share rounds down to nothing or would remove
    # every row.
    if 0 < n_samples < num_rows:
        # Fixed seed so a fresh "Restart & Run All" drops the same rows; the
        # other sampling steps in this notebook use random_state=42 as well.
        rows_to_drop.extend(suffix_rows.sample(n=n_samples, random_state=42).index)

# One drop for all collected labels instead of one per suffix. The suffix
# groups are disjoint, so collecting first selects the same rows as dropping
# inside the loop would.
df = df.drop(index=rows_to_drop)


In [56]:
# Number of rows remaining after the most frequent suffixes were thinned out.
df.shape[0]

37548

In [57]:
# Ten most frequent 'suffix' values that remain after thinning.
suffix_counts = df['suffix'].value_counts()
suffix_counts.nlargest(10)

suffix
casalemedia.com      371
doubleclick.net      171
akamai.net           129
amazonaws.com         97
sharepoint.com        70
amazon.com            65
akstat.io             64
demdex.net            64
zdusercontent.com     60
webex.com             59
Name: count, dtype: int64

In [58]:
def sample(group, maxr=100000):
    """Cap a group at ``maxr`` rows.

    Parameters
    ----------
    group : object supporting ``len()`` and ``.sample(n=..., random_state=...)``
        The rows of one group (a DataFrame when used with ``groupby().apply``).
    maxr : int, default 100000
        Largest number of rows allowed to pass through.

    Returns
    -------
    ``group`` itself when it has at most ``maxr`` rows; otherwise a random
    selection of exactly ``maxr`` rows drawn with the fixed seed 42.
    """
    # Small enough already: hand the group back untouched.
    if len(group) <= maxr:
        return group
    return group.sample(n=maxr, random_state=42)
    
# Apply the per-suffix cap: each 'suffix' group goes through sample() with
# maxr=max_per_suffix, then the index is renumbered from zero.
# NOTE(review): pandas 2.2 deprecates groupby.apply operating on the grouping
# column; confirm the 'suffix' column is still present in the result after a
# pandas upgrade.
per_suffix = df.groupby('suffix', group_keys=False)
capped = per_suffix.apply(sample, maxr=max_per_suffix)
grouped_df = capped.reset_index(drop=True)

grouped_df.head(50)

Unnamed: 0,domain,suffix,tld
0,0-hubs.iosdm.net,iosdm.net,net
1,0-i2--prod-birminghammail-co-uk-0.cdn.ampproje...,ampproject.org,org
2,0-s3--ap--southeast--1-amazonaws-com-0.cdn.amp...,ampproject.org,org
3,0-ukwest1-pushp.svc.ms,svc.ms,ms
4,0.1.cn.akamaiedge.net,akamaiedge.net,net
5,0.1.cn.akamaitech.net,akamaitech.net,net
6,0.academia-photos.com,academia-photos.com,com
7,0.allegroimg.com,allegroimg.com,com
8,0.azuredatabricks.net,azuredatabricks.net,net
9,0.bdrthermea.pool.ntp.org,ntp.org,org


In [59]:
# grouped_df['suffix'].value_counts().nlargest(20)

In [60]:
# Size of the capped frame expressed in millions of rows, rounded to one
# decimal place. Despite its name, cnt_str is a float; it is turned into text
# when the export filename is built with an f-string.
cnt = grouped_df.shape[0]
cnt_str = round(cnt / 1_000_000, 1)

In [61]:
# Write the capped frame to disk, space-separated. The filename records the
# cutoff, the per-suffix cap and the row count in millions.
undersampled_path = f"domains_undersampled_{cutoff}_{max_per_suffix}_{cnt_str}M.txt"
grouped_df.to_csv(undersampled_path, sep=' ')

In [62]:
# Clamp the requested sample size to the number of rows actually available,
# because sampling without replacement cannot return more rows than exist.
final_take = min(final_take, len(grouped_df))

# Filename tag: the sample size in millions, one decimal place, plus 'M'.
millions_taken = round(final_take / 1_000_000, 1)
final_take_str = str(millions_taken) + 'M'

# Draw the final random sample with a fixed seed and write it out.
# NOTE(review): when the clamp above takes effect, final_take equals
# len(grouped_df), so this filename is identical to the one used for the full
# export of grouped_df and that file gets overwritten. Confirm this is intended.
sampled = grouped_df.sample(n=final_take, random_state=42)
sampled.to_csv(f"domains_undersampled_{cutoff}_{max_per_suffix}_{final_take_str}.txt", sep=' ')
