# 1. 初始化
## 1.1 导入包

In [None]:
import csv
import json
import random
from datetime import date, datetime
import datetime
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## 1.2 读入文件

各 CSV 文件与 whois.json 按行存放在 list 中；ip.csv 与 ipv6.csv 合并存放在以 IP 为键的 dict 中。

In [None]:
def _read_csv_rows(path):
    """Read a UTF-8 CSV file and return ``(header, rows)``.

    ``header`` is the first row of the file (``None`` when the file is
    empty) and ``rows`` is a list with every remaining row, each row being
    the list of strings produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)  # skip the header line
        return header, list(reader)


# Rows of fqdn.csv, in file order.
headers, l_fqdn = _read_csv_rows('fqdn.csv')

# IP metadata keyed by the first column of each row; ip.csv and ipv6.csv
# share one dict, and ipv6.csv wins if a key appears in both files.
l_ip = {}
for ip_path in ('ip.csv', 'ipv6.csv'):
    headers, ip_rows = _read_csv_rows(ip_path)
    for row in ip_rows:
        l_ip[row[0]] = row[1:]

# Rows of access.csv, flint.csv and label.csv, in file order.
headers, l_access = _read_csv_rows('access.csv')
headers, l_flint = _read_csv_rows('flint.csv')
headers, l_label = _read_csv_rows('label.csv')

# whois.json is iterated as a sequence of records and copied into a list.
with open('whois.json', 'r', encoding='utf-8') as f:
    l_whois = list(json.load(f))


# 2 特征提取

## 2.1 access

将 access 与 ip 一并处理。选取的特征有

- 访问次数
- 连续访问次数
- 访问 IP 数
- 访问国家数
- 访问城市数
- 访问 ISP 数
- 按小时统计访问次数
- 按日期统计访问次数

In [None]:
# Per-fqdn access statistics, built from the rows in l_access and the IP
# metadata in l_ip.

tot_fqdn = 20512  # total number of fqdn ids
# Every list below is indexed directly by fqdn number, so each one is
# pre-sized to tot_fqdn entries.
l_accesscount = [0] * tot_fqdn  # total access count
l_multiaccesscount = [0] * tot_fqdn  # access count summed over rows whose count is > 1
l_accesstime = [[0] * 24 for _ in range(tot_fqdn)]  # access count per hour of day
l_accessdate = [[0] * 92 for _ in range(tot_fqdn)]  # access count per day, day 0 = date2
l_accessip = [set() for _ in range(tot_fqdn)]  # distinct client IPs
l_ipcountry = [set() for _ in range(tot_fqdn)]  # distinct countries of the client IPs
l_ipcity = [set() for _ in range(tot_fqdn)]  # distinct cities of the client IPs
l_ipisp = [set() for _ in range(tot_fqdn)]  # distinct ISPs of the client IPs

# Day 0 of the per-day histograms. Spelled through the datetime module so
# that it keeps working even if the bare name `date` has been rebound.
date2 = datetime.date(2020, 3, 1)

for access in l_access:
    num = int(access[0][5:])  # fqdn number: the text after the first five characters
    access_ip = access[1]  # client IP, also the key into l_ip
    access_count = int(access[2])  # number of accesses recorded by this row
    access_time = access[3]  # parsed below as YYYYMMDD followed by a two-digit hour

    l_accesscount[num] += access_count
    if access_count > 1:
        l_multiaccesscount[num] += access_count

    l_accesstime[num][int(access_time[8:10])] += access_count

    date1 = datetime.datetime.strptime(access_time[:8], '%Y%m%d').date()
    day_offset = (date1 - date2).days
    # A negative offset would silently wrap around to the end of the list
    # and be counted on the wrong day, so reject anything outside the window.
    if not 0 <= day_offset < len(l_accessdate[num]):
        raise IndexError(
            'access date %s is outside the %d-day window starting %s'
            % (date1, len(l_accessdate[num]), date2))
    l_accessdate[num][day_offset] += access_count

    # Columns 0, 2 and 5 of the IP record are used as country, city and ISP.
    match_ip = l_ip[access_ip]
    l_accessip[num].add(access_ip)
    l_ipcountry[num].add(match_ip[0])
    l_ipcity[num].add(match_ip[2])
    l_ipisp[num].add(match_ip[5])


## 2.2 flint

选取的特征有

- 解析次数
- 按日期统计解析次数

In [None]:
# Per-fqdn resolution statistics, built from the rows in l_flint.

# Records whose result column is another fqdn rather than an IP, stored as
# [fqdn number, type, referenced fqdn number, count, date].
l_notip = []

tot_fqdn = 20512  # total number of fqdn ids
l_flintcount = [0] * tot_fqdn  # total resolution count
l_flintdate = [[0] * 92 for _ in range(tot_fqdn)]  # resolution count per day, day 0 = date2

for flint in l_flint:
    num = int(flint[0][5:])  # fqdn number: the text after the first five characters
    typenum = int(flint[1])  # record type
    count = int(flint[3])  # number of resolutions recorded by this row
    # Deliberately NOT named `date`: that would rebind the `date` class
    # imported at the top of the notebook and break code that calls date(...).
    flint_date = datetime.datetime.strptime(flint[4], '%Y%m%d').date()

    if flint[2].startswith('fqdn'):
        refernum = int(flint[2][5:])  # number of the fqdn this record points to
        l_notip.append([num, typenum, refernum, count, flint_date])

    l_flintcount[num] += count
    day_offset = (flint_date - date2).days
    # A negative offset would silently wrap around to the end of the list
    # and be counted on the wrong day, so reject anything outside the window.
    if not 0 <= day_offset < len(l_flintdate[num]):
        raise IndexError(
            'flint date %s is outside the %d-day window starting %s'
            % (flint_date, len(l_flintdate[num]), date2))
    l_flintdate[num][day_offset] += count


## 2.3 whois

选取的特征有

- 域名创建日期
- 域名过期日期
- 域名更新次数
- 域名 DNS 服务器数
- 域名管理员邮箱数
- 域名注册国家数
- 域名注册邮箱数
- 域名注册省份数
- 域名技术联系人邮箱数
- 域名 WHOIS 服务器列表数
- 域名 WHOIS 服务器数
- 域名注册商数

In [None]:
def _add_values(target, value):
    """Add a whois field value to the set ``target``.

    A list, tuple or set contributes each of its elements. Any other value,
    in particular a plain string, is added as one single element; calling
    ``set.update`` on a string would add its individual characters instead.
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        target.update(value)
    else:
        target.add(value)


# Per-fqdn whois statistics, indexed by fqdn number.
l_whoiscreatedate = [10**6] * tot_fqdn  # earliest creation day; 10**6 means "none seen"
l_whoisexpiredate = [0] * tot_fqdn  # latest expiry day; 0 means "none seen"
l_whoisupdatedate = [set() for _ in range(tot_fqdn)]  # distinct update timestamps

l_whoisnameserver = [set() for _ in range(tot_fqdn)]
l_whoisadminemail = [set() for _ in range(tot_fqdn)]
l_whoisregistercountry = [set() for _ in range(tot_fqdn)]
l_whoisregisteremail = [set() for _ in range(tot_fqdn)]
l_whoisregisterstate = [set() for _ in range(tot_fqdn)]
l_whoistechemial = [set() for _ in range(tot_fqdn)]
l_whoisserverlist = [set() for _ in range(tot_fqdn)]
l_whoisserver = [set() for _ in range(tot_fqdn)]
l_whoissponsor = [set() for _ in range(tot_fqdn)]

# Record key -> the per-fqdn list of sets that collects its distinct values.
# NOTE(review): the JSON schema is not visible here. If every one of these
# fields is a list the result matches plain set.update(); if any of them is
# a string it is now counted as one value instead of character by character.
_whois_set_fields = (
    ('nameservers', l_whoisnameserver),
    ('admin_email', l_whoisadminemail),
    ('registrant_country', l_whoisregistercountry),
    ('registrant_email', l_whoisregisteremail),
    ('registrant_state', l_whoisregisterstate),
    ('tech_email', l_whoistechemial),
    ('r_whoisserver_list', l_whoisserverlist),
    ('whoisserver', l_whoisserver),
    ('sponsoring', l_whoissponsor),
)

for whois in l_whois:
    num = int(whois['fqdn_no'][5:])  # fqdn number: the text after the first five characters

    # Dates are divided by 86400000 (the number of milliseconds in a day)
    # and truncated to an integer.
    if whois['createddate'] is not None:
        l_whoiscreatedate[num] = min(int(whois['createddate'] / 86400000), l_whoiscreatedate[num])
    if whois['expiresdate'] is not None:
        l_whoisexpiredate[num] = max(int(whois['expiresdate'] / 86400000), l_whoisexpiredate[num])
    if whois['updateddate'] is not None:
        l_whoisupdatedate[num].add(int(whois['updateddate']))

    for key, per_fqdn_sets in _whois_set_fields:
        value = whois[key]
        if value is not None:
            _add_values(per_fqdn_sets[num], value)
    


## 2.4 label

提取标签，并为拆分训练集、测试集做准备。

In [None]:
# Label lookup indexed by fqdn number; -1 marks an fqdn without a label.
Label_all = [-1] * tot_fqdn

for entry in l_label:
    # Column 0 holds the fqdn id (number after the first five characters),
    # column 1 the integer label.
    Label_all[int(entry[0][5:])] = int(entry[1])

## 2.5 fqdn

进一步处理 2.1 - 2.4 得到的内容。

提取 fqdn 中的特征：

- 字符个数
- 数字个数
- 普通字符个数
- 特殊字符个数
- 单词字母个数
- 深度

并将所有内容整合。

In [None]:
def _lexical_features(url):
    """Return the character-level features of ``url``.

    The result is ``[n_character, n_digit, n_normal, n_special, n_alpha,
    n_depth]``. The brackets ``[`` and ``]`` are treated as markers and are
    not counted themselves; every other character is counted once in
    ``n_character``, as alphanumeric (``n_normal``) or not (``n_special``),
    as a digit (``n_digit``), as lying between ``[`` and ``]``
    (``n_alpha``), and as a dot (``n_depth``).
    """
    n_character = n_digit = n_normal = n_special = n_alpha = n_depth = 0
    inside_brackets = False
    for ch in url:
        if ch == '[':
            inside_brackets = True
        elif ch == ']':
            inside_brackets = False
        else:
            n_character += 1
            if ch.isalnum():
                n_normal += 1
            else:
                n_special += 1
            if ch.isdigit():
                n_digit += 1
            if inside_brackets:
                n_alpha += 1
            if ch == '.':
                n_depth += 1
    return [n_character, n_digit, n_normal, n_special, n_alpha, n_depth]


# Suffixes regarded as common; built once instead of once per fqdn.
commonRootList = ['cn', 'com', 'net', 'org', 'gov', 'info', 'edu']

# One feature vector per fqdn, stored AT THE FQDN NUMBER. Later cells read
# Feature_all[i] with i being the fqdn number, so storing rows in file order
# would only be correct if fqdn.csv happened to be sorted by number.
Feature_all = [None] * len(l_accesscount)

for fqdn in l_fqdn:
    url = fqdn[0]  # domain text
    num = int(fqdn[1][5:])  # fqdn number: the text after the first five characters

    is_normal_suffix = 1 if url.split('.')[-1] in commonRootList else 0

    tmp = _lexical_features(url)
    tmp.append(is_normal_suffix)
    # access features
    tmp.extend([
        l_accesscount[num],  # total access count
        l_multiaccesscount[num],  # access count from rows with count > 1
        len(l_accessip[num]),  # distinct client IPs
        len(l_ipcountry[num]),  # distinct countries
        len(l_ipcity[num]),  # distinct cities
        len(l_ipisp[num]),  # distinct ISPs
    ])
    # resolution count
    tmp.append(l_flintcount[num])
    # whois features
    tmp.extend([
        l_whoiscreatedate[num],
        l_whoisexpiredate[num],
        len(l_whoisupdatedate[num]),
        len(l_whoisnameserver[num]),
        len(l_whoisadminemail[num]),
        len(l_whoisregistercountry[num]),
        len(l_whoisregisteremail[num]),
        len(l_whoisregisterstate[num]),
        len(l_whoistechemial[num]),
        len(l_whoisserverlist[num]),
        len(l_whoisserver[num]),
        len(l_whoissponsor[num]),
    ])
    # Histograms are flattened into the vector: per hour, per day, per day.
    tmp.extend(l_accesstime[num])
    tmp.extend(l_accessdate[num])
    tmp.extend(l_flintdate[num])

    Feature_all[num] = tmp

# Fail here, with a clear message, rather than later inside the training code.
_missing = [i for i, row in enumerate(Feature_all) if row is None]
if _missing:
    raise ValueError(
        '%d fqdn numbers have no row in l_fqdn (first missing: %d)'
        % (len(_missing), _missing[0]))


# 3 训练与预测

首先，把没有给 label 的都当作非恶意域名。

然后选取所有的恶意域名、及从没有 label 的域名中随机抽取的 1600 个非恶意域名做训练集；所有没有 label 的做测试集。该过程重复 500 轮，对各轮预测结果取平均。

区分是否为恶意域名。

工具直接使用 xgboost。

In [None]:
# Stage 1: decide which unlabeled fqdns are malicious.
#
# Every labelled fqdn is a positive example. In each round a random sample
# of unlabeled fqdns is used as negatives, a model is trained, and it votes
# on ALL unlabeled fqdns. The votes are averaged over the rounds.

# Unlabeled fqdns (label -1): their numbers and feature vectors.
test_num_binary = [i for i in range(tot_fqdn) if Label_all[i] == -1]
test_feature_binary = [Feature_all[i] for i in test_num_binary]

# Feature vectors of the labelled fqdns; identical in every round.
positive_features = [Feature_all[i] for i in range(tot_fqdn) if Label_all[i] != -1]

test_label_tmp = [0] * len(test_num_binary)  # votes per unlabeled fqdn

tot_round = 500
# random.sample raises ValueError when asked for more items than exist.
n_negative = min(1600, len(test_feature_binary))

params = {  # hand-tuned hyper-parameters
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'eta': 0.05,
    'eval_metric': 'logloss'
}
num_rounds = 500

# The prediction matrix does not change between rounds, so build it once.
dtest = xgb.DMatrix(test_feature_binary)

for itera in range(tot_round):
    train_random_binary = random.sample(test_feature_binary, n_negative)
    train_feature_binary = positive_features + train_random_binary
    train_label_binary = [1] * len(positive_features) + [0] * len(train_random_binary)

    dtrain = xgb.DMatrix(train_feature_binary, train_label_binary)
    model = xgb.train(params, dtrain, num_rounds)

    test_label_binary_float = model.predict(dtest)
    for i, prob in enumerate(test_label_binary_float):
        test_label_tmp[i] += round(prob)  # one 0/1 vote per round

    print(itera, end='...')

# Turn vote counts into the fraction of rounds that voted "malicious".
test_label_tmp = [votes / tot_round for votes in test_label_tmp]

# Final 0/1 decision, plus the fqdns decided as malicious; these go on to
# the family classifier.
test_label_binary = [round(fraction) for fraction in test_label_tmp]
test_num = []
test_feature = []
for i, decision in enumerate(test_label_binary):
    if decision == 1:
        test_num.append(test_num_binary[i])
        test_feature.append(test_feature_binary[i])




In [None]:
# Dump one line per unlabeled fqdn: its id and the averaged vote stored in
# test_label_tmp.
with open("guilty.csv", "w") as csvfile:
    csvfile.write("fqdn_no,prob\n")
    csvfile.writelines(
        "fqdn_" + str(test_num_binary[idx]) + "," + str(prob) + "\n"
        for idx, prob in enumerate(test_label_tmp)
    )


看看找到了几个恶意域名

In [98]:
# Number of fqdns collected in test_num, i.e. the unlabeled fqdns whose
# final binary decision was 1.
print(len(test_num))

542


区分恶意域名种类

In [None]:
# Stage 2: assign a family to every fqdn in test_feature, using a
# multi-class model trained on the labelled fqdns.

# Positions whose label is set (anything but -1), then the matching
# feature vectors and labels.
train_num = [idx for idx in range(len(l_fqdn)) if Label_all[idx] != -1]
train_feature = [Feature_all[idx] for idx in train_num]
train_label = [Label_all[idx] for idx in train_num]

test_label = []

# Hand-tuned hyper-parameters for the 9-class softmax model.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 9,
    'gamma': 0.0566,
    'max_depth': 4,
    'lambda': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 2.6,
    'eta': 0.092
}
num_rounds = 650

# Train on the labelled fqdns.
dtrain = xgb.DMatrix(train_feature, train_label)
model = xgb.train(params, dtrain, num_rounds)

# Predict one class per row of test_feature.
dtest = xgb.DMatrix(test_feature)
test_label = model.predict(dtest)


输出结果

In [None]:
# Write the final result: the known labels first, then the predicted family
# of every fqdn in test_num.
with open("result.csv", "w") as csvfile:
    csvfile.write("fqdn_no,family_no\n")
    csvfile.writelines(f"{row[0]},{row[1]}\n" for row in l_label)
    # The predictions are floating-point class ids; convert them to int so
    # the family_no column has one format ("6", not "6.0") in every row.
    # Indexing test_label by position keeps a length mismatch an error.
    csvfile.writelines(
        f"fqdn_{fqdn_no},{int(test_label[idx])}\n"
        for idx, fqdn_no in enumerate(test_num)
    )


In [101]:
# Compare the family distribution of the training labels with the
# distribution of the predicted labels (9 families, ids 0..8).

# Number of training labels per family, and their share of the total.
train_count = [0] * 9
for family in train_label:
    train_count[int(family)] += 1
train_percent = [round(n / len(train_label), 3) for n in train_count]

# Same for the predicted labels.
result_count = [0] * 9
for family in test_label:
    result_count[int(family)] += 1
result_percent = [round(n / len(test_label), 3) for n in result_count]

# Count each family would have if the predictions followed the training
# distribution exactly, and how far the actual count is from that.
expected_count = [
    round((n / len(train_label)) * len(test_label)) for n in train_count
]
diff_count = [
    abs(actual - expected)
    for actual, expected in zip(result_count, expected_count)
]

# Each entry is printed as a title line followed by its value.
report = [
    ("训练集总数：", len(train_label)),
    ("结果总数：", len(test_label)),
    ("训练集类别数量：", train_count),
    ("期望结果数量：", expected_count),
    ("结果类别数量：", result_count),
    ("与期望差异值：", diff_count),
    ("训练集类别占比：", train_percent),
    ("结果类别占比：", result_percent),
]
for title, value in report:
    print(title)
    print(value)


训练集总数：
476
结果总数：
542
训练集类别数量：
[336, 6, 18, 17, 24, 8, 57, 6, 4]
期望结果数量：
[383, 7, 20, 19, 27, 9, 65, 7, 5]
结果类别数量：
[391, 7, 15, 14, 14, 12, 83, 4, 2]
与期望差异值：
[8, 0, 5, 5, 13, 3, 18, 3, 3]
训练集类别占比：
[0.706, 0.013, 0.038, 0.036, 0.05, 0.017, 0.12, 0.013, 0.008]
结果类别占比：
[0.721, 0.013, 0.028, 0.026, 0.026, 0.022, 0.153, 0.007, 0.004]
