# 1. 初始化
## 1.1 导入包

In [None]:
import csv
import json
import random
from datetime import date, datetime
import datetime
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn
import statistics


## 1.2 读入文件

存放在 list 中

In [None]:
TRAIN_l_fqdn = []
with open('fqdn.csv', 'r', encoding='utf-8') as f:
    f_fqdn = csv.reader(f)
    headers = next(f_fqdn) # 跳过第一行
    for row in f_fqdn:
        TRAIN_l_fqdn.append(row)

TRAIN_l_ip = {}
with open('ip.csv', 'r', encoding='utf-8') as f:
    f_ip = csv.reader(f)
    headers = next(f_ip)
    for row in f_ip:
        TRAIN_l_ip[row[0]] = row[1:]

# TRAIN_l_ipv6 = []
with open('ipv6.csv', 'r', encoding='utf-8') as f:
    f_ipv6 = csv.reader(f)
    headers = next(f_ipv6)
    for row in f_ipv6:
        # TRAIN_l_ipv6[row[0]] = row[1:]
        TRAIN_l_ip[row[0]] = row[1:]

TRAIN_l_access = []
with open('access.csv', 'r', encoding='utf-8') as f:
    f_access = csv.reader(f)
    headers = next(f_access)
    for row in f_access:
        TRAIN_l_access.append(row)

TRAIN_l_flint = []
with open('flint.csv', 'r', encoding='utf-8') as f:
    f_flint = csv.reader(f)
    headers = next(f_flint)
    for row in f_flint:
        TRAIN_l_flint.append(row)


# 2 特征提取

## 2.1 access

将 access 与 ip 一并处理。选取的特征有

- 访问次数
- 连续访问次数
- 访问 IP 数
- 访问国家数
- 访问城市数
- 访问 ISP 数
- 按小时统计访问次数
- 按日期统计访问次数

In [None]:
# Initialization

tot_fqdn = 17468  # fqdn 编号总数

l_accesscount = [0] * tot_fqdn  # 访问次数
l_multiaccesscount = [0] * tot_fqdn  # 多次访问次数
l_multiaccessip = [set() for _ in range(tot_fqdn)] # 多次访问IP
l_accesstime = [[0] * 24 for _ in range(tot_fqdn)]  # 按小时统计访问次数
l_accessdate = [[0] * 92 for _ in range(tot_fqdn)]  # 按日期统计访问次数
l_accessip = [set() for _ in range(tot_fqdn)]  # 访问IP
l_ipcountry = [set() for _ in range(tot_fqdn)]  # 访问IP所在国家
l_ipcity = [set() for _ in range(tot_fqdn)]  # 访问IP所在城市
l_ipisp = [set() for _ in range(tot_fqdn)]  # 访问IP ISP

date2 = date(2020, 3, 1)  # 开始日期

# count

for access in TRAIN_l_access:
    num = int(access[0][5:])  # 域名编号
    access_ip = access[1]  # 访问IP
    access_count = int(access[2])  # 访问次数
    access_date = access[4]  # 访问日期
    access_hour = int(access[5])  # 访问时间

    l_accesscount[num] += access_count  # 编号为 num 的域名的访问次数
    if access_count > 1:
        l_multiaccesscount[num] += access_count  # 编号为 num 的域名的连续访问次数
        l_multiaccessip[num].add(access_ip) # 编号为 num 的域名的连续访问 IP

    l_accesstime[num][access_hour] += access_count  # 编号为 num 的域名按小时统计时间段的访问次数

    date1 = datetime.datetime.strptime(access_date, '%Y%m%d').date()
    # 编号为 num 的域名按日期统计访问次数
    l_accessdate[num][(date1 - date2).days] += access_count

    match_ip = TRAIN_l_ip[access_ip]
    l_accessip[num].add(access_ip)  # 编号为 num 的域名的访问 IP 列表
    l_ipcountry[num].add(match_ip[0])  # 编号为 num 的域名的访问 IP 地址的国家列表
    l_ipcity[num].add(match_ip[2])  # 编号为 num 的域名的访问 IP 地址的城市列表
    l_ipisp[num].add(match_ip[5])  # 编号为 num 的域名的访问 IP 地址的 ISP 列表


## 2.2 flint

选取的特征有

- 解析次数
- 按日期统计解析次数

In [None]:
l_notip = []  # 解析结果不是 IP 的域名

l_flintcount = [0] * tot_fqdn  # 解析次数
l_flintttl = [0] * tot_fqdn  # 解析 TTL
l_flintdate = [[0] * 92 for _ in range(tot_fqdn)]  # 按日期统计解析次数

for flint in TRAIN_l_flint:
    num = int(flint[0][5:])  # 域名编号
    typenum = int(flint[1])  # 域名类型
    if flint[2][0:4] == 'fqdn':
        refernum = int(flint[2][5:])  # 域名引用编号
        count = int(flint[3])  # 域名引用访问次数
        if flint[4] != '':
            ttl = int(flint[4])  # 域名引用 TTL
        else:
            ttl = 0
        date = datetime.datetime.strptime(flint[5], '%Y%m%d').date()  # 域名引用访问时间
        l_notip.append([num, typenum, refernum, count, date])
    else:
        ip = flint[2]  # 域名 IP
        count = int(flint[3])  # 域名访问次数
        if flint[4] != '':
            ttl = int(flint[4])  # 域名引用 TTL
        else:
            ttl = 0
        date = datetime.datetime.strptime(flint[5], '%Y%m%d').date()  # 域名访问时间

    l_flintcount[num] += count  # 编号为 num 的域名的解析次数
    l_flintttl[num] += ttl  # 编号为 num 的域名的解析 TTL
    l_flintdate[num][(date - date2).days] += count  # 编号为 num 的域名按日期统计解析次数


## 2.5 fqdn

进一步处理 2.1 - 2.4 得到的内容。

提取 fqdn 中的特征：

- 字符个数
- 数字个数
- 普通字符个数
- 特殊字符个数
- 单词字母个数
- 深度

并将所有内容整合。

In [11]:
TRAIN_num = []  # 训练集域名编号
TRAIN_feature = []  # 合并信息
TRAIN_label = []  # 域名标签

for fqdn in TRAIN_l_fqdn:
    url = fqdn[0]  # 域名
    num = int(fqdn[1][5:])  # 域名编号

    n_character = 0  # 字符个数
    n_digit = 0  # 数字个数
    n_normal = 0  # 普通字符个数
    n_special = 0  # 特殊字符个数
    n_alpha = 0  # 单词字母个数
    n_depth = 0  # 深度

    n_access = l_accesscount[num]  # 访问次数
    n_multiaccess = l_multiaccesscount[num]  # 连续访问次数
    n_access_ip = len(l_accessip[num])  # 访问 IP 数
    n_access_country = len(l_ipcountry[num])  # 访问国家数
    n_access_city = len(l_ipcity[num])  # 访问城市数
    n_access_isp = len(l_ipisp[num])  # 访问 ISP 数
    n_access_byhour = l_accesstime[num]  # 按小时统计访问次数
    n_access_bydate = l_accessdate[num]  # 按日期统计访问次数

    n_flint = l_flintcount[num]  # 解析次数
    n_flint_bydate = l_flintdate[num]  # 按日期统计解析次数

    is_normal_suffix = 0  # 是否是普通后缀
    commonRootList = ['cn', 'com', 'net', 'org', 'gov', 'info', 'edu']
    if url.split('.')[-1] in commonRootList:
        is_normal_suffix = 1

    is_alpha = False  # 是否为单词字母

    for i in range(len(url)):
        if url[i] == '[':
            is_alpha = True
        elif url[i] == ']':
            is_alpha = False
        else:
            n_character += 1

            if url[i].isalnum():
                n_normal += 1
            else:
                n_special += 1

            if url[i].isdigit():
                n_digit += 1

            if is_alpha:
                n_alpha += 1

            if url[i] == '.':
                n_depth += 1

    tmp = [n_character, n_digit, n_normal, n_special, n_alpha, n_depth, is_normal_suffix,
           n_access, n_multiaccess, n_access_ip, n_access_country, n_access_city, n_access_isp,
           n_flint]

    tmp.append(min(n_access_byhour))
    tmp.append(n_access_byhour.index(min(n_access_byhour)))
    tmp.append(max(n_access_byhour))
    tmp.append(n_access_byhour.index(max(n_access_byhour)))
    # tmp.append(statistics.mean(n_access_byhour))
    tmp.append(statistics.median(n_access_byhour))
    tmp.append(statistics.pstdev(n_access_byhour))

    tmp.append(min(n_access_bydate))
    tmp.append(n_access_bydate.index(min(n_access_bydate)))
    tmp.append(max(n_access_bydate))
    tmp.append(n_access_bydate.index(max(n_access_bydate)))
    # tmp.append(statistics.mean(n_access_bydate))
    tmp.append(statistics.median(n_access_bydate))
    tmp.append(statistics.pstdev(n_access_bydate))
    
    tmp.append(min(n_flint_bydate))
    tmp.append(n_flint_bydate.index(min(n_flint_bydate)))
    tmp.append(max(n_flint_bydate))
    tmp.append(n_flint_bydate.index(max(n_flint_bydate)))
    # tmp.append(statistics.mean(n_flint_bydate))
    tmp.append(statistics.median(n_flint_bydate))
    tmp.append(statistics.pstdev(n_flint_bydate))
    
    TRAIN_feature.append(tmp)


In [None]:
TRAIN_num = []  # 训练集域名编号
TRAIN_feature = []  # 合并信息
TRAIN_label = []  # 域名标签

for fqdn in TRAIN_l_fqdn:
    url = fqdn[0]  # 域名
    num = int(fqdn[1][5:])  # 域名编号

    n_character = 0  # 字符个数
    n_digit = 0  # 数字个数
    n_normal = 0  # 普通字符个数
    n_special = 0  # 特殊字符个数
    n_alpha = 0  # 单词字母个数
    n_depth = 0  # 深度

    n_access = l_accesscount[num]  # 访问次数
    n_multiaccess = l_multiaccesscount[num]  # 连续访问次数
    n_access_ip = len(l_accessip[num])  # 访问 IP 数
    n_access_country = len(l_ipcountry[num])  # 访问国家数
    n_access_city = len(l_ipcity[num])  # 访问城市数
    n_access_isp = len(l_ipisp[num])  # 访问 ISP 数
    n_access_byhour = l_accesstime[num]  # 按小时统计访问次数
    n_access_bydate = l_accessdate[num]  # 按日期统计访问次数

    n_flint = l_flintcount[num]  # 解析次数
    n_flint_bydate = l_flintdate[num]  # 按日期统计解析次数

    is_normal_suffix = 0  # 是否是普通后缀
    commonRootList = ['cn', 'com', 'net', 'org', 'gov', 'info', 'edu']
    if url.split('.')[-1] in commonRootList:
        is_normal_suffix = 1

    is_alpha = False  # 是否为单词字母

    for i in range(len(url)):
        if url[i] == '[':
            is_alpha = True
        elif url[i] == ']':
            is_alpha = False
        else:
            n_character += 1

            if url[i].isalnum():
                n_normal += 1
            else:
                n_special += 1

            if url[i].isdigit():
                n_digit += 1

            if is_alpha:
                n_alpha += 1

            if url[i] == '.':
                n_depth += 1

    tmp = [n_character, n_digit, n_normal, n_special, n_alpha, n_depth, is_normal_suffix,
           n_access, n_multiaccess, n_access_ip, n_access_country, n_access_city, n_access_isp,
           n_flint]

    tmp.extend(n_access_byhour)
    tmp.extend(n_access_bydate)
    tmp.extend(n_flint_bydate)
    
    TRAIN_feature.append(tmp)


In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import calinski_harabasz_score
from sklearn.decomposition import PCA

TRAIN_feature_normalization = MinMaxScaler().fit_transform(TRAIN_feature)

label_pred = KMeans(n_clusters=5).fit_predict(TRAIN_feature_normalization)

calinski_harabasz_score(TRAIN_feature, label_pred)


5.648149940873604

输出结果

In [28]:
# 输出结果
with open("result2.csv", "w") as csvfile:
    csvfile.write("fqdn_no,label\n")
    for i in range(len(label_pred)):
        csvfile.write("fqdn_" + str(i) + "," + str(label_pred[i]) + "\n")


In [23]:
from collections import Counter
num_Count = Counter(label_pred)
print(num_Count)

Counter({0: 5933, 1: 5752, 3: 2394, 2: 2165, 4: 1224})
