In [1]:
import os

import pandas as pd

In [2]:
# filter files starting with a given prefix and ending with a given suffix
def filter_files(prefix, suffix, files):
    return [f for f in files if f.startswith(prefix) and f.endswith(suffix)]

In [3]:
data_dir = "../data"
files = os.listdir(data_dir)
files

['training_set_2ap_loc0_nav82.csv',
 'training_set_2ap_loc1_nav86.csv',
 'training_set_3ap_loc31_nav86.csv',
 'training_set_3ap_loc33_nav82.csv',
 'training_set_3ap_loc33_nav88.csv',
 'training_set_3ap_loc30_nav82.csv',
 'WLAN组网中网络吞吐量建模.docx',
 'training_set_3ap_loc31_nav82.csv',
 'training_set_2ap_loc1_nav82.csv',
 'training_set_2ap_loc0_nav86.csv',
 'training_set_3ap_loc32_nav82.csv',
 'training_set_3ap_loc30_nav86.csv',
 'training_set_2ap_loc2_nav82.csv',
 'training_set_3ap_loc32_nav86.csv']

In [4]:
train_files = filter_files("train", ".csv", files)
train_files

['training_set_2ap_loc0_nav82.csv',
 'training_set_2ap_loc1_nav86.csv',
 'training_set_3ap_loc31_nav86.csv',
 'training_set_3ap_loc33_nav82.csv',
 'training_set_3ap_loc33_nav88.csv',
 'training_set_3ap_loc30_nav82.csv',
 'training_set_3ap_loc31_nav82.csv',
 'training_set_2ap_loc1_nav82.csv',
 'training_set_2ap_loc0_nav86.csv',
 'training_set_3ap_loc32_nav82.csv',
 'training_set_3ap_loc30_nav86.csv',
 'training_set_2ap_loc2_nav82.csv',
 'training_set_3ap_loc32_nav86.csv']

In [5]:
import re

data_dict = {}

pattern = re.compile(r"training_set_(\d+)ap_loc(\d+)_nav(\d+).csv")
for file in train_files:
    match = pattern.search(file)
    if match:
        item = dict(
            df=pd.read_csv(os.path.join(data_dir, file)),
            ap=int(match.group(1)),
            loc=int(match.group(2)),
            nav=int(match.group(3)),
        )
        data_dict[file] = item

In [6]:
# process data with ap = 2, merge all dataframes into one

for key, value in data_dict.items():
    if value["ap"] == 3:
        print(key, value["ap"], value["loc"], value["nav"])
# training_set_2ap_loc2_nav82.csv, drop last 2 rows
data_dict["training_set_2ap_loc2_nav82.csv"]["df"] = data_dict[
    "training_set_2ap_loc2_nav82.csv"
]["df"].iloc[:-2]

df_3ap = pd.concat([value["df"] for value in data_dict.values() if value["ap"] == 3])

training_set_3ap_loc31_nav86.csv 3 31 86
training_set_3ap_loc33_nav82.csv 3 33 82
training_set_3ap_loc33_nav88.csv 3 33 88
training_set_3ap_loc30_nav82.csv 3 30 82
training_set_3ap_loc31_nav82.csv 3 31 82
training_set_3ap_loc32_nav82.csv 3 32 82
training_set_3ap_loc30_nav86.csv 3 30 86
training_set_3ap_loc32_nav86.csv 3 32 86


In [7]:
df_3ap.columns

Index(['test_id', 'test_dur', 'loc_id', 'protocol', 'pkt_len', 'bss_id',
       'ap_name', 'ap_mac', 'ap_id', 'pd', 'ed', 'nav', 'eirp',
       'ap_from_ap_0_sum_ant_rssi', 'ap_from_ap_0_max_ant_rssi',
       'ap_from_ap_0_mean_ant_rssi', 'ap_from_ap_1_sum_ant_rssi',
       'ap_from_ap_1_max_ant_rssi', 'ap_from_ap_1_mean_ant_rssi',
       'ap_from_ap_2_sum_ant_rssi', 'ap_from_ap_2_max_ant_rssi',
       'ap_from_ap_2_mean_ant_rssi', 'sta_mac', 'sta_id',
       'sta_to_ap_0_sum_ant_rssi', 'sta_to_ap_0_max_ant_rssi',
       'sta_to_ap_0_mean_ant_rssi', 'sta_to_ap_1_sum_ant_rssi',
       'sta_to_ap_1_max_ant_rssi', 'sta_to_ap_1_mean_ant_rssi',
       'sta_to_ap_2_sum_ant_rssi', 'sta_to_ap_2_max_ant_rssi',
       'sta_to_ap_2_mean_ant_rssi', 'sta_from_ap_0_sum_ant_rssi',
       'sta_from_ap_0_max_ant_rssi', 'sta_from_ap_0_mean_ant_rssi',
       'sta_from_ap_1_sum_ant_rssi', 'sta_from_ap_1_max_ant_rssi',
       'sta_from_ap_1_mean_ant_rssi', 'sta_from_ap_2_sum_ant_rssi',
       'sta_from_ap_

In [11]:
import numpy as np


def assign_ap_from_ap_rssi(row):
    if row["ap_id"] == "ap_0":
        ap_A, ap_B = "ap_1", "ap_2"
    elif row["ap_id"] == "ap_1":
        ap_A, ap_B = "ap_0", "ap_2"
    elif row["ap_id"] == "ap_2":
        ap_A, ap_B = "ap_0", "ap_1"

    row["ap_from_ap_A_sum_rssi"] = row[f"ap_from_{ap_A}_sum_ant_rssi"]
    row["ap_from_ap_A_max_rssi"] = row[f"ap_from_{ap_A}_max_ant_rssi"]
    row["ap_from_ap_A_mean_rssi"] = row[f"ap_from_{ap_A}_mean_ant_rssi"]

    row["ap_from_ap_B_sum_rssi"] = row[f"ap_from_{ap_B}_sum_ant_rssi"]
    row["ap_from_ap_B_max_rssi"] = row[f"ap_from_{ap_B}_max_ant_rssi"]
    row["ap_from_ap_B_mean_rssi"] = row[f"ap_from_{ap_B}_mean_ant_rssi"]

    return row


def assign_sta_from_sta_rssi(row):
    if row["sta_id"] == "sta_0":
        sta_A, sta_B = "sta_1", "sta_2"
    elif row["sta_id"] == "sta_1":
        sta_A, sta_B = "sta_0", "sta_2"
    elif row["sta_id"] == "sta_2":
        sta_A, sta_B = "sta_0", "sta_1"

    row["sta_from_sta_A_rssi"] = row[f"sta_from_{sta_A}_rssi"]
    row["sta_from_sta_B_rssi"] = row[f"sta_from_{sta_B}_rssi"]

    return row


df_3ap = df_3ap.apply(assign_ap_from_ap_rssi, axis=1)
df_3ap = df_3ap.apply(assign_sta_from_sta_rssi, axis=1)

# drop columns with "ap_from_ap_A_sum_rssi" = nan or "ap_from_ap_B_sum_rssi" = nan
df_3ap = df_3ap.dropna(subset=["ap_from_ap_A_sum_rssi", "ap_from_ap_B_sum_rssi"])

In [12]:
import ast

import pywt
import scipy.stats as stats
from scipy.fft import fft


def approximate_entropy(U, m, r):
    """计算近似熵 (Approximate Entropy)"""
    if len(U) < m + 1:
        return np.nan  # 数据不足，无法计算近似熵

    def _phi(m):
        x = np.array([U[i : i + m] for i in range(len(U) - m + 1)])
        C = np.sum(np.abs(x[:, None] - x[None, :]).max(axis=2) <= r, axis=0) / (
            len(U) - m + 1
        )
        return np.log(C).sum() / (len(U) - m + 1)

    return _phi(m) - _phi(m + 1)


def grouping_entropy(column_data, num_bins):
    """分组熵"""
    if len(column_data) == 0:
        return np.nan
    hist, _ = np.histogram(column_data, bins=num_bins)
    probabilities = hist / len(column_data)
    probabilities = probabilities[probabilities > 0]  # 过滤掉零概率
    if len(probabilities) == 0:
        return np.nan  # 避免 log2(0) 的情况
    return -np.sum(probabilities * np.log2(probabilities))


def extract_statistics_for_column(column_data):
    """
    针对单个RSSI列的数据提取统计特征
    :param column_data: 某一列的数据，元素是列表
    :return: 统计特征的字典
    """
    column_data = ast.literal_eval(column_data)
    if len(column_data) == 0:
        return {"error": "empty data"}

    # print(column_data)
    statistics = {}

    ### 基础统计量 ###
    statistics["org"] = column_data
    statistics["length"] = len(column_data)  # 数据长度
    statistics["max"] = np.max(column_data)  # 最大值
    statistics["min"] = np.min(column_data)  # 最小值
    statistics["median"] = np.median(column_data)  # 中位数
    statistics["range"] = statistics["max"] - statistics["min"]  # 范围
    statistics["iqr"] = np.percentile(column_data, 75) - np.percentile(
        column_data, 25
    )  # 四分位距
    statistics["mean"] = np.mean(column_data)  # 平均值
    statistics["var"] = np.var(column_data)  # 方差

    # 判断数据是否几乎相同，避免计算偏度和峰度时的精度丢失
    if np.var(column_data) < 1e-8:  # 设置一个非常小的阈值
        statistics["kurtosis"] = np.nan  # 跳过峰度计算
        statistics["skewness"] = np.nan  # 跳过偏度计算
    else:
        try:
            statistics["kurtosis"] = stats.kurtosis(column_data)  # 峰度
            statistics["skewness"] = stats.skew(column_data)  # 偏度
        except RuntimeWarning:
            statistics["kurtosis"] = np.nan
            statistics["skewness"] = np.nan

    if len(column_data) > 1:
        statistics["rate_of_change"] = np.diff(column_data).mean()  # 变化率
        statistics["sum_absolute_diff"] = np.sum(
            np.abs(np.diff(column_data))
        )  # 差分绝对和
    else:
        statistics["rate_of_change"] = np.nan
        statistics["sum_absolute_diff"] = np.nan

    ### 复杂统计量 ###
    # 检查数据点数是否足够计算回归
    if len(column_data) > 1:
        time = np.arange(len(column_data))
        try:
            slope, intercept, _, _, _ = stats.linregress(time, column_data)
            statistics["trend"] = slope  # 信号的趋势
        except RuntimeWarning:
            statistics["trend"] = np.nan
    else:
        statistics["trend"] = np.nan

    # Entropy
    value_counts = np.unique(column_data, return_counts=True)[1]
    probabilities = value_counts / len(column_data)
    if len(probabilities) > 0:
        statistics["entropy"] = -np.sum(probabilities * np.log2(probabilities))  # 熵
    else:
        statistics["entropy"] = np.nan

    # SNR
    signal_power = np.mean(np.square(column_data))
    noise_power = np.var(column_data)
    statistics["snr"] = (
        signal_power / noise_power if noise_power != 0 else np.nan
    )  # 信噪比

    # 自相关系数
    if len(column_data) > 1:
        statistics["autocorrelation"] = np.corrcoef(column_data[:-1], column_data[1:])[
            0, 1
        ]  # 自相关系数
    else:
        statistics["autocorrelation"] = np.nan

    # Approximate Entropy
    statistics["approximate_entropy"] = approximate_entropy(
        column_data, 2, 0.2 * np.std(column_data)
    )  # 近似熵

    # Grouping Entropy
    statistics["grouping_entropy"] = grouping_entropy(column_data, 10)  # 分组熵

    ### 频域统计量 ###
    # Fourier Coefficients
    if len(column_data) > 1:
        fft_coefficients = np.abs(fft(column_data))
        statistics["fourier_coefficients"] = np.mean(fft_coefficients)  # 傅里叶系数均值
    else:
        statistics["fourier_coefficients"] = np.nan

    # Wavelet Transform
    def deal_coeff(coeffs):
        # compute l2 norm for each list in coeffs
        l2_norm = [np.linalg.norm(c) for c in coeffs]
        # return mean of l2 norm
        return np.mean(l2_norm)

    if len(column_data) > 1:
        coeffs = pywt.wavedec(column_data, "db1")
        statistics["wavelet_coefficients"] = deal_coeff(coeffs)
    else:
        statistics["wavelet_coefficients"] = np.nan

    # fill nan with 0
    for key in statistics:
        if key != "org" and pd.isna(statistics[key]):
            statistics[key] = 0
    return statistics

In [13]:
# 找出所有需要提取统计特征的RSSI列
rssi_columns = [
    col
    for col in df_3ap.columns
    if col.endswith("rssi") and not re.match(r"sta_from_sta|ap_from_ap_[012]", col)
]

rssi_columns
# 处理每个 RSSI 列
all_statistics = {}
new_columns_tupled = []
for col in rssi_columns:
    # 对每个 RSSI 列进行统计特征提取
    stats_d = df_3ap[col].apply(extract_statistics_for_column)  # 逐行提取统计特征

    # 将提取的统计特征展开并作为子列添加
    all_statistics[col] = pd.DataFrame(stats_d.tolist(), index=df_3ap.index)

    new_columns_tupled.extend([(col, stat) for stat in all_statistics[col].columns])

df_3ap.drop(columns=rssi_columns, inplace=True)

raw_columns_tupled = [(col, "_") for col in df_3ap.columns]
# 合并统计特征
for col, stats_df in all_statistics.items():
    df_3ap = pd.concat([df_3ap, stats_df.add_prefix(f"{col}_")], axis=1)

df_3ap.columns = pd.MultiIndex.from_tuples(raw_columns_tupled + new_columns_tupled)

ValueError: malformed node or string: nan

In [10]:
assert len(new_columns_tupled) == len(rssi_columns) * len(
    all_statistics[rssi_columns[0]].columns
)

# ('sta_to_ap_0_sum_ant_rssi', 'org')
# drop columns with "sum" in name[0]
df_3ap.drop(columns=[col for col in df_3ap.columns if "sum" in col[0]], inplace=True)

In [11]:
df_3ap.to_csv("./df_3ap_1.csv", index=False)