In [8]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text chunks; each chunk is a whitespace-separated list of
# "key:value" fields that includes numeric data (user_age).
# NOTE(review): the age in the last chunk (21101010118) is not a plausible
# age — presumably a deliberate outlier or a mis-pasted identifier; confirm.
text_chunks = [
    "user_age:25 diagnosis:diabetes",
    "user_age:34 diagnosis:hypertension",
    "user_age:19 diagnosis:normal",
    "user_age:21101010118 diagnosis:hypertension"
]

def extract_numeric_features(texts):
    """Extract numeric features (age, encoded diagnosis) from text chunks.

    Each chunk is a whitespace-separated list of ``key:value`` fields, e.g.
    ``"user_age:25 diagnosis:diabetes"``. Fields are looked up by key, so
    their order inside a chunk does not matter.

    Parameters
    ----------
    texts : iterable of str
        Text chunks to parse.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(texts), 2)`` with dtype ``int64``.
        Column 0 is the age; column 1 is 1 when the ``diagnosis`` field
        contains ``"diabetes"`` and 0 otherwise (simplified disease encoding).
        Empty input yields an array of shape ``(0, 2)``.

    Raises
    ------
    ValueError
        If a chunk has no ``user_age`` field or its value is not an integer.
    """
    rows = []
    for text in texts:
        # Tokens without a ":" separator carry no field and are ignored.
        fields = dict(
            token.split(":", 1) for token in text.split() if ":" in token
        )
        if "user_age" not in fields:
            raise ValueError(f"missing 'user_age' field in chunk: {text!r}")
        age = int(fields["user_age"])
        # Only inspect the diagnosis field: a substring test over the whole
        # chunk would also fire on unrelated fields that mention "diabetes".
        diagnosis = 1 if "diabetes" in fields.get("diagnosis", "") else 0
        rows.append((age, diagnosis))
    # Explicit dtype and reshape keep the result (n, 2) int64 on every
    # platform, including for empty input.
    return np.array(rows, dtype=np.int64).reshape(-1, 2)

# Build the feature matrix (one row per chunk: age, diagnosis flag) and show it.
features = extract_numeric_features(text_chunks)
print("原始特征矩阵:\n", features)

原始特征矩阵:
 [[         25           1]
 [         34           0]
 [         19           0]
 [21101010118           0]]


In [9]:
class LaplaceDP:
    """Laplace mechanism: perturb numeric data with zero-mean Laplace noise.

    The noise scale is ``sensitivity / epsilon``, so a smaller ``epsilon``
    (stronger privacy) or a larger ``sensitivity`` produces more noise.

    Parameters
    ----------
    epsilon : float
        Privacy budget; must be strictly positive.
    sensitivity : float or array-like, default 1
        Non-negative sensitivity of the released values. A scalar is applied
        to every element; an array-like is broadcast against the data (for
        example one value per column).
        NOTE(review): the default of 1 presumes neighbouring datasets change
        each released value by at most 1. When raw values such as ages are
        released directly, the sensitivity is normally the width of the value
        range — confirm before relying on the privacy guarantee.
    seed : int, optional
        Seed for a dedicated random generator, giving reproducible noise.
        When omitted, NumPy's global ``np.random`` state is used.

    Raises
    ------
    ValueError
        If ``epsilon`` is not > 0 or any ``sensitivity`` value is negative
        or NaN.
    """

    def __init__(self, epsilon, sensitivity=1, seed=None):
        # Fail fast instead of dividing by zero (or passing a negative scale
        # to NumPy) later inside add_noise.
        if not epsilon > 0:
            raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
        if not np.all(np.asarray(sensitivity, dtype=float) >= 0):
            raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")
        self.epsilon = epsilon
        self.sensitivity = sensitivity
        # Both the np.random module and a Generator expose the same
        # laplace(loc, scale, size) call, so either can serve as the source.
        self._rng = np.random if seed is None else np.random.default_rng(seed)

    def add_noise(self, data):
        """Return ``data`` plus independent Laplace noise, one draw per element.

        Parameters
        ----------
        data : array-like
            Numeric values to perturb; lists are converted to arrays.

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``data`` holding the noisy values.
        """
        values = np.asarray(data)
        scale = np.asarray(self.sensitivity, dtype=float) / self.epsilon
        noise = self._rng.laplace(loc=0, scale=scale, size=values.shape)
        return values + noise

# Apply differential privacy (ε=0.5, sensitivity=1).
# NOTE(review): the same noise scale (sensitivity / ε = 2) is applied to every
# element of `features`, i.e. to both the age and the diagnosis column —
# confirm that a single sensitivity is appropriate for both.
# NOTE(review): no random seed is set in the visible cells, so the printed
# values are expected to differ between runs.
dp = LaplaceDP(epsilon=0.5, sensitivity=1)
noisy_features = dp.add_noise(features)
print("加噪后特征矩阵:\n", noisy_features)

加噪后特征矩阵:
 [[ 2.59971353e+01 -6.45555917e-01]
 [ 3.25619792e+01  1.92749224e+00]
 [ 1.93653717e+01 -4.38259783e+00]
 [ 2.11010101e+10  1.66125826e+00]]


In [10]:
def reconstruct_text(noisy_features):
    """Turn a noisy feature matrix back into ``user_age:… diagnosis:…`` text.

    Parameters
    ----------
    noisy_features : array-like of shape (n, 2)
        Column 0 holds the (noisy) ages, column 1 the (noisy) diagnosis score.

    Returns
    -------
    list of str
        One ``"user_age:<age> diagnosis:<label>"`` string per row. The age is
        rounded to the nearest integer and clamped at 0; the label is
        ``"diabetes"`` when the score exceeds 0.5 and ``"normal"`` otherwise.
    """
    matrix = np.asarray(noisy_features, dtype=float)
    # Noise can push a small age below zero; clamping is pure post-processing
    # of the already-noised value and keeps the emitted age non-negative.
    rounded_ages = np.clip(np.round(matrix[:, 0]), 0, None)
    # Explicit int64: plain ``int`` maps to a 32-bit integer on some platforms
    # (e.g. Windows with NumPy 1.x), which would overflow on large values.
    ages = rounded_ages.astype(np.int64).tolist()
    labels = np.where(matrix[:, 1] > 0.5, "diabetes", "normal").tolist()
    return [f"user_age:{age} diagnosis:{label}" for age, label in zip(ages, labels)]

# Convert the noisy feature matrix back into text chunks and show them.
noisy_texts = reconstruct_text(noisy_features)
print("脱敏后文本:\n", noisy_texts)

脱敏后文本:
 ['user_age:26 diagnosis:normal', 'user_age:33 diagnosis:diabetes', 'user_age:19 diagnosis:normal', 'user_age:21101010119 diagnosis:diabetes']
