In [15]:
import torch

# 打印Torch版本
print("Torch version:", torch.__version__)

# 打印CUDA版本（如果CUDA可用）
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
else:
    print("CUDA is not available.")


Torch version: 2.5.1+cu121
CUDA version: 12.1


In [16]:
%cd /kaggle/input/all-datas

/kaggle/input/all-datas


In [18]:
#!pip uninstall -y torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric pyg-lib
# !pip install -U torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
!pip install -q torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.5.1+cu121.html


Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html


# util

In [19]:
from torch_geometric.utils import add_remaining_self_loops, degree
from torch_scatter import scatter
import random
import torch
import os
import numpy as np
import torch.nn.functional as F
from torch_sparse import SparseTensor, matmul, fill_diag, sum as sparsesum, mul
import pandas as pd



def propagate(x, edge_index, edge_weight=None):
    edge_index, _ = add_remaining_self_loops(edge_index, num_nodes=x.size(0))

    # calculate the degree normalize term
    row, col = edge_index
    deg = degree(col, x.size(0), dtype=x.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    # for the first order appro of laplacian matrix in GCN, we use deg_inv_sqrt[row]*deg_inv_sqrt[col]
    if(edge_weight == None):
        edge_weight = deg_inv_sqrt[row] * deg_inv_sqrt[col]

    # normalize the features on the starting point of the edge
    out = edge_weight.view(-1, 1) * x[row]

    return scatter(out, edge_index[-1], dim=0, dim_size=x.size(0), reduce='add')

def propagate2(x, edge_index):
    edge_index, _ = add_remaining_self_loops(
        edge_index, num_nodes=x.size(0))

    # calculate the degree normalize term
    row, col = edge_index
    deg = degree(col, x.size(0), dtype=x.dtype)
    deg_inv_sqrt = deg.pow(-0.5)
    # for the first order appro of laplacian matrix in GCN, we use deg_inv_sqrt[row]*deg_inv_sqrt[col]
    edge_weight = deg_inv_sqrt[row] * deg_inv_sqrt[col]

    # normalize the features on the starting point of the edge
    out = edge_weight.view(-1, 1) * x[row]

    return scatter(out, edge_index[-1], dim=0, dim_size=x.size(0), reduce='add')


def seed_everything(seed=0):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.backends.cudnn.allow_tf32 = False

    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True
    # torch.use_deterministic_algorithms(True)


def fair_metric(pred, labels, sens):
    idx_s0 = sens == 0
    idx_s1 = sens == 1
    idx_s0_y1 = np.bitwise_and(idx_s0, labels == 1)
    idx_s1_y1 = np.bitwise_and(idx_s1, labels == 1)
    parity = abs(sum(pred[idx_s0]) / sum(idx_s0) -
                 sum(pred[idx_s1]) / sum(idx_s1))
    equality = abs(sum(pred[idx_s0_y1]) / sum(idx_s0_y1) -
                   sum(pred[idx_s1_y1]) / sum(idx_s1_y1))
    return parity.item(), equality.item()

def random_drop_edges(adj, drop_prob):
    mask = torch.rand(adj.size()) > drop_prob
    adj = adj * mask
    adj = adj + adj.t() - adj * adj.t()
    return adj


# def D(x1, x2):  # negative cosine similarity
#     return -F.cosine_similarity(x1, x2, dim=-1).mean()

def D(x1, x2):
    x1 = F.normalize(x1, dim=-1)
    x2 = F.normalize(x2, dim=-1)
    return 1 - F.cosine_similarity(x1, x2, dim=-1).mean()


def ContrastiveLoss(p, z, T=.07):
    # normalize
    p = F.normalize(p, dim=1)
    z = F.normalize(z, dim=1)
    # Einstein sum is more intuitive
    logits = torch.einsum('nc,mc->nm', [p, z]) / T
    labels = torch.arange(logits.shape[0], dtype=torch.long, device=logits.device)
    return nn.CrossEntropyLoss()(logits, labels)

def CE(p, z):
        return - (z.softmax(dim=1) * p.log_softmax(dim=1)).mean()


# dataset

In [20]:
import pandas as pd
import os
import numpy as np
import random
from torch_geometric.utils import from_scipy_sparse_matrix
import scipy.sparse as sp
from scipy.spatial import distance_matrix
from torch_geometric.data import Data
import torch
import scipy.sparse as sp


def index_to_mask(node_num, index):
    mask = torch.zeros(node_num, dtype=torch.bool)
    mask[index] = 1

    return mask


def sys_normalized_adjacency(adj):
    adj = sp.coo_matrix(adj)
    adj = adj + sp.eye(adj.shape[0])
    row_sum = np.array(adj.sum(1))
    row_sum = (row_sum == 0) * 1 + row_sum
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)

    return torch.sparse.FloatTensor(indices, values, shape)


def feature_norm(features):
    min_values = features.min(axis=0)[0]
    max_values = features.max(axis=0)[0]
    return 2 * (features - min_values).div(max_values - min_values) - 1


def build_relationship(x, thresh=0.25):
    df_euclid = pd.DataFrame(
        1 / (1 + distance_matrix(x.T.T, x.T.T)), columns=x.T.columns, index=x.T.columns)
    df_euclid = df_euclid.to_numpy()
    idx_map = []
    for ind in range(df_euclid.shape[0]):
        max_sim = np.sort(df_euclid[ind, :])[-2]
        neig_id = np.where(df_euclid[ind, :] > thresh * max_sim)[0]
        import random
        random.seed(912)
        random.shuffle(neig_id)
        for neig in neig_id:
            if neig != ind:
                idx_map.append([ind, neig])
    # print('building edge relationship complete')
    idx_map = np.array(idx_map)

    return idx_map


def load_credit(dataset, sens_attr="Age", predict_attr="NoDefaultNextMonth", path="./credit/", label_number=1000):
    # print('Loading {} dataset from {}'.format(dataset, path))
    idx_features_labels = pd.read_csv(
        os.path.join(path, "{}.csv".format(dataset)))
    header = list(idx_features_labels.columns)
    header.remove(predict_attr)
    header.remove('Single')

    # sensitive feature removal
    # header.remove('Age')

#    # Normalize MaxBillAmountOverLast6Months
#    idx_features_labels['MaxBillAmountOverLast6Months'] = (idx_features_labels['MaxBillAmountOverLast6Months']-idx_features_labels['MaxBillAmountOverLast6Months'].mean())/idx_features_labels['MaxBillAmountOverLast6Months'].std()
#
#    # Normalize MaxPaymentAmountOverLast6Months
#    idx_features_labels['MaxPaymentAmountOverLast6Months'] = (idx_features_labels['MaxPaymentAmountOverLast6Months'] - idx_features_labels['MaxPaymentAmountOverLast6Months'].mean())/idx_features_labels['MaxPaymentAmountOverLast6Months'].std()
#
#    # Normalize MostRecentBillAmount
#    idx_features_labels['MostRecentBillAmount'] = (idx_features_labels['MostRecentBillAmount']-idx_features_labels['MostRecentBillAmount'].mean())/idx_features_labels['MostRecentBillAmount'].std()
#
#    # Normalize MostRecentPaymentAmount
#    idx_features_labels['MostRecentPaymentAmount'] = (idx_features_labels['MostRecentPaymentAmount']-idx_features_labels['MostRecentPaymentAmount'].mean())/idx_features_labels['MostRecentPaymentAmount'].std()
#
#    # Normalize TotalMonthsOverdue
#    idx_features_labels['TotalMonthsOverdue'] = (idx_features_labels['TotalMonthsOverdue']-idx_features_labels['TotalMonthsOverdue'].mean())/idx_features_labels['TotalMonthsOverdue'].std()

    # build relationship
    if os.path.exists(f'{path}/{dataset}_edges.txt'):
        edges_unordered = np.genfromtxt(
            f'{path}/{dataset}_edges.txt').astype('int')
    else:
        edges_unordered = build_relationship(
            idx_features_labels[header], thresh=0.7)
        np.savetxt(f'{path}/{dataset}_edges.txt', edges_unordered)

    relate_feature_names = [
        'EducationLevel',                # 受教育程度
        'Married',                       # 婚姻状态
        'MaxBillAmountOverLast6Months',  # 最近6个月最大账单
        'MaxPaymentAmountOverLast6Months', # 最近6个月最大支付
        'TotalMonthsOverdue'             # 逾期总月份
    ]
    relate_features = sp.csr_matrix(idx_features_labels[relate_feature_names], dtype=np.float32)

    
    features = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    header.remove('Age')
    print("Credit 数据集特征：", header)
    features_nosens = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    
    # print(features)
    labels = idx_features_labels[predict_attr].values

    idx = np.arange(features.shape[0])
    idx_map = {j: i for i, j in enumerate(idx)}
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=int).reshape(edges_unordered.shape)

    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = adj + sp.eye(adj.shape[0])
    adj_norm = sys_normalized_adjacency(adj)
    adj_norm_sp = sparse_mx_to_torch_sparse_tensor(adj_norm)

    edge_index, _ = from_scipy_sparse_matrix(adj)

    features = torch.FloatTensor(np.array(features.todense()))
    relate_features = torch.FloatTensor(np.array(relate_features.todense()))
    features_nosens = torch.FloatTensor(np.array(features_nosens.todense()))
    labels = torch.LongTensor(labels)

    #
    #

    import random
    random.seed(20)
    label_idx_0 = np.where(labels == 0)[0]
    label_idx_1 = np.where(labels == 1)[0]
    random.shuffle(label_idx_0)
    random.shuffle(label_idx_1)

    idx_train = np.append(label_idx_0[:min(int(0.5 * len(label_idx_0)), label_number // 2)],
                          label_idx_1[:min(int(0.5 * len(label_idx_1)), label_number // 2)])
    idx_val = np.append(label_idx_0[int(0.5 * len(label_idx_0)):int(0.75 * len(
        label_idx_0))], label_idx_1[int(0.5 * len(label_idx_1)):int(0.75 * len(label_idx_1))])
    idx_test = np.append(label_idx_0[int(
        0.75 * len(label_idx_0)):], label_idx_1[int(0.75 * len(label_idx_1)):])

    sens = idx_features_labels[sens_attr].values.astype(int)
    sens = torch.LongTensor(sens)

    train_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_train))
    val_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_val))
    test_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_test))
    from collections import Counter
    print('predict_attr:',Counter(idx_features_labels[predict_attr]))
    print('sens_attr:',Counter(idx_features_labels[sens_attr]))
    return adj_norm_sp, edge_index, features, features_nosens, relate_features, labels, train_mask, val_mask, test_mask, sens, adj


def load_bail(dataset, sens_attr="WHITE", predict_attr="RECID", path="./bail/", label_number=1000):
    # print('Loading {} dataset from {}'.format(dataset, path))
    idx_features_labels = pd.read_csv(
        os.path.join(path, "{}.csv".format(dataset)))
    header = list(idx_features_labels.columns)
    header.remove(predict_attr)

    # sensitive feature removal
    # header.remove('WHITE')

    # # Normalize School
    # idx_features_labels['SCHOOL'] = 2*(idx_features_labels['SCHOOL']-idx_features_labels['SCHOOL'].min()).div(idx_features_labels['SCHOOL'].max() - idx_features_labels['SCHOOL'].min()) - 1

    # # Normalize RULE
    # idx_features_labels['RULE'] = 2*(idx_features_labels['RULE']-idx_features_labels['RULE'].min()).div(idx_features_labels['RULE'].max() - idx_features_labels['RULE'].min()) - 1

    # # Normalize AGE
    # idx_features_labels['AGE'] = 2*(idx_features_labels['AGE']-idx_features_labels['AGE'].min()).div(idx_features_labels['AGE'].max() - idx_features_labels['AGE'].min()) - 1

    # # Normalize TSERVD
    # idx_features_labels['TSERVD'] = 2*(idx_features_labels['TSERVD']-idx_features_labels['TSERVD'].min()).div(idx_features_labels['TSERVD'].max() - idx_features_labels['TSERVD'].min()) - 1

    # # Normalize FOLLOW
    # idx_features_labels['FOLLOW'] = 2*(idx_features_labels['FOLLOW']-idx_features_labels['FOLLOW'].min()).div(idx_features_labels['FOLLOW'].max() - idx_features_labels['FOLLOW'].min()) - 1

    # # Normalize TIME
    # idx_features_labels['TIME'] = 2*(idx_features_labels['TIME']-idx_features_labels['TIME'].min()).div(idx_features_labels['TIME'].max() - idx_features_labels['TIME'].min()) - 1

    # build relationship
    if os.path.exists(f'{path}/{dataset}_edges.txt'):
        edges_unordered = np.genfromtxt(
            f'{path}/{dataset}_edges.txt').astype('int')
    else:
        edges_unordered = build_relationship(
            idx_features_labels[header], thresh=0.6)
        np.savetxt(f'{path}/{dataset}_edges.txt', edges_unordered)

    relate_feature_names = [
        'SCHOOL',   # 教育水平或上学经历
        'WORKREL',  # 工作/就业关系
        'PROPTY',   # 财产状况
        'PRIORS',   # 过去犯罪记录数量
        'FELON'    # 是否有重罪记录
    ]
    relate_features = sp.csr_matrix(idx_features_labels[relate_feature_names], dtype=np.float32)

    features = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    header.remove('WHITE')
    print("Bail 数据集特征：", header)
    features_nosens = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    
    labels = idx_features_labels[predict_attr].values

    idx = np.arange(features.shape[0])
    idx_map = {j: i for i, j in enumerate(idx)}
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=int).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = adj + sp.eye(adj.shape[0])
    adj_norm = sys_normalized_adjacency(adj)
    adj_norm_sp = sparse_mx_to_torch_sparse_tensor(adj_norm)

    edge_index, _ = from_scipy_sparse_matrix(adj)

    features = torch.FloatTensor(np.array(features.todense()))
    relate_features = torch.FloatTensor(np.array(relate_features.todense()))
    features_nosens = torch.FloatTensor(np.array(features_nosens.todense()))
    labels = torch.LongTensor(labels)

    # print(features)

    # features = normalize(features)
    # adj = adj + sp.eye(adj.shape[0])

    # features = torch.FloatTensor(np.array(features.todense()))
    # labels = torch.LongTensor(labels)

    import random
    random.seed(20)
    label_idx_0 = np.where(labels == 0)[0]
    label_idx_1 = np.where(labels == 1)[0]
    random.shuffle(label_idx_0)
    random.shuffle(label_idx_1)
    idx_train = np.append(label_idx_0[:min(int(0.5 * len(label_idx_0)), label_number // 2)],
                          label_idx_1[:min(int(0.5 * len(label_idx_1)), label_number // 2)])
    idx_val = np.append(label_idx_0[int(0.5 * len(label_idx_0)):int(0.75 * len(
        label_idx_0))], label_idx_1[int(0.5 * len(label_idx_1)):int(0.75 * len(label_idx_1))])
    idx_test = np.append(label_idx_0[int(
        0.75 * len(label_idx_0)):], label_idx_1[int(0.75 * len(label_idx_1)):])

    sens = idx_features_labels[sens_attr].values.astype(int)
    sens = torch.LongTensor(sens)
    train_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_train))
    val_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_val))
    test_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_test))
    from collections import Counter
    print('predict_attr:',Counter(idx_features_labels[predict_attr]))
    print('sens_attr:',Counter(idx_features_labels[sens_attr]))
    return adj_norm_sp, edge_index, features, features_nosens, relate_features, labels, train_mask, val_mask, test_mask, sens, adj


def load_german(dataset, sens_attr="Gender", predict_attr="GoodCustomer", path="./german/", label_number=1000):
    # print('Loading {} dataset from {}'.format(dataset, path))
    idx_features_labels = pd.read_csv(
        os.path.join(path, "{}.csv".format(dataset)))
    header = list(idx_features_labels.columns)
    header.remove(predict_attr)
    header.remove('OtherLoansAtStore')
    header.remove('PurposeOfLoan')

    # Sensitive Attribute
    idx_features_labels['Gender'][idx_features_labels['Gender']
                                  == 'Female'] = 1
    idx_features_labels['Gender'][idx_features_labels['Gender'] == 'Male'] = 0

#    for i in range(idx_features_labels['PurposeOfLoan'].unique().shape[0]):
#        val = idx_features_labels['PurposeOfLoan'].unique()[i]
#        idx_features_labels['PurposeOfLoan'][idx_features_labels['PurposeOfLoan'] == val] = i

#    # Normalize LoanAmount
#    idx_features_labels['LoanAmount'] = 2*(idx_features_labels['LoanAmount']-idx_features_labels['LoanAmount'].min()).div(idx_features_labels['LoanAmount'].max() - idx_features_labels['LoanAmount'].min()) - 1
#
#    # Normalize Age
#    idx_features_labels['Age'] = 2*(idx_features_labels['Age']-idx_features_labels['Age'].min()).div(idx_features_labels['Age'].max() - idx_features_labels['Age'].min()) - 1
#
#    # Normalize LoanDuration
#    idx_features_labels['LoanDuration'] = 2*(idx_features_labels['LoanDuration']-idx_features_labels['LoanDuration'].min()).div(idx_features_labels['LoanDuration'].max() - idx_features_labels['LoanDuration'].min()) - 1
#
    # build relationship
    if os.path.exists(f'{path}/{dataset}_edges.txt'):
        edges_unordered = np.genfromtxt(
            f'{path}/{dataset}_edges.txt').astype('int')
    else:
        edges_unordered = build_relationship(
            idx_features_labels[header], thresh=0.8)
        np.savetxt(f'{path}/{dataset}_edges.txt', edges_unordered)

    relate_features_names = [
        'Single',               # 婚姻状态
        'HasCoapplicant',       # 是否有共同贷款申请人
        'YearsAtCurrentJob_lt_1',  # 当前工作年限<1
        'YearsAtCurrentJob_geq_4', # 当前工作年限>=4
        'JobClassIsSkilled',    # 是否属于熟练职业
        'Age' ]                 # 年龄
 
    relate_features = sp.csr_matrix(idx_features_labels[relate_features_names], dtype=np.float32)
    
    features = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    
    header.remove('Gender')
    print("German 数据集特征：", header)
    features_nosens = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    
    labels = idx_features_labels[predict_attr].values
    labels[labels == -1] = 0

    idx = np.arange(features.shape[0])
    idx_map = {j: i for i, j in enumerate(idx)}
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=int).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)
    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = adj + sp.eye(adj.shape[0])

    adj_norm = sys_normalized_adjacency(adj)
    adj_norm_sp = sparse_mx_to_torch_sparse_tensor(adj_norm)

    edge_index, _ = from_scipy_sparse_matrix(adj)

    features = torch.FloatTensor(np.array(features.todense()))
    relate_features = torch.FloatTensor(np.array(relate_features.todense()))
    features_nosens = torch.FloatTensor(np.array(features_nosens.todense()))
    
    labels = torch.LongTensor(labels)

    # features = torch.FloatTensor(np.array(features.todense()))
    # labels = torch.LongTensor(labels)

    import random
    random.seed(20)
    label_idx_0 = np.where(labels == 0)[0]
    label_idx_1 = np.where(labels == 1)[0]
    random.shuffle(label_idx_0)
    random.shuffle(label_idx_1)
    idx_train = np.append(label_idx_0[:min(int(0.5 * len(label_idx_0)), label_number // 2)],
                          label_idx_1[:min(int(0.5 * len(label_idx_1)), label_number // 2)])
    idx_val = np.append(label_idx_0[int(0.5 * len(label_idx_0)):int(0.75 * len(
        label_idx_0))], label_idx_1[int(0.5 * len(label_idx_1)):int(0.75 * len(label_idx_1))])
    idx_test = np.append(label_idx_0[int(
        0.75 * len(label_idx_0)):], label_idx_1[int(0.75 * len(label_idx_1)):])

    sens = idx_features_labels[sens_attr].values.astype(int)
    sens = torch.LongTensor(sens)

    train_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_train))
    val_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_val))
    test_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_test))
    from collections import Counter
    print('predict_attr:',Counter(idx_features_labels[predict_attr]))
    print('sens_attr:',Counter(idx_features_labels[sens_attr]))
    return adj_norm_sp, edge_index, features, features_nosens, relate_features, labels, train_mask, val_mask, test_mask, sens, adj

def load_pokec(dataset,sens_attr="region",predict_attr="I_am_working_in_field", path="./pokec/", label_number=3000,sens_number=500,seed=20,test_idx=True):
    """Load data"""
    print('Loading {} dataset from {}'.format(dataset,path))

    idx_features_labels = pd.read_csv(os.path.join(path,"{}.csv".format(dataset)))
    header = list(idx_features_labels.columns)
    header.remove("user_id")

    # header.remove(sens_attr)
    header.remove(predict_attr)

    if(dataset == 'region_job_2'):
        relate_feature_names = [
             # 语言
            'spoken_languages_indicator', 'anglicky', 'nemecky', 'rusky', 'francuzsky', 'taliansky', 'slovensky', 'japonsky',
             # 兴趣爱好
            'hobbies_indicator', 'sportovanie', 'pozeranie filmov', 'surfovanie po webe', 'turistika',
             # 饮食偏好
            'I_most_enjoy_good_food_indicator', 'I_like_specialties_from_kitchen_indicator',
             # 音乐偏好
            'I_mostly_like_listening_to_music_indicator']
    elif(dataset == 'region_job'):
        print("进来了")
        relate_feature_names = [
            'spoken_languages_indicator', 'anglicky', 'nemecky', 'rusky', 'francuzsky', 'talianskej', 'slovenskej', 'japonskej',
            'hobbies_indicator', 'sportovanie', 'pozeranie filmov', 'surfovanie po webe', 'turistika',
            'I_most_enjoy_good_food_indicator', 'I_like_specialties_from_kitchen_indicator',
            'I_mostly_like_listening_to_music_indicator']

    
    features = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    relate_features = sp.csr_matrix(idx_features_labels[relate_feature_names], dtype=np.float32)
    header.remove('region')
    print(dataset,"数据集特征：", header)
    features_nosens = sp.csr_matrix(idx_features_labels[header], dtype=np.float32)
    labels = idx_features_labels[predict_attr].values
    
    
    # build graph
    idx = np.array(idx_features_labels["user_id"], dtype=int)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt(os.path.join(path,"{}_relationship.txt".format(dataset)), dtype=int)

    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=int).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)
    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    # features = normalize(features)
    adj = adj + sp.eye(adj.shape[0])

    adj_norm = sys_normalized_adjacency(adj)
    adj_norm_sp = sparse_mx_to_torch_sparse_tensor(adj_norm)

    edge_index, _ = from_scipy_sparse_matrix(adj)

    features = torch.FloatTensor(np.array(features.todense()))
    relate_features = torch.FloatTensor(np.array(relate_features.todense()))
    features_nosens = torch.FloatTensor(np.array(features_nosens.todense()))
    # num_classes = len(idx_features_labels[predict_attr].unique()) - 1
    # labels = torch.eye(num_classes)[labels]
    labels = torch.LongTensor(labels)
    # adj = sparse_mx_to_torch_sparse_tensor(adj)

    # import random
    # random.seed(seed)
    # label_idx = np.where(labels>=0)[0]
    # random.shuffle(label_idx)

    # idx_train = label_idx[:min(int(0.5 * len(label_idx)),label_number)]
    # idx_val = label_idx[int(0.5 * len(label_idx)):int(0.75 * len(label_idx))]
    # if test_idx:
    #     idx_test = label_idx[label_number:]
    #     idx_val = idx_test
    # else:
    #     idx_test = label_idx[int(0.75 * len(label_idx)):]

    import random
    random.seed(20)
    label_idx_0 = np.where(labels == 0)[0]
    label_idx_1 = np.where(labels > 0)[0]
    random.shuffle(label_idx_0)
    random.shuffle(label_idx_1)
    idx_train = np.append(label_idx_0[:min(int(0.5 * len(label_idx_0)), label_number // 2)],
                          label_idx_1[:min(int(0.5 * len(label_idx_1)), label_number // 2)])
    idx_val = np.append(label_idx_0[int(0.5 * len(label_idx_0)):int(0.75 * len(
        label_idx_0))], label_idx_1[int(0.5 * len(label_idx_1)):int(0.75 * len(label_idx_1))])
    idx_test = np.append(label_idx_0[int(
        0.75 * len(label_idx_0)):], label_idx_1[int(0.75 * len(label_idx_1)):])


    sens = idx_features_labels[sens_attr].values

    sens_idx = set(np.where(sens >= 0)[0])
    idx_test = np.asarray(list(sens_idx & set(idx_test)))
    sens = torch.FloatTensor(sens)
    idx_sens_train = list(sens_idx - set(idx_val) - set(idx_test))

    random.shuffle(idx_sens_train)
    idx_sens_train = torch.LongTensor(idx_sens_train[:sens_number])


    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    train_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_train))
    val_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_val))
    test_mask = index_to_mask(features.shape[0], torch.LongTensor(idx_test))

    # pokec data division
    labels[labels>1]=1
    if sens_attr:
        sens[sens>0]=1
        
    from collections import Counter
    print('predict_attr:',Counter(idx_features_labels[predict_attr]))
    print('sens_attr:',Counter(idx_features_labels[sens_attr]))
    print('total dimension:', features.shape)
    # random.shuffle(sens_idx)

    return adj_norm_sp, edge_index, features, features_nosens, relate_features, labels, train_mask, val_mask, test_mask, sens, adj 

def get_dataset(dataname):
    if(dataname == 'credit'):
        load, label_num = load_credit, 6000
    elif(dataname == 'bail'):
        load, label_num = load_bail, 100
    elif(dataname == 'german'):
        load, label_num = load_german, 100
    elif(dataname == 'pokec_z'):
        dataname = 'region_job'
        load, label_num = load_pokec, 3000
    elif(dataname == 'pokec_n'):
        dataname = 'region_job_2'
        load, label_num = load_pokec, 3000

    adj_norm_sp, edge_index, features, features_nosens , relate_features, labels, train_mask, val_mask, test_mask, sens, adj = load(
        dataset=dataname, label_number=label_num)

    if(dataname == 'credit'):
        sens_idx = 1
    elif(dataname == 'bail' or dataname == 'german'):
        sens_idx = 0
    elif(dataname == 'region_job' or dataname == 'region_job_2'):
        sens_idx = 3

    x_max, x_min = torch.max(features, dim=0)[
        0], torch.min(features, dim=0)[0]
    
    if(dataname != 'german'):
        norm_features = feature_norm(features)
        norm_features[:, sens_idx] = features[:, sens_idx]
        features = norm_features

    #wwj
    x_one = torch.ones_like(features)
    x_no_s_one = features_nosens
     
    return Data(adj=adj, x=features, x_f = features, x_one=x_one, x_no_s_one = x_no_s_one, x_no_s = features_nosens, x_relate = relate_features, edge_index=edge_index, adj_norm_sp=adj_norm_sp, y=labels.float(), train_mask=train_mask, val_mask=val_mask, test_mask=test_mask, sens=sens ,pred_s = sens), sens_idx, x_min, x_max


# Generate pseudo-sensitive attributes

In [21]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.decomposition import PCA

# ===== VGAE 模型部分（示例） =====
from torch_geometric.nn import VGAE, GCNConv
import torch.nn as nn
import torch.nn.functional as F




class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logvar = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)


def get_pseudo_sens(data,args):
    dataset_name = args.dataset  # 可换成 credit / bail / pokec_n / pokec_z
    print("===" + dataset_name + " ===")
    #data, sens_idx, x_min, x_max = get_dataset(dataset_name)

    if(args.input_x == 'all'):
        in_dim = data.x_no_s.shape[1]
        num_nodes = data.x_no_s.shape[0]
    elif (args.input_x == 'relate'):
        in_dim = data.x_relate.shape[1]
        num_nodes = data.x_relate.shape[0]
    
    hidden_dim = 16
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    encoder = GCNEncoder(in_dim, hidden_dim)
    model = VGAE(encoder).to(device)
    data = data.to(device)
    
    # VGAE 训练
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    model.train()
    for epoch in range(100):
        optimizer.zero_grad()
        if(args.input_x == 'all'):
            z = model.encode(data.x_no_s, data.edge_index)
        elif(args.input_x == 'relate'):
            z = model.encode(data.x_relate, data.edge_index)
        loss = model.recon_loss(z, data.edge_index)
        loss = loss + (1 / num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
        # if epoch % 20 == 0:
        #     print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
    
    # 得到节点嵌入
    model.eval()
    if(args.input_x == 'all'):
        z = model.encode(data.x_no_s, data.edge_index).detach().cpu().numpy()
    elif(args.input_x == 'relate'):
        z = model.encode(data.x_relate, data.edge_index).detach().cpu().numpy()

    
    # ===== 可选增强：PCA 降维 =====
    pca = PCA(n_components=4)  # 降到 4 维，也可以改成 2~4
    z_reduced = pca.fit_transform(z)

    # ====== 计算准确率 & 分布 ======
    true_sens = data.sens.cpu().numpy()
    

    if(args.jl == 'GMM'):
        # 保存到 data
        # ====== GMM 聚类 ======
        gmm = GaussianMixture(n_components=2, covariance_type="full", random_state=42, n_init=10)
        pred_s = gmm.fit_predict(z_reduced)

        # ===== 可选增强：平滑扰动 =====
        # 将 pred_s 转为 float，加一点随机噪声，再取 0/1
        noise_strength = 0.05  # 噪声强度，可调
        pred_s = pred_s.astype(float) + np.random.uniform(-noise_strength, noise_strength, size=pred_s.shape)
        pred_s = (pred_s > 0.5).astype(int)  # 重新二值化

        acc_gmm = max(accuracy_score(true_sens, pred_s),accuracy_score(true_sens, 1 - pred_s) )
        
        print("GMM 预测准确率:", acc_gmm)
        print("GMM 伪s分布:", Counter(pred_s))
        
    else:
        # ====== KMeans 聚类 ======
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=20)
        pred_s = kmeans.fit_predict(z_reduced)

        # ===== 可选增强：平滑扰动 =====
        # 将 pred_s 转为 float，加一点随机噪声，再取 0/1
        noise_strength = 0.05  # 噪声强度，可调
        pred_s = pred_s.astype(float) + np.random.uniform(-noise_strength, noise_strength, size=pred_s.shape)
        pred_s = (pred_s > 0.5).astype(int)  # 重新二值化
        
        # KMeans 可能会有 label 对调问题，所以取 max(准确率, 1-准确率)
        acc_kmeans = max(
            accuracy_score(true_sens, pred_s),
            accuracy_score(true_sens, 1 - pred_s)
        )
        print("KMeans 预测准确率:", acc_kmeans)
        print("KMeans 伪s分布:", Counter(pred_s))
        
    
    pred_s = torch.tensor(pred_s).to(device)
    return pred_s


# model

In [22]:
from torch.nn import Linear
import torch.nn.functional as F
from torch import nn
from torch.nn import Parameter
from torch_geometric.nn import GINConv, SAGEConv
from torch.nn.utils import spectral_norm


class MLP_projector(torch.nn.Module):
    def __init__(self, args):
        super(MLP_projector, self).__init__()
        self.args = args

        self.lin = Linear(args.hidden * 2, args.hidden)

    def reset_parameters(self):
        self.lin.reset_parameters()

    def forward(self, x, edge_index=None, mask_node=None):
        h = self.lin(x)

        return h


class MLP_discriminator(torch.nn.Module):
    def __init__(self, args):
        super(MLP_discriminator, self).__init__()
        self.args = args

        self.lin = Linear(args.hidden, 1)

    def reset_parameters(self):
        self.lin.reset_parameters()

    def forward(self, h, edge_index=None, mask_node=None):
        h = self.lin(h)

        return torch.sigmoid(h)


class MLP_encoder(torch.nn.Module):
    def __init__(self, args):
        super(MLP_encoder, self).__init__()
        self.args = args

        self.lin = Linear(args.num_features, args.hidden)

    def reset_parameters(self):
        self.lin.reset_parameters()

    def forward(self, x, edge_index=None, mask_node=None):
        h = self.lin(x)

        return h


class GCN_encoder_scatter(torch.nn.Module):
    def __init__(self, args):
        super(GCN_encoder_scatter, self).__init__()

        self.args = args

        self.lin = Linear(args.num_features, args.hidden, bias=False)

        self.bias = Parameter(torch.Tensor(args.hidden))

    def reset_parameters(self):
        self.lin.reset_parameters()
        self.bias.data.fill_(0.0)

    def forward(self, x, edge_index, adj_norm_sp):
        h = self.lin(x)
        h = propagate2(h, edge_index) + self.bias
        return h


class GCN_encoder_spmm(torch.nn.Module):
    def __init__(self, args):
        super(GCN_encoder_spmm, self).__init__()

        self.args = args

        self.lin = Linear(args.num_features, args.hidden, bias=False)
        self.bias = Parameter(torch.Tensor(args.hidden))

    def reset_parameters(self):
        self.lin.reset_parameters()
        self.bias.data.fill_(0.0)

    def forward(self, x, edge_index, adj_norm_sp):
        h = self.lin(x)
        h = torch.spmm(adj_norm_sp, h) + self.bias

        return h


class GIN_encoder(nn.Module):
    def __init__(self, args):
        super(GIN_encoder, self).__init__()

        self.args = args

        self.mlp = nn.Sequential(
            nn.Linear(args.num_features, args.hidden),
            # nn.ReLU(),
            nn.BatchNorm1d(args.hidden),
            # nn.Linear(args.hidden, args.hidden),
        )

        self.conv = GINConv(self.mlp)

    def reset_parameters(self):
        self.conv.reset_parameters()

    def forward(self, x, edge_index, adj_norm_sp):
        h = self.conv(x, edge_index)
        return h


class SAGE_encoder(nn.Module):
    def __init__(self, args):
        super(SAGE_encoder, self).__init__()

        self.args = args

        self.conv1 = SAGEConv(args.num_features, args.hidden, normalize=True)
        self.conv1.aggr = 'mean'
        self.transition = nn.Sequential(
            nn.ReLU(),
            nn.BatchNorm1d(args.hidden),
            nn.Dropout(p=args.dropout)
        )
        self.conv2 = SAGEConv(args.hidden, args.hidden, normalize=True)
        self.conv2.aggr = 'mean'

    def reset_parameters(self):
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, x, edge_index, adj_norm_sp):
        x = self.conv1(x, edge_index)
        x = self.transition(x)
        h = x
        # h = self.conv2(x, edge_index)
        return h


class MLP_classifier(torch.nn.Module):
    def __init__(self, args):
        super(MLP_classifier, self).__init__()
        self.args = args

        self.lin = Linear(args.hidden, args.num_classes)

    def reset_parameters(self):
        self.lin.reset_parameters()

    def forward(self, h, edge_index=None):
        h = self.lin(h)

        return h


# evaluation

In [23]:
import torch.nn.functional as F
import torch
from sklearn.metrics import f1_score, roc_auc_score

def evaluate(x, classifier, hp, encoder_m, encoder_g, projector, data, args):
    classifier.eval()
    encoder_m.eval()
    encoder_g.eval()
    projector.eval()

    with torch.no_grad():
        if(args.have_s == 'yes' or args.have_s == 'preds'):
            h_m = encoder_m(data.x , data.edge_index, data.adj_norm_sp)
            h_g = encoder_g(data.x_one, data.edge_index, data.adj_norm_sp)
            #print("eval input features have sens")
        elif (args.have_s == 'nos'):
            h_m = encoder_m(data.x_no_s , data.edge_index, data.adj_norm_sp)
            h_g = encoder_g(data.x_no_s_one, data.edge_index, data.adj_norm_sp)
            #print("eval input features don't have  sens ")
        
        h_c = torch.cat((h_m, h_g), dim=1)
        h = projector(h_c)
        output = classifier(h)

    accs, auc_rocs, F1s, paritys, equalitys = {}, {}, {}, {}, {}

    pred_val = (output[data.val_mask].squeeze() > 0).type_as(data.y)
    pred_test = (output[data.test_mask].squeeze() > 0).type_as(data.y)

    accs['val'] = pred_val.eq(
        data.y[data.val_mask]).sum().item() / data.val_mask.sum().item()
    accs['test'] = pred_test.eq(
        data.y[data.test_mask]).sum().item() / data.test_mask.sum().item()

    F1s['val'] = f1_score(data.y[data.val_mask].cpu(
    ).numpy(), pred_val.cpu().numpy())

    F1s['test'] = f1_score(data.y[data.test_mask].cpu(
    ).numpy(), pred_test.cpu().numpy())

    auc_rocs['val'] = roc_auc_score(
        data.y[data.val_mask].cpu().numpy(), output[data.val_mask].detach().cpu().numpy())
    auc_rocs['test'] = roc_auc_score(
        data.y[data.test_mask].cpu().numpy(), output[data.test_mask].detach().cpu().numpy())
    
    
    paritys['val'], equalitys['val'] = fair_metric(pred_val.cpu().numpy(), data.y[data.val_mask].cpu(
    ).numpy(), data.pred_s[data.val_mask].cpu().numpy())
    paritys['test'], equalitys['test'] = fair_metric(pred_test.cpu().numpy(), data.y[data.test_mask].cpu(
    ).numpy(), data.sens[data.test_mask].cpu().numpy())

    return accs, auc_rocs, F1s, paritys, equalitys


# DFN

In [24]:
import argparse
from tqdm import tqdm
from torch import tensor
import warnings

warnings.filterwarnings('ignore')
import math

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from torch.optim.lr_scheduler import ExponentialLR
import time
from memory_profiler import memory_usage


class MLP(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def reset_parameters(self):
        self.fc1.reset_parameters()
        self.fc2.reset_parameters()
        self.fc3.reset_parameters()


def run(data, args):
    pbar = tqdm(range(args.runs), unit='run')
    criterion = nn.BCELoss()
    acc, f1, auc_roc, parity, equality = np.zeros(args.runs), np.zeros(
        args.runs), np.zeros(args.runs), np.zeros(args.runs), np.zeros(args.runs)

    data = data.to(args.device)

    # projector
    projector = MLP_projector(args).to(args.device)
    optimizer_p = torch.optim.Adam([
        dict(params=projector.lin.parameters(), weight_decay=args.e_wd)], lr=args.e_lr)

    #discriminator
    discriminator = MLP_discriminator(args).to(args.device)
    optimizer_d = torch.optim.Adam([
        dict(params=discriminator.lin.parameters(), weight_decay=args.d_wd)], lr=args.d_lr)

    # classifier
    classifier = MLP_classifier(args).to(args.device)
    optimizer_c = torch.optim.Adam([
        dict(params=classifier.lin.parameters(), weight_decay=args.c_wd)], lr=args.c_lr)

    # original feature + MLP
    encoder_m = MLP_encoder(args).to(args.device)
    optimizer_em = torch.optim.Adam([
        dict(params=encoder_m.lin.parameters(), weight_decay=args.e_wd)], lr=args.e_lr)

    # # original feature + MLP
    # encoder_mf = MLP_encoder(args).to(args.device)
    # optimizer_emf = torch.optim.Adam([
    #     dict(params=encoder_m.lin.parameters(), weight_decay=args.e_wd)], lr=args.e_lr)
    
    # all-one feature + MLP
    if args.prop == 'scatter':
        encoder_g = GCN_encoder_scatter(args).to(args.device)
    else:
        encoder_g = GCN_encoder_spmm(args).to(args.device)
    optimizer_eg = torch.optim.Adam([
        dict(params=encoder_g.lin.parameters(), weight_decay=args.e_wd),
        dict(params=encoder_g.bias, weight_decay=args.e_wd)], lr=args.e_lr)



    for count in pbar:
        seed_everything(count + args.seed)
        discriminator.reset_parameters()
        classifier.reset_parameters()
        encoder_m.reset_parameters()
        #encoder_mf.reset_parameters()
        encoder_g.reset_parameters()
        projector.reset_parameters()

        best_val_tradeoff = 0
        best_val_loss = math.inf

        for epoch in range(0, args.epochs):
            # train classifier
            classifier.train()
            encoder_m.train()
            #encoder_mf.train()
            encoder_g.train()
            projector.train()
            for epoch_c in range(0, args.c_epochs):
                optimizer_c.zero_grad()
                optimizer_em.zero_grad()
                #optimizer_emf.zero_grad()
                optimizer_eg.zero_grad()

                if(args.have_s == 'yes' or args.have_s == 'preds'):
                    h_m = encoder_m(data.x , data.edge_index, data.adj_norm_sp)
                    h_mf = encoder_m(data.x_f,data.edge_index, data.adj_norm_sp)
                    h_g = encoder_g(data.x_one , data.edge_index, data.adj_norm_sp)
                    #print("train input features have sens")
                elif args.have_s == 'nos' :
                    h_m = encoder_m(data.x_no_s , data.edge_index, data.adj_norm_sp)
                    h_g = encoder_g(data.x_no_s_one , data.edge_index, data.adj_norm_sp)
                    #print("train input features don't have  sens ")

                # l1 = D(h_m[data.train_mask], h_mf[data.train_mask]) / 2
                # l2 = D(h_mf[data.train_mask], h_m[data.train_mask]) / 2
                # l1 = ContrastiveLoss(h_m[data.train_mask], h_mf[data.train_mask]) / 2
                # l2 = ContrastiveLoss(h_mf[data.train_mask], h_m[data.train_mask]) / 2
                # l1 = CE(h_m[data.train_mask], h_mf[data.train_mask]) / 2
                # l2 = CE(h_mf[data.train_mask], h_m[data.train_mask]) / 2
                # align_loss_h = args.lambda_align * (l1 + l2)
                
                h_c = torch.cat((h_m, h_g), dim=1)
                h_cf = torch.cat((h_mf, h_g), dim=1)
                h = projector(h_c)
                hf = projector(h_cf)

                # 表征对齐
                align_loss_h = D(h[data.train_mask], hf[data.train_mask])
                
                output = classifier(h)
                outputf = classifier(hf)

                # 更合理的替代版本 预测对齐
                p = torch.sigmoid(output)
                q = torch.sigmoid(outputf)
                align_loss_y = 0.5 * (
                    F.binary_cross_entropy(q, p.detach()) +
                    F.binary_cross_entropy(p, q.detach())
                )

                
                #align_loss_y = F.mse_loss(output[data.train_mask], outputf[data.train_mask])
                #align_loss_y = F.binary_cross_entropy_with_logits(output[data.train_mask], outputf[data.train_mask])
                loss_c1 = F.binary_cross_entropy_with_logits(output[data.train_mask], data.y[data.train_mask].unsqueeze(1).to(args.device))/2
                loss_c2 = F.binary_cross_entropy_with_logits(outputf[data.train_mask], data.y[data.train_mask].unsqueeze(1).to(args.device))/2

                # 总损失
                if(args.flip == 'no'):
                    loss_c = loss_c1
                else:
                    #loss_c = loss_c1 + align_loss_h
                    #loss_c = loss_c1
                    #loss_c = loss_c1 + loss_c2 + args.lambda_align*align_loss_y 
                    loss_c = loss_c1 + loss_c2 + args.lambda_h * align_loss_h + args.lambda_y * align_loss_y 

                loss_c.backward()

                optimizer_em.step()
                optimizer_eg.step()
                optimizer_c.step()
                optimizer_p.step()

            

            # evaluate classifier
            accs, auc_rocs, F1s, tmp_parity, tmp_equality = evaluate(
                data.x, classifier, discriminator, encoder_m, encoder_g, projector, data, args)

            # print(epoch, 'Acc:', accs['test'], 'F1:', F1s['test'],
            #       'Parity:', tmp_parity['test'], 'Equality:', tmp_equality['test'], 'tradeoff:',
            #       auc_rocs['test'] + F1s['test'] + accs['test'] - args.alpha * (
            #                   tmp_parity['test'] + tmp_equality['test']))
            # if auc_rocs['val'] + F1s['val'] + accs['val'] - args.alpha * (
            #         tmp_parity['val'] + tmp_equality['val']) > best_val_tradeoff:

            # 验证集的敏感属性用的是预测的sens
            if auc_rocs['val'] + F1s['val'] + accs['val']  - args.alpha * (
                     tmp_parity['val'] + tmp_equality['val']) > best_val_tradeoff:
                test_acc = accs['test']
                test_auc_roc = auc_rocs['test']
                test_f1 = F1s['test']
                test_parity, test_equality = tmp_parity['test'], tmp_equality['test']
                # print('best_val_tradeoff', epoch)
                best_val_tradeoff = auc_rocs['val'] + F1s['val'] + accs['val'] - args.alpha * (
                            tmp_parity['val'] + tmp_equality['val'])

        acc[count] = test_acc
        f1[count] = test_f1
        auc_roc[count] = test_auc_roc
        parity[count] = test_parity
        equality[count] = test_equality

    return acc, f1, auc_roc, parity, equality




# credit

In [27]:
# 替换 argparse 的部分
import torch
import numpy as np
from itertools import product

# 模拟 argparse.Namespace，手动赋值
class Args:
    dataset = 'credit'
    runs = 5
    epochs = 10
    d_epochs = 5
    c_epochs = 10
    d_lr = 0.01
    d_wd = 0
    c_lr = 0.1
    c_wd = 0.001
    e_lr = 0.1
    e_wd = 0.001
    prop = 'scatter'
    dropout = 0.5
    hidden = 18
    seed = 0
    alpha = 1
    delta = 0.5
    m_epoch = 20
    d = 'no'
    have_s = 'preds' # preds , yes-with s
    jl = 'Kmeans' #  Kmeans
    input_x = 'relate' 
    lambda_h = 4 
    lambda_y = 1
    m_lr = 0.1
    flip = 'yes' 
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

args = Args()

# 获取数据
data, args.sens_idx, args.x_min, args.x_max = get_dataset(args.dataset)

# # 参数搜索范围
# lambda_y_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1]
# lambda_h_list = [0.01, 0.02, 0.04, 0.06, 0.08,0.1]
# alpha_list = [0, 0.2, 0.4, 0.6, 0.8, 1 ]
# for lambda_y_val,lambda_h_val in product(lambda_y_list,lambda_h_list):
    
# 设置当前的lambda_align值
# args.lambda_y = lambda_y_val
# args.lambda_h = lambda_h_val

if args.have_s == 'nos':
    args.num_features, args.num_classes = data.x_no_s.shape[1], 2 - 1  # binary classes are 0,1
    print("numfeatures:",args.num_features)
elif args.have_s == 'yes':
    args.num_features, args.num_classes = data.x.shape[1], 2 - 1  # binary classes are 0,1
else :  # hava_s == 'preds'
    
    data.pred_s = get_pseudo_sens(data, args)
    
    # 替换敏感属性列
    modified_x = data.x.clone()
    modified_x[:, args.sens_idx] = data.pred_s  # 确保类型匹配
    # 更新data对象中的x
    data.x = modified_x
    
    args.num_features, args.num_classes = data.x.shape[1], 2 - 1  # binary classes are 0,1

# 运行主程序
acc, f1, auc_roc, parity, equality = run(data, args)

# 输出结果
print('======' + args.dataset + '======')
print(f"lambda_y={args.lambda_y},lambda_y={args.lambda_h},alpha={args.alpha}, delta={args.delta}, c_lr={args.c_lr}, e_lr={args.e_lr} ")
print('f1:', round(np.mean(f1) * 100, 2), '±', round(np.std(f1) * 100, 2), sep='')
print('Auc:', round(np.mean(auc_roc) * 100, 2), '±', round(np.std(auc_roc) * 100, 2), sep='')
print('Acc:', round(np.mean(acc) * 100, 2), '±', round(np.std(acc) * 100, 2), sep='')
print('parity:', round(np.mean(parity) * 100, 2), '±', round(np.std(parity) * 100, 2), sep='')
print('equality:', round(np.mean(equality) * 100, 2), '±', round(np.std(equality) * 100, 2), sep='')


Credit 数据集特征： ['Married', 'EducationLevel', 'MaxBillAmountOverLast6Months', 'MaxPaymentAmountOverLast6Months', 'MonthsWithZeroBalanceOverLast6Months', 'MonthsWithLowSpendingOverLast6Months', 'MonthsWithHighSpendingOverLast6Months', 'MostRecentBillAmount', 'MostRecentPaymentAmount', 'TotalOverdueCounts', 'TotalMonthsOverdue', 'HistoryOfOverduePayments']
predict_attr: Counter({1: 23364, 0: 6636})
sens_attr: Counter({0: 27315, 1: 2685})
===credit ===
KMeans 预测准确率: 0.8707666666666667
KMeans 伪s分布: Counter({np.int64(0): 28736, np.int64(1): 1264})


100%|██████████| 5/5 [00:11<00:00,  2.27s/run]

lambda_y=1,lambda_y=4,alpha=1, delta=0.5, c_lr=0.1, e_lr=0.1 
f1:86.53±1.6
Auc:71.12±2.15
Acc:77.66±1.14
parity:5.24±4.47
equality:3.95±3.67



