In [487]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import tensorflow as tf
from tensorflow import keras
import torch.nn as nn
import torch.nn.functional as F

In [488]:
!pip install torch
!pip install torch torchvision



In [489]:
# Load the records that the anonymization steps below operate on.
# Note that this reads the *test* CSV, despite the generic variable name.
adult_data = pd.read_csv(filepath_or_buffer="adult_test.csv")

In [490]:
def is_k_anonymous(subset, k):
    """Tell whether a group of records is large enough to be k-anonymous.

    Parameters
    ----------
    subset : sized collection
        Anything supporting ``len()`` (e.g. a DataFrame partition).
    k : int
        Minimum number of records a group must contain.

    Returns
    -------
    bool
        True when ``subset`` holds at least ``k`` records.
    """
    group_size = len(subset)
    return k <= group_size

In [491]:
def calculate_range(data, feature):
    """Return the spread (max - min) of one column, or -inf if it is not numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column.
    feature : hashable
        Label of the column to measure.

    Returns
    -------
    scalar
        ``max - min`` for integer and floating-point columns of any width
        (int32, int64, float32, nullable Int64, ...). Every other dtype
        (strings, categoricals, booleans, datetimes) yields ``-np.inf`` so
        that such a column can never win the "largest range" comparison.
    """
    column = data[feature]
    # Use pandas' dtype predicates rather than comparing against a fixed list
    # of numpy types: the old list silently ignored int32/float32 columns.
    if pd.api.types.is_integer_dtype(column) or pd.api.types.is_float_dtype(column):
        return column.max() - column.min()
    return -np.inf


def split_on_feature(data, k):
    """Cut a partition in two at the median of its widest numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Partition to split.
    k : int
        Not used by this function; kept so existing callers keep working.
        Checking the size of the two halves is left to the caller.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Rows strictly below the median and rows at or above it. When the
        frame has no numeric column (or every range is undefined), returns
        ``(data, pd.DataFrame())`` — the input itself and an empty frame.

    Notes
    -----
    Rows whose value in the chosen column is NaN satisfy neither comparison
    and therefore appear in neither half.
    """
    split_feature = None
    max_range = -float("inf")

    # Pick the column with the largest value range. Non-numeric columns report
    # -inf and NaN ranges are skipped, so only a numeric column can be chosen.
    for feature in data.columns:
        feature_range = calculate_range(data, feature)
        if pd.notna(feature_range) and feature_range > max_range:
            max_range = feature_range
            split_feature = feature

    # Nothing to split on.
    if split_feature is None:
        return data, pd.DataFrame()

    # The chosen column is always numeric here, so a median split is always
    # valid. The former categorical fallback could only be reached when the
    # platform-dependent `dtype in [int, float]` test disagreed with
    # calculate_range, which was a bug.
    median_value = data[split_feature].median()
    subset1 = data[data[split_feature] < median_value]
    subset2 = data[data[split_feature] >= median_value]
    return subset1, subset2


In [492]:
def anonymize_feature(data, feature):
    """Generalize a numeric column to a single ``"min-max"`` range label.

    Parameters
    ----------
    data : pandas.DataFrame
        Partition to generalize. The column is overwritten in place.
    feature : hashable
        Label of the column to generalize.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. For an integer or floating-point column of
        any width, every row now holds the string ``f"{min}-{max}"``; any
        other column is left untouched.
    """
    column = data[feature]
    # pandas' dtype predicates are used instead of `dtype in [int, float]`,
    # whose result depends on the platform's default integer width and which
    # skipped int32/float32 columns, leaving them un-anonymized.
    if pd.api.types.is_integer_dtype(column) or pd.api.types.is_float_dtype(column):
        min_val = column.min()
        max_val = column.max()
        data[feature] = f"{min_val}-{max_val}"
    return data

In [493]:
def mondrian_k_anonymization(data, k):
    """Partition a table Mondrian-style and generalize each final partition.

    Partitions are processed breadth-first. Each one is split with
    ``split_on_feature``; if either half would hold fewer than ``k`` rows,
    the partition is kept whole and every column is passed through
    ``anonymize_feature``. Otherwise both halves are queued for further
    splitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to anonymize. It is not modified.
    k : int
        Minimum number of rows per final partition; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        All generalized partitions concatenated with a fresh 0..n-1 index.
        Rows are grouped by partition, so their order generally differs from
        the order in ``data``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (the splitting loop would never terminate).
    """
    from collections import deque  # local import keeps this cell self-contained

    if k < 1:
        raise ValueError("k must be at least 1")

    queue = deque([data])  # deque gives O(1) pops from the left
    results = []

    while queue:
        partition = queue.popleft()

        subset1, subset2 = split_on_feature(partition, k)

        # If either half would be smaller than k, stop splitting and keep the
        # current partition as one equivalence class.
        if len(subset1) < k or len(subset2) < k:
            # Work on a copy: `partition` is either the caller's frame or a
            # slice of it, and anonymize_feature overwrites columns in place.
            generalized = partition.copy()
            for feature in generalized.columns:
                generalized = anonymize_feature(generalized, feature)
            results.append(generalized)
        else:
            # Both halves satisfy k-anonymity, so keep splitting them.
            queue.append(subset1)
            queue.append(subset2)

    return pd.concat(results, axis=0, ignore_index=True)

# Anonymize the loaded records so that every partition holds at least 50 rows.
anonymized_data = mondrian_k_anonymization(data=adult_data, k=50)

In [494]:
def revert_range_to_mean(data, feature):
    """Replace ``"min-max"`` range labels in a column by the range midpoint.

    The value written is ``0.5 * (min + max)``, i.e. the midpoint of the
    range. Despite the function name it is not the mean of the values the
    range was built from, which cannot be recovered from the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. The column is overwritten in place.
    feature : hashable
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. The column is converted to float only when
        every non-missing value is a ``"<number>-<number>"`` label; bounds
        may be negative, e.g. ``"-5-10"`` or ``"-5--3"``. Otherwise the
        column is left unchanged (numeric columns, empty columns, and
        categorical text such as ``"Self-emp-inc"``).
    """
    values = data[feature]

    # A numeric column holds no labels; a "-" in its text is only a minus
    # sign. An empty column has nothing to convert.
    if values.empty or pd.api.types.is_numeric_dtype(values):
        return data

    # One bound: optional sign, then a decimal number (optionally with an
    # exponent) or inf/nan. The separator is the first "-" that follows a
    # complete number, so negative bounds are parsed correctly.
    number = r"[+-]?(?:(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?|inf(?:inity)?|nan)"
    bounds = values.astype(str).str.extract(rf"(?i)^\s*({number})-({number})\s*$")

    # Check every row, not just the first: convert only if all non-missing
    # values are ranges and at least one range is present.
    is_range = bounds.notna().all(axis=1)
    if not is_range.any() or not (is_range | values.isna()).all():
        return data

    data[feature] = 0.5 * (bounds[0].astype(float) + bounds[1].astype(float))
    return data


In [495]:
# Convert every column that holds range labels back to numbers; columns that
# are not range-encoded are returned unchanged by revert_range_to_mean.
columns_to_check = anonymized_data.columns
for column in columns_to_check:
    anonymized_data = revert_range_to_mean(data=anonymized_data, feature=column)

In [496]:
# Step 1.1: Read the training split from disk
train_data = pd.read_csv('adult_data.csv')

# Step 1.2: Integer-encode every string column (features and target alike),
# keeping the fitted encoder of each column for reuse on the evaluation data
label_encoders = {}

for column in train_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    label_encoders[column] = le
    train_data[column] = le.fit_transform(train_data[column])

# Encode the anonymized frame with the encoders fitted on the training data;
# string columns without a fitted encoder are left as they are
for column in anonymized_data.select_dtypes(include=['object']).columns:
    if column not in label_encoders:
        continue
    le = label_encoders[column]
    anonymized_data[column] = le.transform(anonymized_data[column])

# Step 1.3: Separate the target column from the features
y_train = train_data['income']
X_train = train_data.drop(columns='income')

# Step 1.4: Standardize the features; the scaler is fitted on the training
# features only and then applied to the anonymized evaluation features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
y_test = anonymized_data['income']
X_test = scaler.transform(anonymized_data.drop(columns='income'))

# Wrap everything in PyTorch tensors (float32 features, int64 class labels)
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
X_test_tensor = torch.tensor(X_test, dtype=torch.float)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [497]:

# Step 2.1: Network definition
class IncomeClassifier(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> 2 with ReLU.

    The forward pass returns raw logits (no softmax is applied).
    """

    def __init__(self, input_dim):
        """Create the three linear layers.

        Parameters
        ----------
        input_dim : int
            Number of input features per sample.
        """
        super().__init__()
        # Two hidden layers followed by the output layer
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        # Output layer: one logit per class (<=50K and >50K)
        self.fc3 = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        """Map a batch of feature rows to a batch of two-class logits."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Build the model; its input width is the number of feature columns
input_dim = X_train_tensor.size(1)
model = IncomeClassifier(input_dim=input_dim)

In [498]:
# Step 3.1: Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Step 3.2: Training parameters
epochs, batch_size = 10, 64

# Step 3.3: Training loop. Mini-batches are consecutive slices of the training
# tensors, visited in the same order every epoch (no shuffling).
for epoch in range(epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        # Rows i .. i+batch_size-1 (the last batch may be shorter)
        batch = slice(i, i + batch_size)
        inputs, labels = X_train_tensor[batch], y_train_tensor[batch]

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The value reported is the loss of the last mini-batch of the epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

print("Training completed.")

Epoch [1/10], Loss: 0.2880
Epoch [2/10], Loss: 0.2706
Epoch [3/10], Loss: 0.2588
Epoch [4/10], Loss: 0.2496
Epoch [5/10], Loss: 0.2459
Epoch [6/10], Loss: 0.2443
Epoch [7/10], Loss: 0.2426
Epoch [8/10], Loss: 0.2401
Epoch [9/10], Loss: 0.2368
Epoch [10/10], Loss: 0.2340
Training completed.


In [499]:
# Step 4.1: Run the trained model on the evaluation tensors.
# Gradients are not needed for inference, so tracking is switched off.
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Predicted class = index of the largest logit in each row
    _, predicted = test_outputs.max(dim=1)

In [500]:
# Move predictions and ground truth to numpy for the metric computations
y_test_np = y_test_tensor.numpy()
predicted_np = predicted.numpy()

# Accuracy: fraction of samples whose predicted class equals the true class
accuracy = (y_test_np == predicted_np).sum() / y_test_np.shape[0]

# Misclassification error is the complement of accuracy
misclassification_error = 1 - accuracy

# Precision and recall of class 1
precision = precision_score(y_true=y_test_np, y_pred=predicted_np)
recall = recall_score(y_true=y_test_np, y_pred=predicted_np)

# AUC is computed from the softmax probability assigned to class 1
probabilities = torch.softmax(test_outputs, dim=1)
prob_pos_class = probabilities[:, 1].numpy()
auc = roc_auc_score(y_true=y_test_np, y_score=prob_pos_class)

print(f"Misclassification Error: {misclassification_error:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUC: {auc:.4f}")

Misclassification Error: 0.7617
Accuracy: 0.2383
Precision: 0.2367
Recall: 0.9995
AUC: 0.4898


In [501]:
# Step 1.1: Read the training and test splits from disk
train_data = pd.read_csv('adult_data.csv')
test_data = pd.read_csv('adult_test.csv')

# Step 1.2: Integer-encode every string column (features and target alike).
# Each encoder is fitted on the training column and reused on the test column.
label_encoders = {}

for column in train_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    label_encoders[column] = le
    train_data[column] = le.fit_transform(train_data[column])
    test_data[column] = le.transform(test_data[column])

# Step 1.3: Separate the target column from the features
y_train = train_data['income']
y_test = test_data['income']
X_train = train_data.drop(columns='income')
X_test = test_data.drop(columns='income')

# Step 1.4: Standardize the features; the scaler is fitted on the training
# features only and then applied to the test features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Wrap everything in PyTorch tensors (float32 features, int64 class labels)
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
X_test_tensor = torch.tensor(X_test, dtype=torch.float)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [502]:

# Step 2.1: Network definition
# NOTE(review): this repeats the IncomeClassifier defined in an earlier cell;
# the later definition shadows the earlier one.
class IncomeClassifier(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> 2 with ReLU.

    The forward pass returns raw logits (no softmax is applied).
    """

    def __init__(self, input_dim):
        """Create the three linear layers.

        Parameters
        ----------
        input_dim : int
            Number of input features per sample.
        """
        super().__init__()
        # Two hidden layers followed by the output layer
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        # Output layer: one logit per class (<=50K and >50K)
        self.fc3 = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        """Map a batch of feature rows to a batch of two-class logits."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Build a fresh model; its input width is the number of feature columns
input_dim = X_train_tensor.size(1)
model = IncomeClassifier(input_dim=input_dim)

In [503]:
# Step 3.1: Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Step 3.2: Training parameters
epochs, batch_size = 10, 64

# Step 3.3: Training loop. Mini-batches are consecutive slices of the training
# tensors, visited in the same order every epoch (no shuffling).
for epoch in range(epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        # Rows i .. i+batch_size-1 (the last batch may be shorter)
        batch = slice(i, i + batch_size)
        inputs, labels = X_train_tensor[batch], y_train_tensor[batch]

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The value reported is the loss of the last mini-batch of the epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

print("Training completed.")

Epoch [1/10], Loss: 0.2890
Epoch [2/10], Loss: 0.2693
Epoch [3/10], Loss: 0.2562
Epoch [4/10], Loss: 0.2501
Epoch [5/10], Loss: 0.2432
Epoch [6/10], Loss: 0.2355
Epoch [7/10], Loss: 0.2328
Epoch [8/10], Loss: 0.2288
Epoch [9/10], Loss: 0.2257
Epoch [10/10], Loss: 0.2240
Training completed.


In [504]:
# Step 4.1: Run the trained model on the test tensors.
# Gradients are not needed for inference, so tracking is switched off.
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    # Predicted class = index of the largest logit in each row
    _, predicted = test_outputs.max(dim=1)

In [505]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix

# Move predictions and ground truth to numpy for the metric computations
y_test_np = y_test_tensor.numpy()
predicted_np = predicted.numpy()

# Accuracy: fraction of samples whose predicted class equals the true class
accuracy = (y_test_np == predicted_np).sum() / y_test_np.shape[0]

# Misclassification error is the complement of accuracy
misclassification_error = 1 - accuracy

# Precision and recall of class 1
precision = precision_score(y_true=y_test_np, y_pred=predicted_np)
recall = recall_score(y_true=y_test_np, y_pred=predicted_np)

# AUC is computed from the softmax probability assigned to class 1
probabilities = torch.softmax(test_outputs, dim=1)
prob_pos_class = probabilities[:, 1].numpy()
auc = roc_auc_score(y_true=y_test_np, y_score=prob_pos_class)

print(f"Misclassification Error: {misclassification_error:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUC: {auc:.4f}")

Misclassification Error: 0.1507
Accuracy: 0.8493
Precision: 0.7130
Recall: 0.6058
AUC: 0.9043
