In [2]:
import numpy as np
import pandas as pd

def calculate_entropy(df, quasi_identifiers):
    """Shannon entropy (in bits) of the equivalence-class distribution.

    Records are grouped into equivalence classes by their combination of
    quasi-identifier values; the entropy is taken over the share of records
    that falls into each class.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to evaluate.
    quasi_identifiers : str or list of str
        Column label(s) that define the equivalence classes.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the non-empty classes, or ``0.0`` when
        ``df`` has no rows.
    """
    total_records = len(df)
    if total_records == 0:
        # No records means no distribution to measure.
        return 0.0

    # dropna=False: rows with missing quasi-identifiers (e.g. suppressed
    # values) form their own classes. With the pandas default they are left
    # out of the counts but still included in len(df), so the probabilities
    # would no longer sum to 1.
    eq_class_counts = df.groupby(quasi_identifiers, dropna=False).size()
    # Categorical keys can produce zero-sized groups; they carry no
    # probability mass and would otherwise evaluate log2(0).
    eq_class_counts = eq_class_counts[eq_class_counts > 0]

    probabilities = eq_class_counts / total_records
    entropy = -float((probabilities * np.log2(probabilities)).sum())
    # Adding 0.0 turns the -0.0 of a single-class dataset into plain 0.0.
    return entropy + 0.0

def calculate_granularity(df, quasi_identifiers):
    """Average number of records per equivalence class.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to evaluate.
    quasi_identifiers : str or list of str
        Column label(s) that define the equivalence classes.

    Returns
    -------
    float
        ``len(df)`` divided by the number of non-empty equivalence classes,
        or ``0.0`` when ``df`` has no rows (instead of dividing by zero).
    """
    total_records = len(df)
    if total_records == 0:
        return 0.0

    # dropna=False keeps rows with missing quasi-identifiers (e.g. suppressed
    # values) as classes of their own, so the class count covers the same
    # records as total_records.
    eq_class_counts = df.groupby(quasi_identifiers, dropna=False).size()
    # Count only classes that actually contain records; categorical keys can
    # yield zero-sized groups that would inflate the denominator.
    num_classes = int((eq_class_counts > 0).sum())
    return total_records / num_classes

def information_loss(original_df, anonymized_df, quasi_identifiers):
    """Share of quasi-identifier values changed by anonymization.

    Number of records changed: a useful statistic is the number of values
    changed per variable. Changes include suppressions (a value replaced by
    a missing value) and give a good indication of the impact of the
    anonymization methods on the data.
    Reference: https://sdcpractice.readthedocs.io/en/latest/utility.html

    Parameters
    ----------
    original_df : pandas.DataFrame
        Data before anonymization.
    anonymized_df : pandas.DataFrame
        Data after anonymization; compared cell by cell with
        ``original_df``, so it needs the same row and column labels
        (pandas raises ``ValueError`` on the comparison otherwise).
    quasi_identifiers : str or list of str
        Column label(s) to compare.

    Returns
    -------
    float
        Changed cells divided by all compared cells, or ``0.0`` when there
        are no cells to compare.
    """
    original_qi = original_df[quasi_identifiers]
    anonymized_qi = anonymized_df[quasi_identifiers]

    differs = (original_qi != anonymized_qi).to_numpy()
    # NaN != NaN evaluates to True, so a value that is missing in both frames
    # would be reported as changed; mask those cells out explicitly.
    both_missing = (original_qi.isna() & anonymized_qi.isna()).to_numpy()
    changes = int((differs & ~both_missing).sum())

    total_values = original_qi.size
    if total_values == 0:
        # Nothing was compared; avoid the 0 / 0 division.
        return 0.0
    return changes / total_values

"""
IL1s information loss measure: only for continuous variables
https://sdcpractice.readthedocs.io/en/latest/utility.html 
import numpy as np
import pandas as pd

def calculate_IL1s(original_df, anonymized_df, continuous_columns):
    n = len(original_df)  # Number of records
    p = len(continuous_columns)  # Number of continuous variables

    total_loss = 0
    for col in continuous_columns:
        std_dev = original_df[col].std()  
        if std_dev == 0:
            print("Warning: std_dev is zero")  

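        # Sum of absolute differences between original and anonymized values.
        # NOTE: the IL1s definition scales each difference by sqrt(2) * S_j, i.e.
        # np.sqrt(2) * std_dev; np.sqrt(2 * std_dev) as written in the next line
        # is a different quantity and should be corrected before enabling this code.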
        col_loss = np.sum(np.abs(original_df[col] - anonymized_df[col])) / np.sqrt(2 * std_dev)
        total_loss += col_loss

    return (1 / (p * n)) * total_loss if p * n > 0 else 0  
"""

"""
Example: I don't have the anonymized data yet, but this is how it should work for entropy and granularity (I think; I haven't tested it yet).

quasi_identifiers = ['QI1', 'QI2', 'QI3']  # Adjust based on your dataset

# I'm not sure if we need the calculations for the original dataset (maybe for comparison)
original_entropy = calculate_entropy(original_df, quasi_identifiers)
original_granularity = calculate_granularity(original_df, quasi_identifiers)

# For anonymized dataset
anonymized_entropy = calculate_entropy(anonymized_df, quasi_identifiers)
anonymized_granularity = calculate_granularity(anonymized_df, quasi_identifiers)

# Print results
print(f"Original Entropy: {original_entropy:.4f}")
print(f"Anonymized Entropy: {anonymized_entropy:.4f}")
print(f"Increase in Entropy: {anonymized_entropy - original_entropy:.4f}\n")

print(f"Original Granularity: {original_granularity:.4f}")
print(f"Anonymized Granularity: {anonymized_granularity:.4f}")
print(f"Decrease in Granularity: {original_granularity - anonymized_granularity:.4f}")
"""

'\nExample: I don\'t have the anonymized data yet, but this is how it should work for entropy and granularity (I think, I havn\'t tested it yet)\n\nquasi_identifiers = [\'QI1\', \'QI2\', \'QI3\']  # Adjust based on your dataset\n\n# I\'m not sure if we need the calculations for the original dataset (maybe for comparison)\noriginal_entropy = calculate_entropy(original_df, quasi_identifiers)\noriginal_granularity = calculate_granularity(original_df, quasi_identifiers)\n\n# For anonymized dataset\nanonymized_entropy = calculate_entropy(anonymized_df, quasi_identifiers)\nanonymized_granularity = calculate_granularity(anonymized_df, quasi_identifiers)\n\n# Print results\nprint(f"Original Entropy: {original_entropy:.4f}")\nprint(f"Anonymized Entropy: {anonymized_entropy:.4f}")\nprint(f"Increase in Entropy: {anonymized_entropy - original_entropy:.4f}\n")\n\nprint(f"Original Granularity: {original_granularity:.4f}")\nprint(f"Anonymized Granularity: {anonymized_granularity:.4f}")\nprint(f"Decre

In [9]:
######################
### CREDIT DEFAULT ###
######################

# Quasi-identifier columns used for both transformation variants.
quasi_identifiers = ['LIMIT_BAL', 'EDUCATION', 'BILL_AMT1', 'PAY_AMT1']

# (report heading, CSV file) for each transformation variant, in print order.
ccd_variants = (
    ('Global Transformation', 'data/CCD_train_GT.csv'),
    ('Local Transformation', 'data/CCD_train_LT.csv'),
)

for variant_no, (heading, csv_path) in enumerate(ccd_variants):
    # A blank line separates each later report from the previous one.
    if variant_no > 0:
        print('')
    print(heading)
    print('')

    # Load dataset
    CCD_train = pd.read_csv(csv_path, sep=",")

    # Entropy
    entropy = calculate_entropy(CCD_train, quasi_identifiers)
    print("Entropy:", entropy)

    # Granularity
    granularity = calculate_granularity(CCD_train, quasi_identifiers)
    print("Granularity:", granularity)

Global Transformation

Entropy: 5.676970518353784
Granularity: 329.8611111111111

Local Transformation

Entropy: 10.366108971551206
Granularity: 8.996212121212121


In [11]:
#####################
### CENSUS INCOME ###
#####################

# Quasi-identifier columns used for both transformation variants.
quasi_identifiers = ['age', 'marital_stat', 'birth_country_mother', 'education']

# (report heading, CSV file) for each transformation variant, in print order.
kdd_variants = (
    ('Global Transformation', 'data/KDD_train_GT.csv'),
    ('Local Transformation', 'data/KDD_train_LT.csv'),
)

for variant_no, (heading, csv_path) in enumerate(kdd_variants):
    # A blank line separates each later report from the previous one.
    if variant_no > 0:
        print('')
    print(heading)
    print('')

    # Load dataset
    KDD_train = pd.read_csv(csv_path, sep=",")

    # Entropy
    entropy = calculate_entropy(KDD_train, quasi_identifiers)
    print("Entropy:", entropy)

    # Granularity
    granularity = calculate_granularity(KDD_train, quasi_identifiers)
    print("Granularity:", granularity)

Global Transformation

Entropy: 5.262647227928238
Granularity: 1808.2833333333333

Local Transformation

Entropy: 11.443320930352215
Granularity: 3.4214758751182592


In [12]:
###################
### PIMA INDIAN ###
###################

# Quasi-identifier columns used for both transformation variants.
quasi_identifiers = ['Age', 'BloodPressure', 'Insulin']

# (report heading, CSV file) for each transformation variant, in print order.
pid_variants = (
    ('Global Transformation', 'data/PID_train_GT.csv'),
    ('Local Transformation', 'data/PID_train_LT.csv'),
)

for variant_no, (heading, csv_path) in enumerate(pid_variants):
    # A blank line separates each later report from the previous one.
    if variant_no > 0:
        print('')
    print(heading)
    print('')

    # Load dataset
    PID_train = pd.read_csv(csv_path, sep=",")

    # Entropy
    entropy = calculate_entropy(PID_train, quasi_identifiers)
    print("Entropy:", entropy)

    # Granularity
    granularity = calculate_granularity(PID_train, quasi_identifiers)
    print("Granularity:", granularity)

Global Transformation

Entropy: 3.832967871991527
Granularity: 37.25

Local Transformation

Entropy: 6.056525146541469
Granularity: 8.895522388059701
