In [42]:
!pip install opendp



- Read the data 

In [33]:
import pandas as pd

In [43]:
# Columns that must be stored with the pandas 'category' dtype.
columns_to_convert = [
    'GEO_PRV', 'GEODGHR4', 'DHH_SEX', 'DHHGMS', 'DHHGAGE',
    'GEN_005', 'GEN_015', 'GEN_020', 'GEN_025',
    'SMK_005', 'SMK_015', 'SMK_020', 'SMK_030',
]

# Read the cleaned dataset and cast the listed columns in one expression;
# every other column keeps the dtype inferred by read_csv.
df = pd.read_csv('cleaned_data.csv').astype(
    {column_name: 'category' for column_name in columns_to_convert}
)

- Local DP 

In [35]:
# Opt in to OpenDP's "contrib" feature flag before any measurement is built.
# NOTE(review): per the OpenDP documentation this flag unlocks components that have not
# completed its vetting process; presumably the make_* constructors used later in this
# notebook require it -- confirm.
from opendp.mod import enable_features
enable_features("contrib")

In [46]:
from typing import List, Union

import numpy as np
import opendp
from opendp.domains import vector_domain, atom_domain
from opendp.measurements import make_base_discrete_laplace, \
                                make_base_discrete_gaussian, \
                                make_base_gaussian, \
                                make_base_laplace, \
                                make_randomized_response_bool, \
                                make_randomized_response
from opendp.metrics import absolute_distance, l1_distance, l2_distance

class LocalDifferentialPrivacy:
    """Thin wrapper around OpenDP measurements for local differential privacy.

    Numeric values are perturbed with (discrete) Laplace noise of scale
    ``scale``; boolean and categorical values are perturbed with randomized
    response that keeps the true answer with probability ``probability``.
    """

    # This class exposes the plain strings 'float' / 'int' in its API. They are
    # translated to the Python types before being handed to OpenDP as ``T``,
    # because Python types (or Rust names such as 'f64') are the spellings
    # OpenDP documents for type arguments. Any other value is passed through
    # unchanged so callers may still supply e.g. 'f32' directly.
    _TYPE_ALIASES = {'float': float, 'int': int}

    def __init__(self, scale, probability):
        self.scale = scale  # for laplace noise
        self.probability = probability  # for randomized response

    def define_input_space(self, input_space_type: str = 'vector', variable_type: str = 'float'):
        """
        Define the input space for measurements.

        Parameters:
        - input_space_type: Type of input space ('scalar' or 'vector').
        - variable_type: Type of variable ('float', 'int', etc.).
        Returns:
        - input_space: Tuple ``(domain, metric)`` specifying the input space.
        Raises:
        - ValueError: if ``input_space_type`` is neither 'scalar' nor 'vector'.
        """
        atom_type = self._TYPE_ALIASES.get(variable_type, variable_type)

        if input_space_type == 'scalar':
            # A scalar domain is paired with the absolute distance; the L1
            # distance is only defined for the vector domain handled below.
            return atom_domain(T=atom_type), absolute_distance(T=atom_type)
        if input_space_type == 'vector':
            return vector_domain(atom_domain(T=atom_type)), l1_distance(T=atom_type)
        raise ValueError("Unsupported input type")

    def add_laplace_noise(self, input_space: tuple,
                          value: Union[float, int, List[Union[float, int]]],
                          variable_type='float'):
        """
        Add Laplace noise to the input value.

        Parameters:
        - input_space: ``(domain, metric)`` tuple as returned by
          ``define_input_space``; it is unpacked into the OpenDP constructor.
        - value: The true value (scalar space) or list of true values (vector
          space) to which noise is added.
        - variable_type: 'int' selects the discrete Laplace mechanism, 'float'
          the continuous one.

        Returns:
        - noisy_value: The noisy value after adding Laplace noise.
        Raises:
        - ValueError: if ``variable_type`` is neither 'int' nor 'float'.
        """
        if variable_type == 'int':
            make_measurement = make_base_discrete_laplace
        elif variable_type == 'float':
            make_measurement = make_base_laplace
        else:
            raise ValueError("Unsupported variable type")

        measurement = make_measurement(*input_space, scale=self.scale)
        return measurement(value)

    def randomized_response(self, true_value: List[Union[str, float, int]], variable_type: str = 'bool',
                            categories: Union[list, None] = None):
        """
        Function to add local noise depending on the type of variable
        Parameters:
            - true_value: list of values into which noise is injected
            - variable_type: type of variable [Possible options: bool, categorical]
            - categories: optional full list of admissible categories (only used
              for 'categorical'). When omitted, the categories are the distinct
              values of ``true_value`` in first-seen order, which means a
              category that never occurs in the data can never be reported.
        Returns:
            - noisy_value: list with the noise injected (empty for empty input)
        Raises:
            - ValueError: if ``variable_type`` is not 'bool' or 'categorical'.
        """
        if variable_type not in ('bool', 'categorical'):
            raise ValueError("Unsupported variable type")

        values = list(true_value)
        if not values:
            # Nothing to perturb; also avoids building a measurement over an
            # empty category set.
            return []

        if variable_type == 'bool':
            rr_measure = make_randomized_response_bool(prob=self.probability)
        else:
            if categories is None:
                # dict.fromkeys de-duplicates while keeping insertion order, so
                # the category list is reproducible (unlike iterating a set).
                categories = list(dict.fromkeys(values))
            rr_measure = make_randomized_response(list(categories), prob=self.probability)

        # The measurement is applied independently to every record.
        return [rr_measure(value) for value in values]

In [None]:
- Apply Local DP to the data

In [37]:
from tqdm import tqdm
# Hook tqdm into pandas; the `df.progress_apply(...)` call further down relies on this.
tqdm.pandas()

def apply_local_dp(column, scale=1.2, probability=0.50):
    """
    Apply local differential privacy to a given column of data based on its data type.

    Dispatch by dtype:
    - category -> randomized response over the column's values
    - bool     -> boolean randomized response
    - integer  -> discrete Laplace noise
    - float    -> Laplace noise
    - anything else is returned untouched.

    Parameters:
    - column: A pandas Series representing the data column.
    - scale: Laplace noise scale passed to LocalDifferentialPrivacy (default 1.2).
    - probability: Randomized-response probability passed to
      LocalDifferentialPrivacy (default 0.50).

    Returns:
    - noisy_result: The column with local differential privacy applied (a list
      produced by the mechanism), or the original Series for unsupported dtypes.
    """
    privacy = LocalDifferentialPrivacy(scale, probability)
    dtype = column.dtype

    if dtype == 'category':
        return privacy.randomized_response(column.tolist(), variable_type='categorical')
    if pd.api.types.is_bool_dtype(dtype):
        return privacy.randomized_response(column.tolist())

    # Use the pandas dtype predicates instead of comparing against 'int' /
    # 'float': those strings only match the platform-default width, so e.g.
    # int32 or float32 columns would silently be returned without any noise.
    if pd.api.types.is_integer_dtype(dtype):
        variable_type = 'int'
    elif pd.api.types.is_float_dtype(dtype):
        variable_type = 'float'
    else:
        return column

    input_space = privacy.define_input_space(input_space_type='vector', variable_type=variable_type)
    noisy_result = privacy.add_laplace_noise(input_space, column.tolist(), variable_type=variable_type)
    return noisy_result

# Run apply_local_dp over the frame with a tqdm progress bar (DataFrame.apply's default
# is column-wise, so each call receives one column as a Series).
df_transformed = df.progress_apply(apply_local_dp)

100%|██████████| 16/16 [02:54<00:00, 10.88s/it]
