In [None]:
!pip install opendp

- Read the data 

In [1]:
import pandas as pd

In [2]:
# Columns that are stored with the pandas 'category' dtype instead of as numbers.
columns_to_convert = [
    'GEO_PRV', 'GEODGHR4', 'DHH_SEX', 'DHHGMS', 'DHHGAGE', 'GEN_005',
    'GEN_015', 'GEN_020', 'GEN_025', 'SMK_005', 'SMK_015', 'SMK_020',
    'SMK_030',
]

# Read the cleaned dataset and cast the listed columns in one chained step.
df = pd.read_csv('cleaned_data.csv').astype(
    {name: 'category' for name in columns_to_convert}
)

- Local differential privacy (local DP)

In [3]:
from opendp.mod import enable_features
# Opt in to OpenDP's "contrib" feature flag before any measurement is built.
# NOTE(review): the make_* constructors used below presumably require this flag —
# confirm against the installed OpenDP version.
enable_features("contrib")

In [4]:
from typing import List, Union

import numpy as np
import opendp
from opendp.domains import atom_domain, vector_domain
from opendp.measurements import (
    make_base_discrete_gaussian,
    make_base_discrete_laplace,
    make_base_gaussian,
    make_base_laplace,
    make_randomized_response,
    make_randomized_response_bool,
)
from opendp.metrics import absolute_distance, l1_distance, l2_distance

class LocalDifferentialPrivacy:
    """
    Local differential privacy helpers built on OpenDP measurements.

    Numeric values are perturbed with (discrete) Laplace noise of a fixed
    scale; boolean and categorical values are perturbed with randomized
    response using a fixed probability.

    Attributes:
    - scale: Noise scale handed to the Laplace measurements.
    - probability: `prob` argument handed to the randomized-response measurements.
    """

    def __init__(self, scale, probability):
        self.scale = scale  # for laplace noise
        self.probability = probability  # for randomized response

    def define_input_space(self, input_space_type: str = 'vector', variable_type: str = 'float') -> tuple:
        """
        Define the input space for measurements.

        Parameters:
        - input_space_type: Type of input space ('scalar' or 'vector').
        - variable_type: Type of variable ('float', 'int', etc.).
        Returns:
        - input_space: (domain, metric) tuple specifying the input space.
        Raises:
        - ValueError: If input_space_type is neither 'scalar' nor 'vector'.
        """
        if input_space_type == 'scalar':
            # A single atom is paired with the absolute distance; the L1 distance
            # is the metric for vectors and is not accepted for a scalar domain.
            return atom_domain(T=variable_type), absolute_distance(T=variable_type)
        if input_space_type == 'vector':
            return vector_domain(atom_domain(T=variable_type)), l1_distance(T=variable_type)
        raise ValueError("Unsupported input type")

    def add_laplace_noise(self, input_space: tuple,
                          value: Union[int, float, List[int], List[float]],
                          variable_type: str = 'float'):
        """
        Add Laplace noise to the input value.

        Parameters:
        - input_space: (domain, metric) tuple as returned by define_input_space.
        - value: The true value (scalar or list, matching input_space) to which noise is added.
        - variable_type: 'int' for discrete Laplace noise, 'float' for continuous Laplace noise.

        Returns:
        - noisy_value: The noisy value after adding Laplace noise.
        Raises:
        - ValueError: If variable_type is neither 'int' nor 'float'.
        """
        if variable_type == 'int':
            make_measurement = make_base_discrete_laplace
        elif variable_type == 'float':
            make_measurement = make_base_laplace
        else:
            raise ValueError("Unsupported variable type")
        measurement = make_measurement(*input_space, scale=self.scale)
        return measurement(value)

    def randomized_response(self, true_value: List[Union[str, float, int]], variable_type: str = 'bool') -> list:
        """
        Function to add local noise depending on the type of variable.

        Parameters:
        - true_value: list of values into which noise is injected
        - variable_type: type of variable [Possible options: bool, categorical]
        Returns:
        - noisy_value: list with the noise injected (empty if true_value is empty)
        Raises:
        - ValueError: If variable_type is neither 'bool' nor 'categorical'.
        """
        if variable_type not in ('bool', 'categorical'):
            raise ValueError("Unsupported variable type")

        values = list(true_value)
        if not values:
            # Nothing to perturb; also avoids building a measurement from an
            # empty category list.
            return []

        if variable_type == 'bool':
            rr_measure = make_randomized_response_bool(prob=self.probability)
        else:
            # The categories are the distinct observed values. dict.fromkeys keeps
            # first-seen order, so the category list is the same on every run
            # (set iteration order is arbitrary).
            categories = list(dict.fromkeys(values))
            rr_measure = make_randomized_response(categories, prob=self.probability)

        # The measurement is applied to one value at a time.
        return [rr_measure(value) for value in values]

- Apply Local DP to the PUMF dataset

In [6]:
from tqdm import tqdm
# Register tqdm's pandas integration so that `progress_apply` (used below) is available.
tqdm.pandas()

def apply_local_dp(column, scale=1.2, probability=0.50):
    """
    Apply local differential privacy to a given column of data based on its data type.

    Categorical and boolean columns go through randomized response; integer and
    float columns (of any bit width) receive Laplace noise. Columns of any other
    dtype are returned unchanged, with a warning, because no mechanism is defined
    for them.

    Parameters:
    - column: A pandas Series representing the data column.
    - scale: Laplace noise scale for numeric columns (default 1.2).
    - probability: Randomized-response probability for categorical/boolean
      columns (default 0.50).

    Returns:
    - noisy_result: The column with local differential privacy applied, as a list;
      the original Series itself when the dtype is unsupported.
    """
    dtype = column.dtype

    # Classify with pandas' dtype helpers instead of comparing against the strings
    # 'int' / 'float': those comparisons only match the platform's default width,
    # so e.g. int32 or float32 columns would silently skip the noise.
    if isinstance(dtype, pd.CategoricalDtype):
        variable_type = 'categorical'
    elif pd.api.types.is_bool_dtype(dtype):
        variable_type = 'bool'
    elif pd.api.types.is_integer_dtype(dtype):
        variable_type = 'int'
    elif pd.api.types.is_float_dtype(dtype):
        variable_type = 'float'
    else:
        import warnings

        # Best-effort passthrough is kept, but make the privacy gap visible.
        warnings.warn(
            f"Column {column.name!r} with dtype {dtype} is not supported; "
            "returning it without local differential privacy."
        )
        return column

    privacy = LocalDifferentialPrivacy(scale, probability)
    values = column.tolist()

    if variable_type in ('categorical', 'bool'):
        return privacy.randomized_response(values, variable_type=variable_type)

    input_space = privacy.define_input_space(input_space_type='vector', variable_type=variable_type)
    return privacy.add_laplace_noise(input_space, values, variable_type=variable_type)

# Perturb every column of `df` independently (DataFrame apply works column-wise by
# default) while showing a progress bar. The recorded run took about 5.5 minutes
# for 16 columns.
# NOTE(review): the identifier-like columns ('Unnamed: 0', 'ID') are integers and so
# receive Laplace noise as well (visible in the output below) — confirm this is intended.
df_transformed = df.progress_apply(apply_local_dp)

100%|██████████| 16/16 [05:26<00:00, 20.43s/it]


In [7]:
# First ten rows of the original data, for comparison with the perturbed frame.
df.head(10)

Unnamed: 0,ID,GEO_PRV,GEODGHR4,DHH_SEX,DHHGMS,DHHGAGE,GEN_005,GEN_015,GEN_020,GEN_025,SMK_005,SMK_015,SMK_020,SMK_030,HWTDGHTM,HWTDGWTK
0,0,2,37,0,4,0,6,7,0,8,2,3,0,5,1.651,74.25
1,1,0,77,1,4,14,4,3,3,8,2,3,0,5,1.727,108.0
2,2,3,25,0,1,9,6,3,5,2,2,3,2,4,1.6,60.75
3,3,1,68,0,4,10,6,7,3,5,2,3,2,4,1.676,81.0
4,4,8,46,0,1,9,1,1,0,0,2,3,2,4,1.753,63.0
5,5,10,47,0,0,6,1,1,3,8,3,3,0,1,1.727,67.5
6,6,2,64,1,0,14,2,3,0,0,2,3,2,4,1.727,81.0
7,7,10,3,0,0,9,6,7,4,0,2,3,2,4,1.626,58.5
8,8,8,28,0,0,6,6,7,0,6,3,1,0,5,1.727,63.0
9,9,8,61,0,1,10,6,3,0,0,2,3,0,5,1.626,62.1


In [8]:
# First ten rows after local DP was applied to each column.
df_transformed.head(10)

Unnamed: 0,ID,GEO_PRV,GEODGHR4,DHH_SEX,DHHGMS,DHHGAGE,GEN_005,GEN_015,GEN_020,GEN_025,SMK_005,SMK_015,SMK_020,SMK_030,HWTDGHTM,HWTDGWTK
0,0,2,54,0,1,0,6,3,0,4,2,3,0,5,3.00482,76.699913
1,0,0,14,1,1,14,3,5,3,1,0,3,2,1,2.167687,109.170289
2,2,3,7,1,0,9,6,3,1,2,1,3,1,5,2.02799,58.717283
3,3,1,68,1,4,3,5,5,2,5,3,4,2,4,10.072377,79.69966
4,4,8,46,0,2,12,1,6,0,0,2,4,2,4,3.143938,63.268844
5,5,11,58,1,0,13,1,0,3,8,3,3,1,4,4.018417,65.84607
6,4,2,64,0,0,7,2,3,0,8,2,3,3,4,2.30302,77.902553
7,7,10,3,0,0,9,4,6,4,0,2,4,2,4,1.241321,59.050213
8,9,11,58,0,0,6,6,4,0,0,3,2,1,5,-0.323259,65.084638
9,9,3,3,0,1,13,0,2,4,0,2,3,2,2,1.714376,61.979884


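**TODO**
- Add a class-level description to `LocalDifferentialPrivacy`.
- Move `apply_local_dp` into the class as a method.
- Investigate the processing time (about 5.5 minutes for 16 columns in the run above).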