In [1]:
import pydp as dp  # by convention our package is to be imported as dp (for Differential Privacy!)
from pydp.algorithms.laplacian import BoundedSum, BoundedMean, Count, Max
import pandas as pd
import statistics  # for calculating mean without applying differential privacy

In [2]:
url = "transactions.csv"
df_actual = pd.read_csv(url, sep=",")

df = df_actual[['TRANSACTION_ID', 'TX_DATETIME','CUSTOMER_ID','TERMINAL_ID','TX_AMOUNT']]

df.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT
0,0,2023-02-01 00:43:37,901,8047,82
1,1,2023-02-01 01:20:13,2611,7777,15
2,2,2023-02-01 01:22:52,4212,3336,53
3,3,2023-02-01 01:26:40,1293,7432,59
4,4,2023-02-01 01:52:23,2499,1024,25


In [3]:
def mean_tx_amount(tid) -> float:
    dft = df[df["TERMINAL_ID"] == tid] 
    return statistics.mean(list(dft["TX_AMOUNT"]))

mean_tx_amount(1)

56.22097378277154

In [4]:
# calculates mean applying differential privacy
def private_mean_tx_amount(privacy_budget: float, tid) -> float:
    x = BoundedMean(privacy_budget,1,1,300)
    dft = df[df["TERMINAL_ID"] == id] 
    return x.quick_result(list(dft["TX_AMOUNT"]))

private_mean_tx_amount(0.6,1)

220.98103940917645

In [5]:
terminal_mean_vs_privacy_means=[]
for i in range (1, 100):
    mean = mean_tx_amount(i)
    privacy_mean = private_mean_tx_amount(0.9,i)
    terminal_mean_vs_privacy_means.append([i, mean,privacy_mean])
        

terminal_mean_vs_privacy_means_df = pd.DataFrame(terminal_mean_vs_privacy_means, columns=['Terminal Id','Mean','privacy_mean'])
terminal_mean_vs_privacy_means_df.head(10)

Unnamed: 0,Terminal Id,Mean,privacy_mean
0,1,56.220974,96.594534
1,2,52.366505,300.0
2,3,40.870763,1.0
3,4,52.102,300.0
4,5,47.911175,118.168421
5,6,52.302128,1.0
6,7,51.335979,186.967339
7,8,50.100806,163.890127
8,9,52.631893,1.0
9,10,48.978903,1.0


In [9]:
def count_tx_amount_above(limit: float,tid) -> int:
    dft = df[df["TERMINAL_ID"] == tid] 
    return dft[dft.TX_AMOUNT > limit].count()[0]

count_tx_amount_above(25.0,1)

232

In [14]:
def private_tx_amount_above(privacy_budget: float, limit: float,tid:int) -> float:
    dft = df[df["TERMINAL_ID"] == tid] 
    x = Count(privacy_budget, dtype="float")
    return x.quick_result(list(dft[dft.TX_AMOUNT > limit]["TX_AMOUNT"]))

private_tx_amount_above(0.1,25.0,1)

257

In [15]:
terminal_amount_vs_privacy_amont=[]
for i in range (1, 100):
    count = count_tx_amount_above(25.0,i)
    privacy_count = private_tx_amount_above(0.1,25.0,i)
    terminal_amount_vs_privacy_amont.append([i, count,privacy_count])
        

terminal_amount_vs_privacy_amont_df = pd.DataFrame(terminal_amount_vs_privacy_amont, columns=['Terminal Id','Count','privacy_count'])
terminal_amount_vs_privacy_amont_df.head(10)

#print("Above 100:\t" + str(count_tx_amount_above(40)))
#print("private tx_amount above:\t" + str(private_tx_amount_above(0.5, 40.0)))

Unnamed: 0,Terminal Id,Count,privacy_count
0,1,232,224
1,2,285,274
2,3,381,368
3,4,338,315
4,5,279,278
5,6,337,329
6,7,267,270
7,8,318,323
8,9,514,502
9,10,370,364


In [16]:
def max_tx_amount(tid:int) -> int:
    dft = df[df["TERMINAL_ID"] == tid] 
    return dft.max()["TX_AMOUNT"]

max_tx_amount(1)

87

In [18]:
def private_max_tx_amount(privacy_budget: float,tid:int) -> int:
    dft = df[df["TERMINAL_ID"] == tid] 
    x = Max(epsilon = privacy_budget, lower_bound = 100.0, upper_bound = 50000.0, dtype="float")
    return x.quick_result(list(dft["TX_AMOUNT"]))

private_max_tx_amount(0.5,1)

167.51941105013407

In [19]:
def sum_tx_amount(tid:int) -> float:
    dft = df[df["TERMINAL_ID"] == tid] 
    return dft.sum()["TX_AMOUNT"]
sum_tx_amount(1)

15011

In [75]:
def private_sum_tx_amount(privacy_budget: float, tid:int) -> float:
    dft = df[df["TERMINAL_ID"] == tid] 
    x = BoundedSum(epsilon = privacy_budget, delta = 0, lower_bound= 100.0, upper_bound = 50000.0, dtype="float")
    return x.quick_result(list(dft["TX_AMOUNT"]))

private_sum_tx_amount(0.6,1)

27759.46144104004

In [92]:
print("Sum:\t" + str(sum_tx_amount(2)))
print("Private Sum:\t" + str(private_sum_tx_amount(0.9,2)))

Sum:	62
Private Sum:	59321.32064503431


In [93]:
###. Membership inference Attack - Example

url = "2023-07-08.csv"
df = pd.read_csv(url, sep=",")

In [102]:
redact_dataset = df.copy()
redact_dataset = redact_dataset[1:]     

In [103]:
df.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
0,3909703,3909703,2023-07-08 00:16:41,2482,833,13,13565801,157
1,3909704,3909704,2023-07-08 01:14:55,1622,8781,90,13569295,157
2,3909705,3909705,2023-07-08 01:27:59,2507,4365,65,13570079,157
3,3909706,3909706,2023-07-08 01:37:14,2253,8773,97,13570634,157
4,3909707,3909707,2023-07-08 01:42:52,4866,7578,41,13570972,157


In [104]:
redact_dataset.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
1,3909704,3909704,2023-07-08 01:14:55,1622,8781,90,13569295,157
2,3909705,3909705,2023-07-08 01:27:59,2507,4365,65,13570079,157
3,3909706,3909706,2023-07-08 01:37:14,2253,8773,97,13570634,157
4,3909707,3909707,2023-07-08 01:42:52,4866,7578,41,13570972,157
5,3909708,3909708,2023-07-08 01:53:29,1584,7270,83,13571609,157


In [105]:
sum_original_dataset = sum(df["TX_AMOUNT"].to_list())
sum_redact_dataset = sum(redact_dataset["TX_AMOUNT"].to_list())
tx_amount_3909703 = sum_original_dataset - sum_redact_dataset
tx_amount_3909703

13

In [106]:
df.iloc[0, 5]
assert tx_amount_3909703 == df.iloc[0, 5]

In [107]:
dp_sum_original_dataset = BoundedSum(
    epsilon=1, lower_bound=1.0, upper_bound=500.0, dtype="float"
)
     

In [108]:
dp_sum_original_dataset.reset()
dp_sum_original_dataset.add_entries(
    df["TX_AMOUNT"].to_list()
)  

In [109]:
dp_sum_og = round(dp_sum_original_dataset.result(), 2)
print(dp_sum_og)

1265685.96


In [110]:
dp_redact_dataset = BoundedSum(epsilon=1, lower_bound=1.0, upper_bound=500.0, dtype="float")
dp_redact_dataset.add_entries(redact_dataset["TX_AMOUNT"].to_list())

In [111]:
dp_redact_dataset.memory_used()

160

In [112]:
dp_sum_redact = round(dp_redact_dataset.result(), 2)
print(dp_sum_redact)

1265276.4


In [113]:
round(dp_sum_og - dp_sum_redact, 2)

409.56

In [55]:
print("Difference in sum using DP: {}".format(round(dp_sum_og - dp_sum_redact, 2)))
print("Actual Value: {}".format(tx_amount_3909703))
assert round(dp_sum_og - dp_sum_redact, 2) != tx_amount_3909703

Difference in sum using DP: -736.35
Actual Value: 13


In [56]:
print("Sum of sales_value in the orignal Dataset: {}".format(sum_original_dataset))
print("Sum of sales_value in the orignal Dataset using DP: {}".format(dp_sum_og))
assert dp_sum_og != sum_original_dataset

Sum of sales_value in the orignal Dataset: 1266138
Sum of sales_value in the orignal Dataset using DP: 1265437.28


In [57]:
print("Sum of sales_value in the redacted Dataset: {}".format(sum_redact_dataset))
print("Sum of sales_value in the redacted Dataset using DP: {}".format(dp_sum_redact))
assert dp_sum_redact != sum_redact_dataset

Sum of sales_value in the redacted Dataset: 1266125
Sum of sales_value in the redacted Dataset using DP: 1266173.63


In [58]:
partial_dp_obj = BoundedSum(epsilon=1, lower_bound=5, upper_bound=250, dtype="float")
     

In [59]:
#### Querying on partial data

In [60]:
partial_dp_obj = BoundedSum(epsilon=1, lower_bound=5, upper_bound=250, dtype="float")

In [61]:
df

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
0,3909703,3909703,2023-07-08 00:16:41,2482,833,13,13565801,157
1,3909704,3909704,2023-07-08 01:14:55,1622,8781,90,13569295,157
2,3909705,3909705,2023-07-08 01:27:59,2507,4365,65,13570079,157
3,3909706,3909706,2023-07-08 01:37:14,2253,8773,97,13570634,157
4,3909707,3909707,2023-07-08 01:42:52,4866,7578,41,13570972,157
...,...,...,...,...,...,...,...,...
24898,3934601,3934601,2023-07-08 22:23:41,1629,7424,84,13645421,157
24899,3934602,3934602,2023-07-08 22:28:29,3911,3970,33,13645709,157
24900,3934603,3934603,2023-07-08 22:36:58,4823,7011,94,13646218,157
24901,3934604,3934604,2023-07-08 22:40:40,1004,2173,32,13646440,157


In [62]:
new_df_1 = df[0:12451]
new_df_2 = df[12452:]
print(new_df_1.shape, new_df_2.shape)

(12451, 8) (12451, 8)


In [63]:
new_df_1.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
0,3909703,3909703,2023-07-08 00:16:41,2482,833,13,13565801,157
1,3909704,3909704,2023-07-08 01:14:55,1622,8781,90,13569295,157
2,3909705,3909705,2023-07-08 01:27:59,2507,4365,65,13570079,157
3,3909706,3909706,2023-07-08 01:37:14,2253,8773,97,13570634,157
4,3909707,3909707,2023-07-08 01:42:52,4866,7578,41,13570972,157


In [64]:
new_df_2.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
12452,3922155,3922155,2023-07-08 12:00:50,1942,9916,56,13608050,157
12453,3922156,3922156,2023-07-08 12:00:50,4334,8027,9,13608050,157
12454,3922157,3922157,2023-07-08 12:00:53,2923,7734,12,13608053,157
12455,3922158,3922158,2023-07-08 12:00:55,2674,4593,65,13608055,157
12456,3922159,3922159,2023-07-08 12:00:56,1002,4058,7,13608056,157


In [65]:
partial_dp_obj.add_entries(
    new_df_1["TX_AMOUNT"].to_list()
)  

In [66]:
partial_dp_obj.privacy_budget_left()

1.0

In [67]:
partial_sum_dp = round(
    partial_dp_obj.result(privacy_budget=0.3), 2
)  # using only 30% of available privacy budget
print(partial_sum_dp)

637338.39


In [68]:
actual_partial_sum = round(sum(new_df_1["TX_AMOUNT"].to_list()), 2)
print(actual_partial_sum)

637049


In [69]:
print(
    "Difference in sum for first 3000 records which used only 30% privacy budget= {}".format(
        round(abs(actual_partial_sum - partial_sum_dp), 2)
    )
)

Difference in sum for first 3000 records which used only 30% privacy budget= 289.39


In [70]:
partial_dp_obj.privacy_budget_left()

0.7

In [71]:
partial_dp_obj.add_entries(
    new_df_2["TX_AMOUNT"].to_list()
)  # adding the remaining 2000 records to the list
partial_total_sum = round(partial_dp_obj.result(), 2)
print(partial_total_sum)

1266407.83


In [72]:
partial_dp_obj.privacy_budget_left()  # we have used up all the budget available to us
     

0.0

In [73]:
def sum_og_dataset(budget):
    """
    Sample Function to calculate BoundedSum on the whole dataset with budget as specified
    """
    dp_sum_original_dataset.reset()
    dp_sum_original_dataset.add_entries(df["TX_AMOUNT"].to_list())
    return round(dp_sum_original_dataset.result(budget), 2)

In [74]:
print("Actual Sum: {}".format(sum_original_dataset))
print("Sum from the previous run with privacy budget 1.0: {}".format(dp_sum_og))
print(
    "Sum when using privacy_budget as 0.7 on the whole dataset together: {}".format(
        sum_og_dataset(budget=0.7)
    )
)
print(
    "Sum from this run with privacy budget 0.7 on split dataset: {}".format(
        partial_total_sum
    )
)
     

Actual Sum: 1266138
Sum from the previous run with privacy budget 1.0: 1265437.28
Sum when using privacy_budget as 0.7 on the whole dataset together: 1266013.37
Sum from this run with privacy budget 0.7 on split dataset: 1266407.83
