# This program to show Differential Privacy On Large Data Sets using PipeLine DP framework

In [23]:
import pipeline_dp
import pandas as pd
import numpy as np


In [24]:
url ="transactions.csv"
df_actual = pd.read_csv(url, sep=",")
df_actual.head()


Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS
0,0,0,2023-02-01 00:43:37,901,8047,82,2617,0
1,1,1,2023-02-01 01:20:13,2611,7777,15,4813,0
2,2,2,2023-02-01 01:22:52,4212,3336,53,4972,0
3,3,3,2023-02-01 01:26:40,1293,7432,59,5200,0
4,4,4,2023-02-01 01:52:23,2499,1024,25,6743,0


In [25]:
df_transactions = df_actual[['TRANSACTION_ID', 'TX_DATETIME','CUSTOMER_ID','TERMINAL_ID','TX_AMOUNT']]
df_transactions

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT
0,0,2023-02-01 00:43:37,901,8047,82
1,1,2023-02-01 01:20:13,2611,7777,15
2,2,2023-02-01 01:22:52,4212,3336,53
3,3,2023-02-01 01:26:40,1293,7432,59
4,4,2023-02-01 01:52:23,2499,1024,25
...,...,...,...,...,...
4557161,4557161,2023-08-02 21:46:12,1465,7455,92
4557162,4557162,2023-08-02 21:47:08,4009,3429,36
4557163,4557163,2023-08-02 21:54:43,1336,3116,50
4557164,4557164,2023-08-02 22:02:05,1611,3314,81


In [27]:
rows = [index_row[1] for index_row in df_transactions.iterrows()]



In [28]:
backend = pipeline_dp.LocalBackend()

# Define the total budget.

budget_accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-6)


In [29]:
# Create DPEngine which will execute the logic to generate the aggrgates 

dp_engine = pipeline_dp.DPEngine(budget_accountant, backend)

# Define privacy ID, partition key and aggregated value extractors.

# The aggregated value extractor isn't used for Count aggregates, but this is required for SUM, AVERAGE aggregates

data_extractors = pipeline_dp.DataExtractors(

   partition_extractor=lambda row: row.TERMINAL_ID,

   privacy_id_extractor=lambda row: row.CUSTOMER_ID,

   value_extractor=lambda row: 1)



In [30]:
params = pipeline_dp.AggregateParams(
    
   noise_kind=pipeline_dp.NoiseKind.LAPLACE,
   metrics=[pipeline_dp.Metrics.COUNT],
   max_partitions_contributed=100,
   max_contributions_per_partition=10
)

public_partitions=list(range(1, 10000))


In [31]:
dp_result = dp_engine.aggregate(rows, params, data_extractors, public_partitions)



In [32]:
# Compute budget per each DP operation. 
budget_accountant.compute_budgets()

dp_result = list(dp_result)


In [33]:
dp_dict=dict(dp_result)

myKeys = list(dp_dict.keys())

myKeys.sort()

sorted_dict = {i: dp_dict[i] for i in myKeys}

print(sorted_dict)


{1: MetricsTuple(count=118.48999979160726), 2: MetricsTuple(count=16.687205445952713), 3: MetricsTuple(count=-1189.256161088124), 4: MetricsTuple(count=109.51437858212739), 5: MetricsTuple(count=530.1425822116435), 6: MetricsTuple(count=-489.0114804888144), 7: MetricsTuple(count=1505.3477608039975), 8: MetricsTuple(count=572.219393979758), 9: MetricsTuple(count=722.3260674001649), 10: MetricsTuple(count=37.673876294866204), 11: MetricsTuple(count=-1021.9041625093669), 12: MetricsTuple(count=1609.4134199069813), 13: MetricsTuple(count=-280.8891239585355), 14: MetricsTuple(count=960.1251835245639), 15: MetricsTuple(count=-872.6462813727558), 16: MetricsTuple(count=-1123.0868885293603), 17: MetricsTuple(count=209.760621920228), 18: MetricsTuple(count=-1712.5097409766167), 19: MetricsTuple(count=3293.805405396037), 20: MetricsTuple(count=-196.56288849189878), 21: MetricsTuple(count=1000.1331801349297), 22: MetricsTuple(count=-1167.6526009598747), 23: MetricsTuple(count=353.2611897336319), 

In [35]:
dp_count = [0] * 10000

for count_sum_per_day in dp_result:

  index =  count_sum_per_day[0] - 1

  dp_count[index] = count_sum_per_day[1][0]

  print(dp_count[index])


-1272.376904513687
1101.5968578355387
2293.159721649252
2280.815651594661
1987.1162812011316
141.9085783790797
-132.20078204851598
294.2836410552263
-1173.0265384148806
763.087991181761
371.77432238683105
-685.9651831304654
-1096.9992240117863
-1758.5769064174965
389.20835462398827
-827.0608660876751
1913.534981363453
1870.728413454257
1329.054221233353
801.1945832744241
1834.9836927270517
-2976.2979457341135
-267.274098565802
49.69204710703343
953.5467522013932
-808.3468477483839
552.7927329987288
4603.372474822216
-5311.733992012218
952.8276801751927
2146.916608181782
-961.1431468604133
-478.0440750643611
779.7445535697043
-1278.6377703258768
-2353.204825948924
219.14152970723808
-763.3589775869623
59.88892082776874
517.7424837071449
-74.57017029076815
1304.8223313242197
-393.9348194785416
-640.8766093626618
-90.82608202658594
1839.4108619438484
4211.217345412821
531.0247091827914
1247.633808452636
-2467.85060765408
1928.1028954135254
1638.053015786223
313.62346964981407
686.69185204

In [37]:
df_counts = df_transactions.groupby(by='TERMINAL_ID').agg('count')

In [38]:
df_counts

Unnamed: 0_level_0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TX_AMOUNT
TERMINAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,375,375,375,375
1,267,267,267,267
2,412,412,412,412
3,472,472,472,472
4,500,500,500,500
...,...,...,...,...
9995,348,348,348,348
9996,335,335,335,335
9997,499,499,499,499
9998,493,493,493,493
