PC Specification
| Intel Core i7-7500U - 2.70 GHz | 64-bit
| 8 GB RAM

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from spark_privacy_preserver.mondrian_preserver import Preserver
import time

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .getOrCreate()
)

# Switch on the Arrow execution flag for this session.
# NOTE(review): presumably required by the preserver's pandas-based processing -- confirm.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
# Location of the Adult dataset, relative to the notebook.
logFile = "data/adult.all.txt"

# The CSV is read without a header option, so the 15 column names are
# assigned positionally through toDF().
df = spark.read.csv(logFile).toDF(
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
)

# Columns passed to the preserver as categorical.
categorical = {
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
    'race',
    'income',
}

In [4]:
# K-Anonymity on the Adult dataset (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['age', 'occupation']
sensitive_column = 'income'

# Schema handed to the preserver: three string columns plus an integer
# "count", matching the columns of the table displayed by this cell.
schema = StructType(
    [StructField(name, StringType()) for name in ("age", "occupation", "income")]
    + [StructField("count", IntegerType())]
)
k = 2

# Anonymize and display the first rows of the result.
dfn = Preserver.k_anonymize(
    df,
    k,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----+------------------+------+-----+
|  age|        occupation|income|count|
+-----+------------------+------+-----+
|15-20|   Priv-house-serv| <=50k|   10|
|15-20|      Tech-support| <=50k|    5|
|   17|                 ?| <=50k|   99|
|   17|      Adm-clerical| <=50k|   39|
|   17|      Craft-repair| <=50k|   18|
|   17|   Exec-managerial| <=50k|    3|
|   17|   Farming-fishing| <=50k|   17|
|   17| Handlers-cleaners| <=50k|   58|
|   17| Machine-op-inspct| <=50k|    3|
|   17|     Other-service| <=50k|  184|
|   17|   Priv-house-serv| <=50k|   10|
|   17|    Prof-specialty| <=50k|   16|
|   17|   Protective-serv| <=50k|    4|
|   17|             Sales| <=50k|  137|
|   17|  Transport-moving| <=50k|    6|
|   18|                 ?| <=50k|  158|
|   18|      Adm-clerical| <=50k|   83|
|   18|      Craft-repair| <=50k|   27|
|   18|   Exec-managerial| <=50k|   13|
|   18|   Farming-fishing| <=50k|   25|
+-----+------------------+------+-----+
only showing top 20 rows

35.49618148803

In [5]:
# L-Diversity on the Adult dataset (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['age', 'occupation']
sensitive_column = 'income'

# Schema handed to the preserver; same columns as the table this cell displays.
schema = (
    StructType()
    .add("age", StringType())
    .add("occupation", StringType())
    .add("income", StringType())
    .add("count", IntegerType())
)

# Privacy parameters passed to l_diversity, in the order (k, l).
k = 2
l = 2

# Anonymize and display the first rows of the result.
dfn = Preserver.l_diversity(
    df,
    k,
    l,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----+--------------------+------+-----+
|  age|          occupation|income|count|
+-----+--------------------+------+-----+
|15-20|                   ?| <=50k|  439|
|15-20|                   ?|  >50k|    1|
|15-20|       Other-service| <=50k|  652|
|15-20|       Other-service|  >50k|    1|
|15-20|      Prof-specialty| <=50k|   53|
|15-20|      Prof-specialty|  >50k|    1|
|15-25|        Adm-clerical| <=50k|  594|
|15-25|        Adm-clerical|  >50k|    2|
|15-25|        Craft-repair| <=50k|  470|
|15-25|        Craft-repair|  >50k|    6|
|15-25|     Exec-managerial| <=50k|  248|
|15-25|     Exec-managerial|  >50k|    1|
|15-25| Farming-fishing,...| <=50k|  380|
|15-25| Farming-fishing,...|  >50k|    5|
|15-25|   Machine-op-inspct| <=50k|  365|
|15-25|   Machine-op-inspct|  >50k|    2|
|15-25|     Protective-serv| <=50k|   46|
|15-25|     Protective-serv|  >50k|    1|
|15-30|   Handlers-cleaners| <=50k|  764|
|15-30|   Handlers-cleaners|  >50k|    3|
+-----+--------------------+------

In [6]:
# T-Closeness on the Adult dataset (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['age', 'occupation']
sensitive_column = 'income'

# Schema handed to the preserver; same columns as the table this cell displays.
schema = StructType(
    [StructField(name, StringType()) for name in ("age", "occupation", "income")]
    + [StructField("count", IntegerType())]
)

# Privacy parameters passed to t_closeness, in the order (k, t).
k = 2
t = 0.2

# Anonymize and display the first rows of the result.
dfn = Preserver.t_closeness(
    df,
    k,
    t,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----+--------------------+------+-----+
|  age|          occupation|income|count|
+-----+--------------------+------+-----+
|15-30| Farming-fishing,...| <=50k|  760|
|15-30| Farming-fishing,...|  >50k|   37|
|15-30| Machine-op-inspc...| <=50k| 1918|
|15-30| Machine-op-inspc...|  >50k|   81|
|15-30|      Prof-specialty| <=50k|  568|
|15-30|      Prof-specialty|  >50k|   35|
|15-40| ?, Priv-house-se...| <=50k| 3049|
|15-40| ?, Priv-house-se...|  >50k|  679|
|15-40|        Adm-clerical| <=50k| 2772|
|15-40|        Adm-clerical|  >50k|  195|
|15-40| Handlers-cleaner...| <=50k| 2017|
|15-40| Handlers-cleaner...|  >50k|  151|
|15-40| Protective-serv,...| <=50k| 3271|
|15-40| Protective-serv,...|  >50k|  171|
|15-40|               Sales| <=50k| 2486|
|15-40|               Sales|  >50k|  412|
|25-35|     Farming-fishing| <=50k|  129|
|25-35|     Farming-fishing|  >50k|   12|
|   26|      Prof-specialty| <=50k|  128|
|   26|      Prof-specialty|  >50k|   19|
+-----+--------------------+------

In [7]:
# Switch to the call-log dataset. This rebinds `logFile`, `df` and
# `categorical`, so every later cell works on the calls data.
logFile = "data/calls.csv"

# Unlike the Adult file, this CSV is read with its first row as the header.
df = spark.read.csv(logFile, header=True)

# Columns passed to the preserver as categorical.
categorical = {
    'user',
    'other',
    'direction',
    'timestamp',
}

In [8]:
# K-Anonymize without row suppression (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['user', 'other', 'direction']
sensitive_column = 'timestamp'

# Schema handed to the preserver. There is no "count" column here: the
# table displayed by this cell lists individual rows.
schema = (
    StructType()
    .add("user", StringType())
    .add("other", StringType())
    .add("direction", StringType())
    .add("duration", DoubleType())
    .add("timestamp", StringType())
)

k = 2

# Anonymize and display the first rows of the result.
dfn = Preserver.k_anonymize_w_user(
    df,
    k,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----------+-----------+---------+--------+--------------------+
|       user|      other|direction|duration|           timestamp|
+-----------+-----------+---------+--------+--------------------+
|07086312446|01304635046| Incoming|    23.0|Mon Dec 06 11:03:...|
|07086312446|01304635046| Incoming|    30.0|Mon Dec 13 14:33:...|
|07086312446|01304635046| Incoming|   114.0|Mon Dec 20 14:51:...|
|07086312446|01304635046| Incoming|    40.0|Mon Dec 20 16:19:...|
|07086312446|01304635046| Incoming|   105.0|Mon Nov 08 11:53:...|
|07086312446|01304635046| Incoming|    31.0|Mon Nov 15 12:50:...|
|07086312446|01304635046| Incoming|    45.0|Mon Nov 22 15:43:...|
|07086312446|01304635046| Incoming|   101.0|Thu Dec 16 13:20:...|
|07086312446|01304635046| Incoming|     7.0|Thu Dec 16 16:22:...|
|07086312446|01304635046| Incoming|    43.0|Thu Jan 27 12:47:...|
|07086312446|01304635046| Incoming|    46.0|Thu Jan 27 12:52:...|
|07086312446|01304635046| Incoming|    34.0|Thu Nov 04 14:53:...|
|070863124

In [9]:
# L-diversity without row suppression (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['user', 'other', 'direction']
sensitive_column = 'timestamp'

# Schema handed to the preserver. There is no "count" column here: the
# table displayed by this cell lists individual rows.
schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ("user", StringType()),
        ("other", StringType()),
        ("direction", StringType()),
        ("duration", DoubleType()),
        ("timestamp", StringType()),
    )
])

# Privacy parameters passed to l_diversity_w_user, in the order (k, l).
k = 2
l = 2

# Anonymize and display the first rows of the result.
dfn = Preserver.l_diversity_w_user(
    df,
    k,
    l,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----------+-----------+---------+--------+--------------------+
|       user|      other|direction|duration|           timestamp|
+-----------+-----------+---------+--------+--------------------+
|07086312446|01304635046| Incoming|    23.0|Mon Dec 06 11:03:...|
|07086312446|01304635046| Incoming|    30.0|Mon Dec 13 14:33:...|
|07086312446|01304635046| Incoming|   114.0|Mon Dec 20 14:51:...|
|07086312446|01304635046| Incoming|    40.0|Mon Dec 20 16:19:...|
|07086312446|01304635046| Incoming|   105.0|Mon Nov 08 11:53:...|
|07086312446|01304635046| Incoming|    31.0|Mon Nov 15 12:50:...|
|07086312446|01304635046| Incoming|    45.0|Mon Nov 22 15:43:...|
|07086312446|01304635046| Incoming|   101.0|Thu Dec 16 13:20:...|
|07086312446|01304635046| Incoming|     7.0|Thu Dec 16 16:22:...|
|07086312446|01304635046| Incoming|    43.0|Thu Jan 27 12:47:...|
|07086312446|01304635046| Incoming|    46.0|Thu Jan 27 12:52:...|
|07086312446|01304635046| Incoming|    34.0|Thu Nov 04 14:53:...|
|070863124

In [10]:
# T-closeness without row suppression (wall-clock timed).

start = time.time()

# Columns to anonymize and the sensitive attribute kept alongside them.
feature_columns = ['user', 'other', 'direction']
sensitive_column = 'timestamp'

# Schema handed to the preserver. There is no "count" column here: the
# L-diversity variant of this cell, run with the same schema, displays
# individual rows.
schema = StructType([
    StructField("user", StringType()),
    StructField("other", StringType()),
    StructField("direction", StringType()),
    StructField("duration", DoubleType()),
    StructField("timestamp", StringType()),
])

# Privacy parameters passed to t_closeness_w_user, in the order (k, t).
k = 2
t = 0.2

# Anonymize with the T-closeness variant. The cell previously called
# l_diversity_w_user here, which passed the threshold t in the position of
# the l parameter and so re-ran L-diversity instead of T-closeness.
dfn = Preserver.t_closeness_w_user(
    df, k, t, feature_columns, sensitive_column, categorical, schema)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print((end - start), " seconds")

+-----------+-----------+---------+--------+--------------------+
|       user|      other|direction|duration|           timestamp|
+-----------+-----------+---------+--------+--------------------+
|07086312446|01304635046| Incoming|    23.0|Mon Dec 06 11:03:...|
|07086312446|01304635046| Incoming|    30.0|Mon Dec 13 14:33:...|
|07086312446|01304635046| Incoming|   114.0|Mon Dec 20 14:51:...|
|07086312446|01304635046| Incoming|    40.0|Mon Dec 20 16:19:...|
|07086312446|01304635046| Incoming|   105.0|Mon Nov 08 11:53:...|
|07086312446|01304635046| Incoming|    31.0|Mon Nov 15 12:50:...|
|07086312446|01304635046| Incoming|    45.0|Mon Nov 22 15:43:...|
|07086312446|01304635046| Incoming|   101.0|Thu Dec 16 13:20:...|
|07086312446|01304635046| Incoming|     7.0|Thu Dec 16 16:22:...|
|07086312446|01304635046| Incoming|    43.0|Thu Jan 27 12:47:...|
|07086312446|01304635046| Incoming|    46.0|Thu Jan 27 12:52:...|
|07086312446|01304635046| Incoming|    34.0|Thu Nov 04 14:53:...|
|070863124

In [11]:
# Single user anonymization (wall-clock timed).

start = time.time()

# NOTE: feature_columns is assigned for parity with the other cells but is
# not passed to anonymize_user below.
feature_columns = ['user', 'other', 'direction']
sensitive_column = 'timestamp'

# Every column is declared as a string in this cell, including "duration".
schema = StructType([
    StructField(column, StringType())
    for column in ("user", "other", "direction", "duration", "timestamp")
])

# The user to anonymize, the column that identifies users, and k.
user = '07641036117'
usercolumn_name = "user"
k = 3

# Anonymize that user's rows and display the first rows of the result.
dfn = Preserver.anonymize_user(
    df,
    k,
    user,
    usercolumn_name,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

end = time.time()

# Elapsed time covers both the anonymization call and show().
print(end - start, " seconds")

+-----------+--------------------+--------------------+--------+--------------------+
|       user|               other|           direction|duration|           timestamp|
+-----------+--------------------+--------------------+--------+--------------------+
|07610039694|         07434677419|            Incoming|     211|Wed Sep 15 19:17:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 07:18:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 07:45:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 08:04:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 08:05:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 08:06:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Mon Feb 11 08:06:...|
|07641036117|01666472054,07371...|Outgoing,Incoming...|    0-50|Thu Sep 09 19:35:...|
|07981267897|         07784425582|            Outgoing