In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from spark_privacy_preserver.mondrian_preserver import Preserver

In [2]:
# Obtain the Spark session for this notebook: an already-running session is
# reused, otherwise a new one is started under the application name "SimpleApp".
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .getOrCreate()
)

# Switch on Arrow-based data exchange between Spark and pandas.
# NOTE(review): presumably required by the preserver's pandas-based processing
# -- confirm. On Spark 3.x this key is deprecated in favour of
# "spark.sql.execution.arrow.pyspark.enabled"; verify against the Spark
# version this notebook targets before changing it.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
# Six sample records; each row is (column1, column2, column3, column4, column5).
data = [
    [6, '1', 'test1', 'x', 20],
    [6, '1', 'test1', 'y', 30],
    [8, '2', 'test2', 'x', 50],
    [8, '2', 'test3', 'x', 45],
    [8, '1', 'test2', 'y', 35],
    [4, '2', 'test3', 'y', 20],
]

# Input schema: integer columns at both ends, string columns in between.
cSchema = (
    StructType()
    .add("column1", IntegerType())
    .add("column2", StringType())
    .add("column3", StringType())
    .add("column4", StringType())
    .add("column5", IntegerType())
)

# Source DataFrame used by every anonymization example in this notebook.
df = spark.createDataFrame(data, schema=cSchema)
df.show()

+-------+-------+-------+-------+-------+
|column1|column2|column3|column4|column5|
+-------+-------+-------+-------+-------+
|      6|      1|  test1|      x|     20|
|      6|      1|  test1|      y|     30|
|      8|      2|  test2|      x|     50|
|      8|      2|  test3|      x|     45|
|      8|      1|  test2|      y|     35|
|      4|      2|  test3|      y|     20|
+-------+-------+-------+-------+-------+



In [4]:
# K-Anonymity
#
# Anonymizes `df` over the chosen feature columns and shows the result. In the
# output shown below the cell, the result has one row per
# (generalized features, sensitive value) pair together with a `count` column.

# --- parameters ---
k = 2
feature_columns = ['column1', 'column2', 'column3']
sensitive_column = 'column4'
# Columns handed to the preserver as categorical.
categorical = {'column2', 'column3', 'column4'}

# Schema of the anonymized result. column1 is declared as a string here
# because the output shows it generalized to values such as "0-10".
schema = StructType(
    [StructField(name, StringType())
     for name in ("column1", "column2", "column3", "column4")]
    + [StructField("count", IntegerType())]
)

# --- anonymize and display ---
dfn = Preserver.k_anonymize(
    df,
    k,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

+-------+-------+-----------+-------+-----+
|column1|column2|    column3|column4|count|
+-------+-------+-----------+-------+-----+
|   0-10|    1,2|test3,test1|      x|    1|
|   0-10|    1,2|test3,test1|      y|    2|
|      8|    1,2|test3,test2|      x|    2|
|      8|    1,2|test3,test2|      y|    1|
+-------+-------+-----------+-------+-----+



In [5]:
# K-Anonymity without row suppression
#
# Same idea as plain k-anonymity, but every input row is kept: the output
# shown below the cell has six rows, one per input record, with the feature
# columns generalized and column1 left untouched.

# --- parameters ---
k = 2
feature_columns = ['column2', 'column3', 'column5']
sensitive_column = 'column4'
# Columns handed to the preserver as categorical.
categorical = {'column2', 'column3', 'column4'}

# Schema of the anonymized result. column1 stays an integer (it is not a
# feature column here); column5 becomes a string because the output shows it
# generalized to ranges such as "20-40".
schema = StructType(
    [StructField("column1", IntegerType())]
    + [StructField(name, StringType())
       for name in ("column2", "column3", "column4", "column5")]
)

# --- anonymize and display ---
dfn = Preserver.k_anonymize_w_user(
    df,
    k,
    feature_columns,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

+-------+-------+-----------+-------+-------+
|column1|column2|    column3|column4|column5|
+-------+-------+-----------+-------+-------+
|      6|      1|test1,test2|      x|  20-40|
|      6|      1|test1,test2|      y|  20-40|
|      8|      1|test1,test2|      y|  20-40|
|      8|      2|test2,test3|      x|  20-55|
|      8|      2|test2,test3|      x|  20-55|
|      4|      2|test2,test3|      y|  20-55|
+-------+-------+-----------+-------+-------+



In [6]:
# Single user anonymization
#
# Anonymizes only the records of one user, identified by a value in a given
# column. In the output shown below the cell, the rows with column1 == 6 (and
# one further row) are generalized while the remaining rows are unchanged.
# NOTE(review): the extra generalized row is presumably pulled in to reach
# k = 2 per group -- confirm against the preserver's documentation.

# --- parameters ---
k = 2
user = 6
usercolumn_name = "column1"
sensitive_column = 'column4'
# Columns handed to the preserver as categorical.
categorical = {'column2', 'column3', 'column4'}

# Schema of the anonymized result: every column is declared as a string.
schema = StructType([
    StructField(name, StringType())
    for name in ("column1", "column2", "column3", "column4", "column5")
])

# --- anonymize and display ---
dfn = Preserver.anonymize_user(
    df,
    k,
    user,
    usercolumn_name,
    sensitive_column,
    categorical,
    schema,
)
dfn.show()

+-------+-------+-----------+-------+-------+
|column1|column2|    column3|column4|column5|
+-------+-------+-----------+-------+-------+
|      6|      1|test1,test2|      x|  20-40|
|      6|      1|test1,test2|      y|  20-40|
|      8|      2|      test2|      x|     50|
|      8|      2|      test3|      x|     45|
|      8|      1|test1,test2|      y|  20-40|
|      4|      2|      test3|      y|     20|
+-------+-------+-----------+-------+-------+

