In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import PandasUDFType, lit, pandas_udf
from pyspark.sql import SparkSession
from spark_privacy_preserver.clustering_preserver import Preserver
import pandas as pd

In [2]:
# Obtain (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .getOrCreate()
)
# Turn on the Arrow execution flag for this session.
# NOTE(review): newer Spark releases document this key as deprecated in favour
# of "spark.sql.execution.arrow.pyspark.enabled" — confirm against the Spark
# version this notebook targets before changing it.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Six sample records; each row lists its values in column1..column5 order.
data = [
    [6, '1', 'test1', 'x', 20],
    [6, '1', 'test1', 'y', 30],
    [8, '2', 'test2', 'x', 50],
    [8, '2', 'test2', 'x', 45],
    [4, '1', 'test2', 'y', 35],
    [4, '2', 'test3', 'y', 20],
]

# Input schema: column1 and column5 are integers, column2..column4 are strings.
cSchema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("column1", IntegerType),
        ("column2", StringType),
        ("column3", StringType),
        ("column4", StringType),
        ("column5", IntegerType),
    )
])
df = spark.createDataFrame(data, schema=cSchema)

# Names of the string-typed columns of `df`.
# NOTE(review): none of the cells shown here read `categorical` — confirm
# whether it is still needed.
categorical = {'column2', 'column3', 'column4'}

In [3]:
# Column-role settings passed to the Preserver calls in the following cells.
# NOTE(review): QI presumably lists the quasi-identifier columns and SA the
# sensitive attribute — confirm against the Preserver documentation.
QI = ['column1', 'column2', 'column3']
SA = ['column4']
# NOTE(review): CI looks like the positions within QI of the string-typed
# columns (column2, column3) — TODO confirm.
CI = [1, 2]

# Schema handed to the Preserver calls: column1..column4, every field a string.
# column1 is an integer in `df`, but the displayed results hold values such as
# "4-8", so it is declared as a string here.
schema = StructType([
    StructField(column_name, StringType())
    for column_name in ("column1", "column2", "column3", "column4")
])

In [4]:
# Run the clustering-based k-anonymisation on `df` with k=2 and display the result.
# NOTE(review): the meaning of mode / center_type / return_mode / iter is
# defined by Preserver.k_anonymize, which is not visible here — see its docs.
k_df = Preserver.k_anonymize(
    df,
    schema,
    QI,
    SA,
    CI,
    k=2,
    mode='',
    center_type='fbcg',
    return_mode='Not_equal',
    iter=1,
)
k_df.show()

+-------+-------+-----------------+-------+
|column1|column2|          column3|column4|
+-------+-------+-----------------+-------+
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      y|
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      y|
|    4-8|    1,2|test1,test2,test3|      y|
+-------+-------+-----------------+-------+



In [5]:
# Apply l-diversity (l=2) to the k-anonymised frame and display the result.
# write_to_file=False is passed, so the result is only shown here
# (presumably nothing is persisted — confirm in Preserver.l_diverse).
l_df = Preserver.l_diverse(
    k_df,
    schema,
    QI,
    SA,
    write_to_file=False,
    l=2,
)
l_df.show()

+-------+-------+-----------------+-------+
|column1|column2|          column3|column4|
+-------+-------+-----------------+-------+
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      y|
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      x|
|    4-8|    1,2|test1,test2,test3|      y|
|    4-8|    1,2|test1,test2,test3|      y|
+-------+-------+-----------------+-------+



In [6]:
    # Apply t-closeness (t=0.3) to the k-anonymised frame and display the result.
    # NOTE(review): this cell is indented by four spaces without an enclosing
    # block. IPython strips such leading indentation, but a plain .py export
    # would not run — consider dedenting the cell.
    t_df = Preserver.t_closer(
        k_df,
        schema,
        QI,
        SA,
        t=0.3,
        write_to_file=False,
        verbose=1,
    )
    t_df.show()

+-------+-------+-----------------+-------+
|column1|column2|          column3|column4|
+-------+-------+-----------------+-------+
|    4-6|    1,2|test1,test2,test3|      x|
|    4-6|    1,2|test1,test2,test3|      y|
|      8|      2|            test2|      x|
|      8|      2|            test2|      x|
|    4-6|    1,2|test1,test2,test3|      y|
|    4-6|    1,2|test1,test2,test3|      y|
+-------+-------+-----------------+-------+

