In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from scipy.stats import entropy

In [None]:
class DataAnonymizer:
    """Filter a Spark DataFrame down to rows that satisfy privacy criteria.

    Each method replaces ``self.df`` with the subset of rows whose
    quasi-identifier group passes the corresponding check, so the checks can
    be applied one after another on the same instance.
    """

    def __init__(self, df):
        """Store the DataFrame that the anonymization steps will narrow down."""
        self.df = df

    @staticmethod
    def _normalize_columns(quasi_identifiers):
        """Return the quasi-identifier column names as a non-empty list.

        Accepts either a single column name or an iterable of names.

        Raises:
            ValueError: if no column name is supplied.
        """
        if isinstance(quasi_identifiers, str):
            return [quasi_identifiers]
        columns = list(quasi_identifiers)
        if not columns:
            raise ValueError("quasi_identifiers must name at least one column")
        return columns

    def k_anonymity(self, quasi_identifiers, k=2):
        """Keep only rows whose quasi-identifier group has at least ``k`` rows.

        Args:
            quasi_identifiers: column name, or iterable of column names, that
                together define a group.
            k: minimum number of rows a group needs in order to be kept.

        Note:
            The surviving groups are joined back with an inner equi-join, so
            rows holding a null in any quasi-identifier column never match
            and are dropped, and the join columns come first in the result.
        """
        columns = self._normalize_columns(quasi_identifiers)
        group_sizes = self.df.groupBy(*columns).count()
        valid_groups = group_sizes.filter(F.col('count') >= k).select(*columns)
        self.df = self.df.join(valid_groups, on=columns, how='inner')

    def l_diversity(self, quasi_identifiers, sensitive_column, l=2):
        """Keep only groups with at least ``l`` distinct sensitive values.

        Args:
            quasi_identifiers: column name, or iterable of column names, that
                together define a group.
            sensitive_column: column whose distinct values are counted.
            l: minimum number of distinct sensitive values per group.

        Note:
            The surviving groups are joined back with an inner equi-join, so
            rows holding a null in any quasi-identifier column never match
            and are dropped, and the join columns come first in the result.
        """
        columns = self._normalize_columns(quasi_identifiers)
        # Counting distinct values directly on the rows gives the same result
        # as pre-aggregating per (group, value) pair first.
        diversity_count = self.df.groupBy(*columns).agg(
            F.countDistinct(sensitive_column).alias('l_diversity')
        )
        valid_groups = diversity_count.filter(F.col('l_diversity') >= l).select(*columns)
        self.df = self.df.join(valid_groups, on=columns, how='inner')

    def t_closeness(self, quasi_identifiers, sensitive_column, t=0.2):
        """Keep only groups whose sensitive-value distribution is near the global one.

        For every sensitive value that occurs in a group, the share of that
        value inside the group is compared with its share in the whole
        DataFrame. A group is kept when the largest absolute difference is at
        most ``t``.

        Args:
            quasi_identifiers: column name, or iterable of column names, that
                together define a group.
            sensitive_column: column whose distribution is compared.
            t: maximum tolerated absolute difference between the local and
                the global probability of any value present in the group.

        Note:
            Only values that actually occur in a group contribute to its
            score; a value that is globally common but absent from the group
            is not penalised.
        """
        columns = self._normalize_columns(quasi_identifiers)
        total_count = self.df.count()
        if total_count == 0:
            # Nothing to score, and it avoids dividing by a zero total below.
            return

        # A Spark GroupedData object cannot be iterated group by group, so the
        # per-group statistics are computed with window aggregates instead.
        by_group = Window.partitionBy(*columns)
        by_group_and_value = Window.partitionBy(*columns, sensitive_column)
        by_value = Window.partitionBy(sensitive_column)
        row_count = F.count(F.lit(1))

        # Double-underscore names keep the scratch columns clear of user columns.
        scratch = ['__group_size', '__local_count', '__global_count',
                   '__abs_diff', '__max_diff']
        scored = (
            self.df
            .withColumn('__group_size', row_count.over(by_group))
            .withColumn('__local_count', row_count.over(by_group_and_value))
            .withColumn('__global_count', row_count.over(by_value))
            .withColumn(
                '__abs_diff',
                F.abs(
                    F.col('__local_count') / F.col('__group_size')
                    - F.col('__global_count') / F.lit(total_count)
                ),
            )
            .withColumn('__max_diff', F.max('__abs_diff').over(by_group))
        )
        # Filtering on the per-group maximum matches on every quasi-identifier
        # column at once, not just the first one.
        self.df = scored.filter(F.col('__max_diff') <= t).drop(*scratch)

# Initialize Spark Session

In [None]:
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Anonymization Techniques")
    .getOrCreate()
)

# Sample DataFrame

In [None]:
# One (name, age, occupation) record per line.
data = [
    ("Alice", 25, "Engineer"),
    ("Bob", 25, "Doctor"),
    ("Catherine", 30, "Engineer"),
    ("David", 30, "Engineer"),
    ("Eva", 35, "Engineer"),
    ("Frank", 35, "Doctor"),
]

In [None]:
# Build the sample DataFrame: one (name, age, occupation) record per row.
data = [
    ("Alice", 25, "Engineer"),
    ("Bob", 25, "Doctor"),
    ("Catherine", 30, "Engineer"),
    ("David", 30, "Engineer"),
    ("Eva", 35, "Engineer"),
    ("Frank", 35, "Doctor"),
]
df = spark.createDataFrame(data, schema=["Name", "Age", "Occupation"])

# Every step below narrows anonymizer.df in place, so the order matters.
anonymizer = DataAnonymizer(df)

# K-anonymity: an age group needs at least two rows to survive.
anonymizer.k_anonymity(quasi_identifiers=["Age"], k=2)

# L-diversity: an age group needs at least two distinct occupations.
anonymizer.l_diversity(quasi_identifiers=["Age"], sensitive_column="Occupation", l=2)

# T-closeness: the 0.2 threshold is only a starting point; tune as needed.
anonymizer.t_closeness(quasi_identifiers=["Age"], sensitive_column="Occupation", t=0.2)

# Print whatever rows passed all three checks.
anonymizer.df.show()