In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, substring, concat, lit
from pyspark.sql.types import StringType
import subprocess
import os

# Obtain the Spark session for this job (an already-active session is reused)
spark = (
    SparkSession.builder
    .appName("DataAnonymization")
    .getOrCreate()
)

# Function to get file size
def get_file_size(file_path):
    """Return the total size in bytes of a file or of all files under a directory.

    The size is computed in-process rather than by shelling out to ``du -sb``,
    whose ``-b`` flag only exists in GNU coreutils, so the function also works
    where that tool is missing or differs.

    Args:
        file_path: Path to a regular file or a directory.

    Returns:
        0 if the path does not exist. For a file, its size in bytes. For a
        directory, the sum of the sizes of every file found beneath it
        (directory entries themselves are not counted).
    """
    if not os.path.exists(file_path):
        return 0
    if os.path.isfile(file_path):
        return os.path.getsize(file_path)

    total_bytes = 0
    for root, _dirnames, filenames in os.walk(file_path):
        for filename in filenames:
            entry = os.path.join(root, filename)
            try:
                # lstat: count a symlink's own size instead of following it
                total_bytes += os.lstat(entry).st_size
            except OSError as exc:
                # Best effort: report the unreadable/vanished entry and keep going
                print(f"Error getting file size: {exc}")
    return total_bytes

# Job parameters: source dataset, destination of the anonymized output,
# and a size target expressed in bytes (2 * 1024^3)
input_path = "randomly_generated_large_dataset"
output_path = "final_anonymized_data"
target_size = 2 * 1024 ** 3

# Load the source CSV: the first row supplies column names and the
# column types are inferred from the data
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(input_path)
)

# Display the resulting schema, then the first five rows without truncation
print("Original Data Schema:")
df.printSchema()
print("\nSample of Original Data:")
df.show(n=5, truncate=False)

# Anonymization functions
def anonymize_name(name):
    """Build a column expression that keeps a name's first character and appends '*****'.

    Args:
        name: Column (or column name) holding the value to mask.

    Returns:
        A Column concatenating the first character of ``name`` with five asterisks.
    """
    initial = substring(name, 1, 1)  # Spark substring positions are 1-based
    mask = lit('*****')
    return concat(initial, mask)

@udf(StringType())
def anonymize_address(address):
    """Mask every whitespace-separated token of an address that contains a digit.

    Tokens with at least one digit (for example house numbers and ZIP codes)
    are replaced by ``'XXX'``; all other tokens are kept as they are. Tokens
    are re-joined with single spaces, so runs of whitespace are collapsed.

    Args:
        address: The address string, or None for a null column value.

    Returns:
        The masked address string, or None when ``address`` is None.
    """
    # Null column values arrive as None; without this guard .split() raises
    # AttributeError inside the Python worker and fails the whole job.
    if address is None:
        return None
    masked_tokens = [
        'XXX' if any(ch.isdigit() for ch in token) else token
        for token in address.split()
    ]
    return ' '.join(masked_tokens)

# Project the anonymized view: both name columns are masked the same way,
# the address goes through the UDF, and date_of_birth is passed through as-is
anonymized_df = df.select(
    *[anonymize_name(col(field)).alias(field) for field in ("first_name", "last_name")],
    anonymize_address(col("address")).alias("address"),
    col("date_of_birth"),
)

# Persist the anonymized rows as CSV with a header row, replacing any earlier output
anonymized_df.write.mode("overwrite").option("header", True).csv(output_path)

# Measure what was written and report it in units of 1024^3 bytes
current_size = get_file_size(output_path)
print(f"\nAnonymized data size: {current_size / 1024 ** 3:.2f} GB")

# Preview the first five anonymized rows without truncating values
print("\nSample of Anonymized Data:")
anonymized_df.show(n=5, truncate=False)

# Shut the Spark session down now that the job is finished
spark.stop()

Original Data Schema:
root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- date_of_birth: date (nullable = true)


Sample of Original Data:
+----------+---------+------------------------------------+-------------+
|first_name|last_name|address                             |date_of_birth|
+----------+---------+------------------------------------+-------------+
|Matthew   |Rogers   |8491 Pine Way, Salem, CA 81643      |1999-02-06   |
|Nichole   |Hobbs    |4176 Cedar Ave, Bristol, CT 81309   |1957-12-11   |
|Jeremy    |Perez    |3367 Elm Ave, Franklin, GA 47854    |1956-01-17   |
|Lori      |Nicholson|8897 Maple Ave, Greenville, CA 17563|1969-06-10   |
|David     |Martinez |5404 Elm Ave, Franklin, DE 50127    |1993-12-19   |
+----------+---------+------------------------------------+-------------+
only showing top 5 rows


Anonymized data size: 2.36 GB

Sample of Anonymized Data:
+----------+---------+---------