## Data Anonymization

Importing required libraries

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sha2

Creating a Spark session to work with Databricks

In [0]:
# getOrCreate() hands back the already-running session when one exists,
# otherwise it builds a new one named "DataAnonymizer".
spark = (
    SparkSession.builder
    .appName("DataAnonymizer")
    .getOrCreate()
)

Loading the input CSV file into a DataFrame

In [0]:
# Path of the raw CSV that holds the personal data to anonymize.
input_csv_path = '/FileStore/tables/personal_info.csv'

# The first row supplies the column names; column types are inferred from the data.
input_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(input_csv_path)
)
print(f"Input DataFrame schema: {input_df.schema}")

Input DataFrame schema: StructType([StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('address', StringType(), True), StructField('date_of_birth', DateType(), True)])


Defining a helper that anonymizes selected columns using SHA-256 hashing

In [0]:
def anonymize_columns(df, columns_to_anonymize):
    """Return a new DataFrame with the selected columns replaced by SHA-256 digests.

    Every selected column is replaced by ``sha2(column, 256)`` under its
    original name. All other columns pass through unchanged, and the column
    order of ``df`` is preserved.

    Args:
        df: Spark DataFrame to anonymize.
        columns_to_anonymize: A single column name, or an iterable of column
            names, to hash. Names are matched exactly against ``df.columns``.

    Returns:
        A DataFrame with the same column names and order as ``df``.

    Raises:
        ValueError: If any requested column is not present in ``df.columns``.
            Failing loudly prevents a typo from silently leaving a sensitive
            column in clear text.

    Note:
        The hash is unsalted, so equal inputs always yield equal digests.
        NOTE(review): for low-entropy values such as names this may be
        reversible by dictionary lookup - confirm it meets the privacy
        requirements of the data set.
    """
    # A bare string would otherwise be matched by substring
    # (e.g. column "a" is "in" the string "address"), so treat it as one name.
    if isinstance(columns_to_anonymize, str):
        targets = {columns_to_anonymize}
    else:
        targets = set(columns_to_anonymize)

    # Refuse to continue if a requested column does not exist.
    missing = targets.difference(df.columns)
    if missing:
        raise ValueError(
            f"Columns to anonymize not found in DataFrame: {sorted(missing)}"
        )

    selected = [
        sha2(col(name), 256).alias(name) if name in targets else col(name)
        for name in df.columns
    ]
    return df.select(*selected)

Applying data anonymization to the input DataFrame

In [0]:
# Columns whose values are replaced by SHA-256 digests.
# date_of_birth is not listed, so it is carried over unchanged.
anonymization_columns = ["first_name", "last_name", "address"]

anonymized_df = anonymize_columns(
    df=input_df,
    columns_to_anonymize=anonymization_columns,
)
print("Data anonymization completed")

Data anonymization completed


Saving the anonymized DataFrame in CSV format (Spark writes a directory of part files at the given path)

In [0]:
# Destination for the anonymized output.
output_file_path = '/FileStore/tables/anonymized_personal_info.csv'

# Replace any previous output at this path and emit a header row.
(
    anonymized_df.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_file_path)
)
print(f"Anonymized data written to {output_file_path}")

Anonymized data written to /FileStore/tables/anonymized_personal_info.csv


Here is a sample of the anonymized DataFrame

In [0]:
# Render at most 25 rows of the anonymized result as a table.
display(
    anonymized_df.limit(25)
)

first_name,last_name,address,date_of_birth
1f02e917fd21530620990a81a8ddce4a8cb1f0e57770e393877e4a68a2984ff8,d9e1b51ac9805a3979ca7c91a3c612b2d5875949c994c5c0bc07947886b76eed,e4c458ae39b73eccd872fc1f49f857ff0dc527841e0b19aa7d8a845d54439e3e,1936-04-15
c10873196eb1124ed74461c20a67094e395f2310f6305607b9694ee6b1ee8b43,716545ea5827317b597b9f531b753bb931989bbe63df4307ef312fdb7374a154,1d0bc6d0b1f43d58111f7fb9f47d647ea1a6e404631279f1be79ef748ddda451,1953-07-10
eb1c72f5eb3dcae8286a919f6e9de3b0e707d30ad34551d08191e90f5080c374,818d0e5ffae3c30dfbe68dbb57896728bc1598d0094467bb378db4cb21a1774c,6063dd720b5db675d607ed7f56460d26c81407e349726c76a70f5e50afc75c1f,1965-10-25
2482517c61352f9578a40d4a8d34e5bf8509286247d32f69522d01cf9c0f24a6,fb7961d139e4da12af18c24571a166fb77c391a15abbf24d6ccf85186f441bbe,41bad2028983258e1e449786f8f62f909e603693c8a7400c93909b4a4e57e6ba,1953-03-23
6b7ee845d7c19c63f9729f3027128d6f6628dba34ec83d26e4e56b4670d7c2ff,4b48c39df4ae16d3202399ef9fbbfc4324d40440b4229893446e2ad49b2fa240,7fd63083c8e58f822822abda2c64be7f9e14e7a1b2890269c46d70d27017f208,1965-02-15
f9ae2dd68d4e4062a6e30ff5fbc773d1da24ac517de97f902ef85a918d86f05c,f58e4a64909caaa56bfb6c1f5bc5e5ffe98345f52a8233c47a7b7e1b4d9ed1ec,95e80701a725dc54d35422cb656e843c3d54a68e443bef622405944e54fa9deb,1979-08-09
5dfcf9ef1fb1ecbce32fefe37ae99aff68832a7e2ac74f52daa5a1bcd8038118,77217fd1834f951c2ac9a08ff64710a8940c3ce988fb5901b1dc85111abda3c2,ce2446cf8b35c4f4d0da5bf072696bcae56efc449b0ce322f45398d6888eeb4f,1959-03-13
893b1f8fc0fcc0587bb2f02fa8df1ca3039b9c8deae23935b6b3243e97e857a0,e20cc86f4bd18090703dd9fa530816d7a073dd7c543d259bf734a659abd58523,54eabb20e0743c61ce538d5337e93c0ed39114ee63b2e3d9a5ba9b6da4547973,1966-01-27
2fcc60c2ab3be068dc8bfa79d2431e62343435aaabb3ae8c45ae845306623b84,4d7b2cc54157ab5d6d894514924ebacdea5433898fde0ba516bc9c53e5a8f8a2,93291e3c7fd1bcd20fd2a430510ca4811a91de91ee1299f7f614654c84925d10,1956-08-11
ae42f8390d896d3e355315efe08fef67c64fdca09ec0152b88bef9499c9f3c11,502ae197407d7228d7b23234763529027e1da6a441301a6f6f405cdc62225208,e8cc9ce5089ad3b2af90e2b16647db065ad85c51dbdfb3ce0cfdbfcf9c352846,1963-10-27
