# GDPR implementation using pseudonymization and data masking

> Prerequisite:
> - A mount point already created for an ADLS Gen2 file system with the name: /mnt/gdpr
> - Download an open-source dataset from Kaggle to obtain the user data: https://www.kaggle.com/omercolakoglu/10m-rows-fake-turkish-names-and-address-dataset (the data is in Excel format; convert it to CSV before loading it into the delta table)

## Store the raw data into a delta table [gdpr.raw_customer_data] with a Pseudo Key.

### read the raw data

In [0]:
# List the raw files available under the GDPR mount point.
dbutils.fs.ls("/mnt/gdpr/rawfile")

In [0]:
# Load the customer CSV export; the first row holds the column names.
raw_csv_path = "dbfs:/mnt/gdpr/rawfile/Customers_1M_Rows.csv"
df_csv = spark.read.option("header", True).csv(raw_csv_path)

### Pseudonymize the email address
> - Here we assume that the email address uniquely identifies a customer, and hence the pseudo key will uniquely identify the customer as well.

In [0]:
import pyspark.sql.functions as F

# The pseudo key is the hex-encoded SHA-256 digest of the email column.
# NOTE(review): the digest is unsalted, so anyone holding a candidate email can
# recompute the same pseudo id - confirm this is acceptable for the use case.
email_digest = F.sha2(F.col("email"), 256)
df = df_csv.withColumn("customer_pseudo_id", email_digest)


### write the customer table [gdpr.raw_customer_data] into the delta lake

In [0]:

# Storage location backing the gdpr.raw_customer_data delta table.
raw_customer_path = "/mnt/gdpr/deltalake/gdpr/raw_customer_data"

spark.sql('''create schema if not exists gdpr ''')

# Start from a clean slate: remove the old files, then the old table definition.
dbutils.fs.rm(raw_customer_path, recurse=True)
spark.sql('''drop table if exists gdpr.raw_customer_data''')

# Persist the pseudonymized dataframe in delta format ...
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(raw_customer_path)
)
# ... and register a table on top of that location.
spark.sql(f'''create table gdpr.raw_customer_data using delta location "{raw_customer_path}"''')

In [0]:
%sql
-- Preview the clear-text email next to the pseudo id derived from it.
SELECT ID, Email, customer_pseudo_id
FROM gdpr.raw_customer_data

ID,Email,customer_pseudo_id
1,mel_ozipek@fakeyahoo.com,1e42d957fb4ed5ac23fc5cc69b0790523a322b4cfcbd51028924708c15c75b3d
2,nur_zara@fakeyahoo.com,49f59d8afc59b4fded60b19eac7267a6c7496fbc990066cc3c6cb559de0e4555
3,ser_ozalvuc@fakelive.com,4666e8f6431be0c0f80a8486d253e47fd381c179478300f999708e0ce1bbc872
4,ela_cetinturk@fakelive.com,30cd5fb28bb85af594441dd67a2e016db07042d5f26cc160211b1c5f8d7b7e66
5,elm_okkaci@fakeoutlook.com,9cb099a5541148c2537d1cdac0d115fc1c112c606701f9896150877ce7fa0bdc
6,zaf_saban@fakeyahoo.com,0234634c44ccf117500bbd373d4cc21b564f8a29daad0d6871f4996ff50a6938
7,seb_ulucay@fakegmail.com,00c125855410443bf6d2a02faaf7a7d67af0b761e8c0357f460b3245052e46d1
8,cem_butev@fakeyahoo.com,2af70c693f6bde0f384580560235ac63f0c4f4759e63c1a071f7b2a4800395e2
9,ale_kuyucuoglu@fakeyahoo.com,e131a6742c2d4843266c62b2d496f73f554ed3ec34b01fe5cc6bf5820311fa3d
10,ugu_ozgazi@fakegmail.com,5e0b6d466123d7ee2431870e85f707d88e926d6627779c447596b7902c0d1dfe


## Data Masking

### Generating the Cryptographic Key

> - Run the commands below in a local environment to set up the key. The key can be generated on any system.
> - First upgrade the pip to the latest version using the command: python -m pip install --upgrade pip 
> - pip install fernet
> - pip install cryptography
> - Generate a key in the local environment 

```
from cryptography.fernet import Fernet
key = Fernet.generate_key()
```

In [0]:
# User-defined function that creates a new Fernet encryption key
def generate_encrypt_key():
    """Return a newly generated Fernet key decoded to a UTF-8 string.

    The value comes from ``Fernet.generate_key()``, so two calls do not
    return the same key.
    """
    from cryptography.fernet import Fernet
    return Fernet.generate_key().decode("utf-8")


import pyspark.sql.functions as F

# Register the SQL function as nondeterministic: Spark treats UDFs as
# deterministic by default and may then collapse or repeat invocations, which
# is wrong for a function that returns a different key on every call.
spark.udf.register(
    "generate_key_using_Fernet",
    F.udf(generate_encrypt_key, "string").asNondeterministic(),
)

### The gdpr.encryption_keys table keeps the mapping between the customer ID and the encryption keys

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

# Build the UDF through F.udf: the bare name `udf` is only imported in a later
# cell, so relying on it here fails on a fresh kernel.
# The UDF is marked nondeterministic because generate_encrypt_key returns a new
# key on every call; Spark otherwise assumes UDFs are deterministic and may
# eliminate or repeat invocations.
generate_key_using_Fernet = F.udf(generate_encrypt_key, StringType()).asNondeterministic()

# Storage location backing the gdpr.encryption_keys delta table.
encryption_keys_path = "/mnt/gdpr/deltalake/gdpr/encryption_keys"

# One row per distinct customer ID, each with its own generated key.
# NOTE(review): the keys are stored in the same lake and schema as the data
# they protect - confirm access to this table is restricted.
df_distinct_record = (
    spark.sql('''select distinct ID from gdpr.raw_customer_data''')
    .withColumn("encryption_key", generate_key_using_Fernet())
)

# Recreate the table from scratch: old files first, then the table definition.
dbutils.fs.rm(encryption_keys_path, recurse=True)
spark.sql('''drop table if exists gdpr.encryption_keys''')
df_distinct_record.write.format("delta").mode("overwrite").save(encryption_keys_path)
spark.sql(f'''create table gdpr.encryption_keys using delta location "{encryption_keys_path}"''')

In [0]:
%sql
-- Inspect the customer ID -> encryption key mapping.
-- NOTE(review): this renders the keys in the notebook output; clear the output before sharing.
SELECT *
FROM gdpr.encryption_keys

ID,encryption_key
296,zK9jKvKgxb5jh1ScPryLy-wpNTc_LIYwy--gomBHuOM=
467,dc0PnQBQGRQ-vGLsS3LqxsRPLyrz-W2N3L__5uZfq0o=
675,D6SaVSu4lh__khyqEE9rZ8AOTCnxpVCkjr1a0537BUw=
691,X19wBNbmJHM8JwUbzX4oPtMhtbmX_qikU1wmvmWynTs=
829,sdHT33J_24sMriLp3MsTkbcG8educGD1NtpXvni_hVo=
1090,ixa8FQZFA61hBmkWiw8TIxttQ_gHU63Vw5TTOBxLNE4=
1159,Fj5drce2NtIOIKk3ft3AKzSo9AA6Y2NUm2Ezq8_QZyo=
1436,8KdUyLQ6xqAWW5fhVqEFvJr8LqVi-yl2-LCAT_VSOLA=
1512,mUJDKmdvlqb62F0H0nH_wAgNVSzR3-hDsWyiuhiPSCo=
1572,tiq9vEnE59rUuukQtPIrP6gEjxr-UTRaBG7-D34giRA=


### create the spark UDF to encrypt and decrypt the column.

In [0]:
# Define Encrypt User Defined Function
def encrypt_val(clear_text, MASTER_KEY):
    """Encrypt ``clear_text`` with the Fernet key ``MASTER_KEY``.

    Args:
        clear_text: String to encrypt, or None.
        MASTER_KEY: Key passed as-is to ``Fernet(...)``.

    Returns:
        The encrypted token as a str, or None when ``clear_text`` is None.
    """
    # Pass None through instead of raising on bytes(None, ...); this mirrors
    # the null passthrough of the Scala udfDecrypt in this notebook.
    if clear_text is None:
        return None
    from cryptography.fernet import Fernet
    token = Fernet(MASTER_KEY).encrypt(clear_text.encode("utf-8"))
    return token.decode("ascii")

# Define decrypt user defined function
def decrypt_val(cipher_text, MASTER_KEY):
    """Decrypt the Fernet token ``cipher_text`` with the key ``MASTER_KEY``.

    Args:
        cipher_text: Token string as produced by ``encrypt_val``, or None.
        MASTER_KEY: Key passed as-is to ``Fernet(...)``.

    Returns:
        The decrypted text as a str, or None when ``cipher_text`` is None.
    """
    # Pass None through instead of raising on None.encode(); this mirrors the
    # null passthrough of the Scala udfDecrypt in this notebook.
    if cipher_text is None:
        return None
    from cryptography.fernet import Fernet
    return Fernet(MASTER_KEY).decrypt(cipher_text.encode()).decode()
# Make decrypt_val callable from SQL cells under the same name.
spark.udf.register(name="decrypt_val", f=decrypt_val)

### Encryption

- We are going to encrypt the column Email.

In [0]:
from pyspark.sql.functions import udf, lit, md5, col
from pyspark.sql.types import StringType

# Wrap the plain Python helpers as Spark column functions returning strings.
encrypt = udf(encrypt_val, StringType())
decrypt = udf(decrypt_val, StringType())

# Pair every customer row with the key stored for the same ID.
df = spark.sql('''select a.*,e.encryption_key from gdpr.raw_customer_data as a 
inner join gdpr.encryption_keys as e on e.ID=a.ID''')

# Swap the email for its encrypted token, then drop the key column so it is
# not written next to the data.
email_token = encrypt(col("EMAIL"), col("encryption_Key"))
encrypted = df.withColumn("EMAIL", email_token).drop("encryption_Key")
# To inspect a sample before saving: display(encrypted.limit(10))

# Overwrite the source table with the encrypted rows.
# NOTE(review): this cell reads and overwrites the same table, so running it a
# second time encrypts the already-encrypted EMAIL values again.
(
    encrypted.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gdpr.raw_customer_data")
)

### masked data

In [0]:
%sql
-- Same preview as before the encryption step: Email now holds the encrypted token.
SELECT ID, Email, customer_pseudo_id
FROM gdpr.raw_customer_data

ID,Email,customer_pseudo_id
100010,gAAAAABhK8pRcX0sMkJO9yXxsJ4lQHJ8k5DB9G4sis27h9SmvvZirqS2Av-E7gbySqS6BAXPZ4vexnWgo10fbzGUiYyhaDkpdNDDFQ5qh6g18f0L11IT4IBUMtDb1RBELqjbVpDgYCHp,ac6d8801cb69b7ea3691dc8a531117169fb5994589c9574d78db27e331e1d513
100014,gAAAAABhK8pRtrDk_fUl2qfdNiRgPshU6awBJg5zPI9gkSfigoxcSQnT_lprzqvpAQ6tFpJvEoL5-RIG4eUx4pMK0LYHL06tcRm87IJ0XguOrYbvZ00hlq4=,a5beb51bda2bd813a8b6a3d47f5b07965b5181b914235d704406e18a3942784e
100021,gAAAAABhK8pRkf4qH9Tw-hE2g268y_gRCYKSwy0_W5LoiCqmnGnJK3c3T_ZGxAOAPSfNv30tWioOZULDCIt5wJMHm6Qz1JceOK8zAIK94b7H5xkyRHdDKHI=,b307b451b3c021a44168474800498b0333a63d1b8b61b63a5da581e8d151c5a6
100062,gAAAAABhK8pRLRk6AukbeWaJ-X2itcdREBVGFOEkX0rYaU3noHBF9fraCxboLICrmYkE3vT1tCBEqc7DGyadKZNlcPqKfck6otlRdMM0mj-uIJuJ_5CXOW0=,556053a03989b6738bbc5545c8db95774ad9ee6f96c2789573d083c3452ff756
100070,gAAAAABhK8pR3gOYbrsBgN2bF6iEAzMhMp0vEUPp-eHUCTSONP8uNAdzGxY43J435gpoBCW_P4v6URwWIfF87sgl0Z7vLrfzk1D9RmJ8ZDRc_RGPipMj_3k=,b4b737890b7c2a1bfe1ec7feba3bce0a64b5247a1a69849b73fe6e0a7a67b8cd
100090,gAAAAABhK8pRqoCc4A9pMWA7kHpqhkHGnu_LMGpjCKxPcc_WLZk3h9JuZKjXJGB-0_LDkraLDXuLhV48ZeCHRobbfxOlLzUU6wtMc4_2vRkaNU6_MX71Jsg=,370217094dd3cd25f8ceaeabc0f9f7b9ef0bb35ce1abb51639754fc24b0866a8
10010,gAAAAABhK8pRs26oTfnCjfA90KopxyshOK3GSU7UOKy7ETKWiU5IW_kuBUqeMM3LPMWaaaYEOVEjl-07BJ7lQRUcgJ1UKS0sO1FVaYsY3v7GhXeCpNRTv6c=,4e8d2496b66f9a80f454bedd963351ece8534f8163587d5ac512718b0d99b6f5
100102,gAAAAABhK8pR7QZGGriB1N3y_TpAGEaISdLP6AG3dnXAdMZeGVT02QTKFRLLS9uDF0zE4M5M3C5rGtZwusgnGb1A9CTeaszjLTdrwU2-HXr9_dAELjIR0ww=,2839d1145db56bf04f9e5f9acc77b333f5a894f9bf37b9418e942f80758b36e2
10012,gAAAAABhK8pRYv8NsSMJCIOpbwb6UFLdEG9U_hZ7_whUNjZdDvseY96onvvx1lOIzGzfLXe8uWGza-vViPVamMsTMNt8qJG1e2Kv6fSFZWAg2dvSSSXGPpk=,3807c0cef8ee8a2bfb8a9ab1c2d183ce8c8e81b8676dd459345de71a95696769
100128,gAAAAABhK8pROhZPVCL89vJKD5fgOGNMsH3pAMjkP6EtiieZyVedBTxmHzOM6mJrqFKALg_daZRSjhyqA8qKFReSxU6O9d99g54ROCMrF1K5DfqNOhqSG7o=,c85d5d406e6afa403341c89b6ea59e2c451d081a64e224d187c76127a5f605cc


### Decrypt the data

#### using pyspark

In [0]:
# Re-attach the key stored for each customer ID to the encrypted rows.
encrypted = spark.sql('''select a.*,e.encryption_key from gdpr.raw_customer_data as a 
inner join gdpr.encryption_keys as e on e.ID=a.ID''')

# Decrypt EMAIL with the row's own key and drop the key column afterwards.
email_clear = decrypt(col("EMAIL"), col("encryption_Key"))
decrypted = encrypted.withColumn("EMAIL", email_clear).drop("encryption_Key")

display(decrypted.select("ID", "EMAIL", "customer_pseudo_id"))

ID,EMAIL,customer_pseudo_id
100010,ser_turkbay@fakegmail.com@fakeyahoo.com,ac6d8801cb69b7ea3691dc8a531117169fb5994589c9574d78db27e331e1d513
100014,nis_akcealan@fakeoutlook.com,a5beb51bda2bd813a8b6a3d47f5b07965b5181b914235d704406e18a3942784e
100021,ayk_cevirme@fakelive.com,b307b451b3c021a44168474800498b0333a63d1b8b61b63a5da581e8d151c5a6
100062,erd_bayburtlu@fakeyahoo.com,556053a03989b6738bbc5545c8db95774ad9ee6f96c2789573d083c3452ff756
100070,zey_akboga@fakehotmail.com,b4b737890b7c2a1bfe1ec7feba3bce0a64b5247a1a69849b73fe6e0a7a67b8cd
100090,onu_cogalirlar@fakehotmail.com,370217094dd3cd25f8ceaeabc0f9f7b9ef0bb35ce1abb51639754fc24b0866a8
10010,sev_inan@fakehotmail.com,4e8d2496b66f9a80f454bedd963351ece8534f8163587d5ac512718b0d99b6f5
100102,tar_kirilmaz@fakeyahoo.com,2839d1145db56bf04f9e5f9acc77b333f5a894f9bf37b9418e942f80758b36e2
10012,bus_senturk@fakehotmail.com,3807c0cef8ee8a2bfb8a9ab1c2d183ce8c8e81b8676dd459345de71a95696769
100128,nur_bezci@fakeyahoo.com,c85d5d406e6afa403341c89b6ea59e2c451d081a64e224d187c76127a5f605cc


#### using databricks sql

In [0]:
%sql
-- Decrypt EMAIL with the decrypt_val function registered from Python.
SELECT
  a.ID,
  decrypt_val(a.EMAIL, e.encryption_Key) AS email,
  a.customer_pseudo_id
FROM gdpr.raw_customer_data AS a
INNER JOIN gdpr.encryption_keys AS e
  ON e.ID = a.ID

ID,email,customer_pseudo_id
100010,ser_turkbay@fakegmail.com@fakeyahoo.com,ac6d8801cb69b7ea3691dc8a531117169fb5994589c9574d78db27e331e1d513
100014,nis_akcealan@fakeoutlook.com,a5beb51bda2bd813a8b6a3d47f5b07965b5181b914235d704406e18a3942784e
100021,ayk_cevirme@fakelive.com,b307b451b3c021a44168474800498b0333a63d1b8b61b63a5da581e8d151c5a6
100062,erd_bayburtlu@fakeyahoo.com,556053a03989b6738bbc5545c8db95774ad9ee6f96c2789573d083c3452ff756
100070,zey_akboga@fakehotmail.com,b4b737890b7c2a1bfe1ec7feba3bce0a64b5247a1a69849b73fe6e0a7a67b8cd
100090,onu_cogalirlar@fakehotmail.com,370217094dd3cd25f8ceaeabc0f9f7b9ef0bb35ce1abb51639754fc24b0866a8
10010,sev_inan@fakehotmail.com,4e8d2496b66f9a80f454bedd963351ece8534f8163587d5ac512718b0d99b6f5
100102,tar_kirilmaz@fakeyahoo.com,2839d1145db56bf04f9e5f9acc77b333f5a894f9bf37b9418e942f80758b36e2
10012,bus_senturk@fakehotmail.com,3807c0cef8ee8a2bfb8a9ab1c2d183ce8c8e81b8676dd459345de71a95696769
100128,nur_bezci@fakeyahoo.com,c85d5d406e6afa403341c89b6ea59e2c451d081a64e224d187c76127a5f605cc


### Build the Hive function. 
> We would like to create a persistent view for the admins, so that they can see the actual email address whenever required. A Databricks function registered from a notebook is session-scoped and does not persist across sessions. Because of that, we need to create a Hive function to use in the view.

- You can use VS Code to create the Hive function.
Here is the folder structure:
```
│   build.sbt
├───src
│   └───main
│       └───scala
│               decryptUDF.scala
```

- Build the Scala package with the ```sbt package``` command.
- Upload the jar to the Databricks cluster.
- Copy the jar path from the cluster; it is needed to register the Hive function.
- Install the Maven package for Fernet (Java) on the Databricks cluster: "com.macasaet.fernet:fernet-java8:1.5.0"

In [0]:
%scala
// Content of the file decryptUDF.scala (it is not required to run it here).
// The %scala magic has to be the first line of the cell, and Scala comments
// start with `//`, not `#`.

import com.macasaet.fernet.{Key, StringValidator, Token}
import org.apache.hadoop.hive.ql.exec.UDF
import java.time.{Duration, Instant}

/** String validator whose time-to-live is set to `Instant.MAX` epoch seconds. */
class Validator extends StringValidator {

  override def getTimeToLive(): java.time.temporal.TemporalAmount =
    Duration.ofSeconds(Instant.MAX.getEpochSecond())
}

/** Hive UDF that decrypts a Fernet token with the supplied key. */
class udfDecrypt extends UDF {

  /**
   * @param inputVal Fernet token to decrypt; null and empty values are returned unchanged
   * @param sparkKey Fernet key used to build the `Key`
   * @return the decrypted payload, or `inputVal` itself when it is null or empty
   */
  def evaluate(inputVal: String, sparkKey: String): String = {
    if (inputVal == null || inputVal == "") {
      // Nothing to decrypt: hand the value back as-is.
      inputVal
    } else {
      val key: Key = new Key(sparkKey)
      val token = Token.fromString(inputVal)
      token.validateAndDecrypt(key, new Validator())
    }
  }
}

In [0]:
# Content of the file build.sbt, shown for reference only (it is not required to trigger it here).
# It declares the decryptUDF project, its Scala version, and its two library
# dependencies: hive-exec and fernet-java8.

name := "decryptUDF"
version := "1.0"
scalaVersion := "2.12.10"
libraryDependencies += "org.apache.hive" % "hive-exec" % "0.13.1"
libraryDependencies += "com.macasaet.fernet" % "fernet-java8" % "1.5.0"

### Create the views for the normal user and the admin user. We will segregate access using ACLs

In [0]:
%sql
-- (Re)create the udfPIIDecrypt function from the udfDecrypt class in the uploaded jar.
DROP FUNCTION IF EXISTS udfPIIDecrypt;
CREATE FUNCTION IF NOT EXISTS udfPIIDecrypt
AS 'udfDecrypt'
USING JAR 'dbfs:/FileStore/jars/be50d23a_6c5f_4f8b_9150_5462f989342e-decryptudf_2_12_1_0-5d7b8.jar' -- the jar file location in the cluster

In [0]:
%sql
-- Admin schema and view: EMAIL is decrypted through udfPIIDecrypt with the key stored for the same ID.
CREATE SCHEMA IF NOT EXISTS gdpr_admin;
DROP VIEW IF EXISTS gdpr_admin.Test_Encryption_PII_for_admins_v2;
CREATE VIEW gdpr_admin.Test_Encryption_PII_for_admins_v2 AS
SELECT
  a.ID,
  a.NAME_,
  a.SURNAME,
  a.NAMESURNAME,
  a.GENDER,
  a.BIRTHDATE,
  udfPIIDecrypt(a.EMAIL, e.encryption_Key) AS EMAIL,
  a.customer_pseudo_id
FROM gdpr.raw_customer_data AS a
INNER JOIN gdpr.encryption_keys AS e
  ON e.ID = a.ID

In [0]:
%sql
-- Read the admin view; EMAIL is returned decrypted.
SELECT *
FROM gdpr_admin.Test_Encryption_PII_for_admins_v2

ID,NAME_,SURNAME,NAMESURNAME,GENDER,BIRTHDATE,EMAIL,customer_pseudo_id
100010,Serdar,TÜRKBAY,Serdar TÜRKBAY,E,1989-09-17,ser_turkbay@fakegmail.com@fakeyahoo.com,ac6d8801cb69b7ea3691dc8a531117169fb5994589c9574d78db27e331e1d513
100014,Nisanur,AKÇEALAN,Nisanur AKÇEALAN,K,1965-02-05,nis_akcealan@fakeoutlook.com,a5beb51bda2bd813a8b6a3d47f5b07965b5181b914235d704406e18a3942784e
100021,Aykut,ÇEVİRME,Aykut ÇEVİRME,E,1985-07-31,ayk_cevirme@fakelive.com,b307b451b3c021a44168474800498b0333a63d1b8b61b63a5da581e8d151c5a6
100062,Erdal,BAYBURTLU,Erdal BAYBURTLU,E,1968-04-26,erd_bayburtlu@fakeyahoo.com,556053a03989b6738bbc5545c8db95774ad9ee6f96c2789573d083c3452ff756
100070,Zeynep Naz,AKBOĞA,Zeynep Naz AKBOĞA,K,1966-09-21,zey_akboga@fakehotmail.com,b4b737890b7c2a1bfe1ec7feba3bce0a64b5247a1a69849b73fe6e0a7a67b8cd
100090,Onur,ÇOĞALIRLAR,Onur ÇOĞALIRLAR,E,1991-03-22,onu_cogalirlar@fakehotmail.com,370217094dd3cd25f8ceaeabc0f9f7b9ef0bb35ce1abb51639754fc24b0866a8
10010,Sevil Meltem,INAN,Sevil Meltem INAN,K,1996-06-09,sev_inan@fakehotmail.com,4e8d2496b66f9a80f454bedd963351ece8534f8163587d5ac512718b0d99b6f5
100102,Tarık,KIRILMAZ,Tarık KIRILMAZ,E,1981-01-14,tar_kirilmaz@fakeyahoo.com,2839d1145db56bf04f9e5f9acc77b333f5a894f9bf37b9418e942f80758b36e2
10012,Büşra,ŞENTÜRK,Büşra ŞENTÜRK,K,1983-08-14,bus_senturk@fakehotmail.com,3807c0cef8ee8a2bfb8a9ab1c2d183ce8c8e81b8676dd459345de71a95696769
100128,Nurcan,BEZCİ,Nurcan BEZCİ,K,1958-11-19,nur_bezci@fakeyahoo.com,c85d5d406e6afa403341c89b6ea59e2c451d081a64e224d187c76127a5f605cc
