In [2]:
import random
import pandas as pd
import numpy as np 
import os 
import string
from more_itertools import random_permutation
import pickle
import traceback

In [4]:

data_path = "/mnt/d/query_to_insight/data/masked_data/masked data backup/25apr"


keys_outpath = './keys_to_decode'
outpath = '../../../data/masked_data/'

os.makedirs(keys_outpath,exist_ok=True)
os.makedirs(outpath,exist_ok=True)

In [4]:
def shuffle(list_to_shuffle):
    unique_entires = list_to_shuffle
    # shifting the unique_entries by 5 places
    encrypted_vals = unique_entires[-5:]+unique_entires[:-5]
    out = dict(zip(unique_entires,encrypted_vals))
    return out


def encrypt(df, cols_to_mask, numeric_random_mask=False, seed=10, mask_dict=False):
    try:
        if not mask_dict:
            mask_dict = {}
        
        # for numeric cols like IDs
        if numeric_random_mask:
            if not mask_dict:
                digits = '0123456789'
                if bool(seed):
                    random.seed(seed)
                # mask by random shuffle of digits
                key = ''.join(random_permutation(digits))
                for i in range(len(digits)):
                    mask_dict[digits[i]] = key[i]
            
            for col in cols_to_mask:
                df[col] = pd.to_numeric(df[col],errors='coerce').fillna(0).astype('int64')
                df[col] = df[col].astype(str)
                df[col] = df[col].apply(lambda word: ''.join([mask_dict[l] for l in word]))
        
        # for categoric cols like locations
        else:
            if not mask_dict:
                for key in list(cols_to_mask.keys()):
                    if key!='location':
                        mask_dict[key] = shuffle(list(df[cols_to_mask[key]].melt()['value'].unique()))
                    if key=='location':
                        priority1_cols = cols_to_mask[key]['p1']
                        unique_entires_p1 = list(df[priority1_cols].melt()['value'].unique())
                        shuffled_p1_dict = shuffle(unique_entires_p1)
                        
                        priority2_cols = cols_to_mask[key]['p2']
                        unique_entires_p2 = list(df[priority2_cols].melt()['value'].unique())
                        unique_entires_p2 = list(set(unique_entires_p2)-set(unique_entires_p1))
                        shuffled_p2_dict = shuffle(unique_entires_p2)
                        
                        # appending 2 dicts
                        mask_dict[key] = {**shuffled_p2_dict,**shuffled_p1_dict}
            
            for key in list(cols_to_mask.keys()):
                if key!='location':
                    for col in cols_to_mask[key]:
                        df[col] = df[col].map(mask_dict[key])
                else:
                    cols = sum([v for k,v in cols_to_mask[key].items()], [])
                    for col in cols:
                        df[col] = df[col].map(mask_dict[key])
    except Exception as e:
        print('keyerror: ',e)
        print(traceback.format_exc())
    return df, mask_dict



def decrypt(df, cols_to_mask, mask_dict, numeric_random_mask=False):
    decode = {}
    
    # for numeric cols like IDs
    if numeric_random_mask:
        decode = {v:k for k,v in mask_dict.items()}
        for col in cols_to_mask:
            df[col] = pd.to_numeric(df[col],errors='coerce').fillna(0).astype('int64')
            df[col] = df[col].astype(str)
            df[col] = df[col].apply(lambda word: ''.join([decode[l] for l in word]))
    
    # for categoric cols like locations
    else:
        for key in list(cols_to_mask.keys()):
            decode[key] = {v:k for k,v in mask_dict[key].items()}
            for col in cols_to_mask[key]:
                df[col] = df[col].map(decode[key])
    return df

In [5]:
x = {'a': 1, 'b': 2}
y = {'b': 3, 'c': 4}

{**x,**y}

{'a': 1, 'b': 3, 'c': 4}

In [7]:
stream_data = pd.read_csv(data_path)

cols_to_mask = {'location':{'p1':['source_location_name',  'pick_locationname'], 
                            'p2':['destination_location_name', 'drop_locationname']}, 
                'carrier':['carrier_name']}

num_cols_to_mask = ['dps_tm_load_id','dps_tripid']


temp = stream_data.copy()
masked_stream_data,mask_dict = encrypt(temp.copy(), cols_to_mask)

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'wb') as f:
    pickle.dump(mask_dict, f)

masked_stream_data_,num_mask_dict = encrypt(masked_stream_data, num_cols_to_mask, numeric_random_mask=True)

with open(os.path.join(keys_outpath,'num_cols.pkl'), 'wb') as f:
    pickle.dump(mask_dict, f)

for col in ['destination_location_name', 'source_location_name','pick_locationname','drop_locationname','carrier_name','sap_material_description']:
    if col in masked_stream_data_.columns:
        masked_stream_data_[col] = masked_stream_data_[col].astype(str).apply(lambda x: x.replace('FRITO LAY', 'CHIPS_'))
        masked_stream_data_[col] = masked_stream_data_[col].astype(str).apply(lambda x: x.replace('FRITOLAY', 'CHIPS'))
        masked_stream_data_[col] = masked_stream_data_[col].astype(str).apply(lambda x: x.replace('FRITO', 'CHIP'))
        
    

masked_stream_data_.to_excel(os.path.join(outpath,'masked_stream_data.xlsx'), index=False)

In [7]:
invoice_data_file = "masked_invoice_data_11apr.csv"
# prod_data_file = "masked_product_data_11apr.csv"
# desc_data_file = "material_descriptions_masked.csv"

invoice_data = pd.read_csv(os.path.join(data_path, invoice_data_file))
# prod_data = pd.read_csv(os.path.join(data_path, prod_data_file))
# desc_data = pd.read_csv(os.path.join(data_path, desc_data_file))


# location masking
unique_vals = invoice_data[['source_location_name', 'destination_location_name']].melt()['value'].unique()
masked_val = [f"Location {x}" for x in range(len(unique_vals))]
location_masking_dict = dict(zip(unique_vals, masked_val))

# carrier_name masking
unique_vals = invoice_data['carrier_name'].unique()
masked_val = [f"Carrier {x}" for x in range(len(unique_vals))]
carrier_masking_dict = dict(zip(unique_vals, masked_val))

# masking
invoice_data['source_location_name'] = invoice_data['source_location_name'].map(location_masking_dict)
invoice_data['destination_location_name'] = invoice_data['destination_location_name'].map(location_masking_dict)
invoice_data['carrier_name'] = invoice_data['carrier_name'].map(carrier_masking_dict)

invoice_data.to_csv(os.path.join(outpath, invoice_data_file), index=False)

pd.DataFrame(location_masking_dict, index=['masked_value']).T.reset_index().to_csv(os.path.join(keys_outpath, "location_masking_keys.csv"), index=False)
pd.DataFrame(carrier_masking_dict, index=['masked_value']).T.reset_index().to_csv(os.path.join(keys_outpath, "carrier_masking_keys.csv"), index=False)


invoice_data[['destination_location_name', 'source_location_name','carrier_name']].head()


Unnamed: 0,destination_location_name,source_location_name,carrier_name
0,Location 96,Location 0,Carrier 0
1,Location 29,Location 1,Carrier 1
2,Location 97,Location 2,Carrier 2
3,Location 98,Location 2,Carrier 2
4,Location 99,Location 0,Carrier 0


Update mask dict with model ready data

In [15]:
def add_new_entried_to_maskdict(old_df, new_df, mask_dict, cols_to_mask):
    for key in list(cols_to_mask.keys()):
        if key=='location':
            priority1_cols = cols_to_mask[key]['p1']
            old_unique_entires_p1 = list(old_df[priority1_cols].melt()['value'].dropna().unique())
            new_unique_entires_p1 = list(new_df[priority1_cols].melt()['value'].dropna().unique())
            newly_added_p1 = list(set(new_unique_entires_p1)-set(old_unique_entires_p1))
            shuffled_newly_added_p1 = shuffle(newly_added_p1)
            
            priority2_cols = cols_to_mask[key]['p2']
            old_unique_entires_p2 = list(old_df[priority2_cols].melt()['value'].dropna().unique())
            new_unique_entires_p2 = list(new_df[priority2_cols].melt()['value'].dropna().unique())
            newly_added_p2 = list(set(new_unique_entires_p2)-set(old_unique_entires_p2))
            newly_added_p2 = list(set(newly_added_p2)-set(newly_added_p1))
            shuffled_newly_added_p2 = shuffle(newly_added_p2)
            
            # appending 2 dicts
            appended = {**shuffled_newly_added_p2,**shuffled_newly_added_p1}
            mask_dict[key] = {**appended,**mask_dict[key]}
        else:
            new_unique_entires = list(new_df[cols_to_mask[key]].melt()['value'].dropna().unique())
            old_unique_entires = list(mask_dict[key].keys())
            newly_added = list(set(new_unique_entires)-set(old_unique_entires))
            shuffled_newly_added = shuffle(newly_added)
            mask_dict[key] = {**shuffled_newly_added,**mask_dict[key]}

    
    return mask_dict


In [39]:
model_data = pd.read_csv(model_ready_data)
total_trip_data = pd.read_excel(transport, sheet_name="Total_Trip_Data")

num_cols_to_mask = ['dps_tm_load_id','dps_tripid']
cols_to_mask = {'location':{'p1':['source_location_name',  'pick_locationname'], 
                            'p2':['destination_location_name', 'drop_locationname']}, 
                'carrier':['carrier_name']}


with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)
with open(os.path.join(keys_outpath,'num_cols.pkl'), 'rb') as f:
    num_mask_dict = pickle.load(f)

new_mask_dict = add_new_entried_to_maskdict(total_trip_data, model_data, mask_dict=mask_dict.copy(), cols_to_mask=cols_to_mask)


with open(os.path.join(keys_outpath,'cat_cols_with_model_data.pkl'), 'wb') as f:
    pickle.dump(mask_dict, f)
    
for key in list(cols_to_mask.keys()):
    if key!='location':
        for col in cols_to_mask[key]:
            model_data[col] = model_data[col].map(new_mask_dict[key])
    else:
        cols = sum([v for k,v in cols_to_mask[key].items()], [])
        for col in cols:
            model_data[col] = model_data[col].map(new_mask_dict[key])

model_data,num_mask_dict = encrypt(model_data, num_cols_to_mask, mask_dict=num_mask_dict, numeric_random_mask=True)
    
model_data.to_csv(os.path.join(outpath,'masked_model_data.csv'), index=False)


  total_trip_data = pd.read_excel(transport, sheet_name="Total_Trip_Data")


In [40]:
len(new_mask_dict['location'].keys()),len(mask_dict['location'].keys())

(659, 647)

In [41]:
aa = pd.DataFrame(new_mask_dict['location'], index=[0]).T.merge(pd.DataFrame(mask_dict['location'], index=[1]).T,left_index=True, right_index=True, how='left')
aa[aa[0]!=aa[1]]

Unnamed: 0,0,1
FRITO LAY DC BATON ROUGE DC550,SCOTT DEPOT DC,
FRITO LAY DC SCOTT DEPOT,MENASHA,
MOUNTAIN HOME ID RELAY,FRITO LAY DC BATON ROUGE DC550,
SCOTT DEPOT DC,FRANKFORT PLANT,
FRITO LAY DC TULSA,FRITO LAY DC SCOTT DEPOT,
MENASHA,FRITO LAY PL FRANKFORT CORE IN,
TULSA DC,WESTROCK CP LLC,
SOUTH BATON ROUGE DC550,MOUNTAIN HOME ID RELAY,
FRITO LAY CP TECH ECOMM,BELTON DC,
ALLEN SW CUSTOM,FRITO LAY RP DGN MARKETING,


Testing mask_dict

In [11]:
priority1_cols = cols_to_mask['location']['p1']
unique_entires_p1 = list(masked_total_trip_data_[priority1_cols].melt()['value'].unique())

priority2_cols = cols_to_mask['location']['p2']
unique_entires_p2 = list(masked_total_trip_data_[priority2_cols].melt()['value'].unique())
unique_entires_p2 = list(set(unique_entires_p2)-set(unique_entires_p1))

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)
    
c = 0
for k,v in mask_dict['location'].items():
    if v in (unique_entires_p1):
        if k in (unique_entires_p2):
            c=c+1
    if v in (unique_entires_p2):
        if k in (unique_entires_p1):
            c=c+1
    
c     

0

In [14]:
sample_completed_trips = pd.read_excel(sample_trips)

cols_to_mask = {'location':{'p1':['source_location_name',  'pick_locationname'], 
                            'p2':['destination_location_name', 'drop_locationname']}, 
                'carrier':['carrier_name']}

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)

masked_sample_completed_trips,temp = encrypt(sample_completed_trips, cols_to_mask, mask_dict=mask_dict)
masked_sample_completed_trips_,mask_dict = encrypt(masked_sample_completed_trips, num_cols_to_mask, numeric_random_mask=True)
masked_sample_completed_trips_.to_excel(os.path.join(outpath,'masked_sample_completed_trips.xlsx'), index=False)

In [15]:
material_data = pd.read_excel(transport, sheet_name="Material_Data")

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)

cols_to_mask = {'carrier':['carrier_name']}
num_cols_to_mask = ['sap_tm_load_id']

masked_material_data,temp = encrypt(material_data, cols_to_mask, mask_dict=mask_dict)
masked_material_data_,mask_dict = encrypt(masked_material_data, num_cols_to_mask, numeric_random_mask=True)
masked_material_data_.to_excel(os.path.join(outpath,'masked_material_data.xlsx'), index=False)

Test 

Decrypting the masked data

In [12]:
test_dec = pd.read_excel("masked_data\masked_total_trip_data.xlsx")

# cols_to_mask = {'carrier':['carrier_name']}
# num_cols_to_mask = ['sap_tm_load_id']
cols_to_mask = {'location':['source_location_name', 'destination_location_name', 'pick_locationname', 'drop_locationname'], 
                'carrier':['carrier_name']}
num_cols_to_mask = ['dps_tm_load_id','dps_tripid']

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)
test_dec_ = decrypt(test_dec, cols_to_mask, mask_dict, numeric_random_mask=False)

with open(os.path.join(keys_outpath,'num_cols.pkl'), 'rb') as f:
    num_mask_dict = pickle.load(f)
test_dec__ = decrypt(test_dec_, num_cols_to_mask, num_mask_dict, numeric_random_mask=True)

test_dec__.to_excel(os.path.join(outpath,'test_masked_total_trip_data.xlsx'), index=False)

In [17]:
test_dec = pd.read_excel("masked_data\masked_sample_completed_trips.xlsx")

# cols_to_mask = {'carrier':['carrier_name']}
# num_cols_to_mask = ['sap_tm_load_id']
cols_to_mask = {'location':['source_location_name', 'destination_location_name', 'pick_locationname', 'drop_locationname'], 
                'carrier':['carrier_name']}
num_cols_to_mask = ['dps_tm_load_id','dps_tripid']

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)
test_dec_ = decrypt(test_dec, cols_to_mask, mask_dict, numeric_random_mask=False)

with open(os.path.join(keys_outpath,'num_cols.pkl'), 'rb') as f:
    num_mask_dict = pickle.load(f)
test_dec__ = decrypt(test_dec_, num_cols_to_mask, num_mask_dict, numeric_random_mask=True)

test_dec__.to_excel(os.path.join(outpath,'test_masked_sample_completed_trips.xlsx'), index=False)

In [None]:
test_dec = pd.read_excel("masked_data\masked_material_data.xlsx")

cols_to_mask = {'carrier':['carrier_name']}
num_cols_to_mask = ['sap_tm_load_id']

with open(os.path.join(keys_outpath,'cat_cols.pkl'), 'rb') as f:
    mask_dict = pickle.load(f)
test_dec_ = decrypt(test_dec, cols_to_mask, mask_dict, numeric_random_mask=False)

with open(os.path.join(keys_outpath,'num_cols.pkl'), 'rb') as f:
    num_mask_dict = pickle.load(f)
test_dec__ = decrypt(test_dec_, num_cols_to_mask, num_mask_dict, numeric_random_mask=True)

test_dec__.to_excel(os.path.join(outpath,'test_masked_material_data.xlsx'), index=False)