In [11]:

import pandas as pd
import pyzipper
import time
import threading
import getpass
import os
import hashlib
import random
import string
from typing import List


class Zipndel:
    """Write a DataFrame to a file, wrap it in a password-protected zip and delete the plaintext.

    The constructor only stores its arguments. ``compliance_check`` and
    ``audit_trail`` are stored but not read by any method of this class.
    """

    def __init__(self, file_name='df', file_format='csv', self_destruct_time=(672, 0, 0), password=None, encryption_algorithm='AES', mask_columns=None, anonymize_columns=None, compliance_check=False, audit_trail=False):
        """Store the configuration.

        Args:
            file_name: Name of the intermediate plaintext file; the archive is ``<file_name>.zip``.
            file_format: Suffix of the DataFrame writer to use (``to_<file_format>``).
            self_destruct_time: ``(hours, minutes, seconds)`` until the archive is deleted,
                or a falsy value to disable self-destruction.
            password: Archive password; prompted for in ``zipit`` when ``None``.
            encryption_algorithm: Suffix of the pyzipper constant ``WZ_<encryption_algorithm>``.
            mask_columns: Columns to replace with SHA-256 hex digests, or ``None``.
            anonymize_columns: Columns to replace with random strings, or ``None``.
            compliance_check: Stored only.
            audit_trail: Stored only.
        """
        self.file_name = file_name
        self.file_format = file_format
        self.self_destruct_time = self_destruct_time
        self.password = password
        self.encryption_algorithm = encryption_algorithm
        self.mask_columns = mask_columns
        self.anonymize_columns = anonymize_columns
        self.compliance_check = compliance_check
        self.audit_trail = audit_trail

    def zipit(self, df: pd.DataFrame) -> None:
        """Mask/anonymize ``df``, write it out, zip it with a password and remove the plaintext.

        Args:
            df: The DataFrame to store. It is not modified; masking works on copies.

        Raises:
            AttributeError: If ``file_format`` or ``encryption_algorithm`` does not map
                to an existing ``DataFrame.to_*`` method / ``pyzipper.WZ_*`` constant.
        """
        if self.mask_columns is not None:
            df = self._mask_columns(df, self.mask_columns)

        if self.anonymize_columns is not None:
            df = self._anonymize_columns(df, self.anonymize_columns)

        # Resolve everything that can fail or block *before* plaintext touches the disk.
        write_func = getattr(df, f'to_{self.file_format}')
        encryption = getattr(pyzipper, f'WZ_{self.encryption_algorithm}')
        if self.password is None:
            self.password = getpass.getpass('Enter password: ')

        df_zip = f"{self.file_name}.zip"
        try:
            write_func(self.file_name, index=False)
            with pyzipper.AESZipFile(df_zip, 'w', compression=pyzipper.ZIP_DEFLATED, encryption=encryption) as zf:
                zf.setpassword(self.password.encode('utf-8'))
                zf.write(self.file_name)
        finally:
            # Never leave the unencrypted file behind, even when writing or zipping failed.
            if os.path.exists(self.file_name):
                os.remove(self.file_name)

        if self.self_destruct_time:
            # NOTE(review): this is a non-daemon thread, so a plain script will not exit
            # until the timer has fired — confirm that this is intended.
            t = threading.Thread(target=self._self_destruct, args=self.self_destruct_time)
            t.start()

    def _mask_columns(self, df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``columns`` hold the SHA-256 hex digest of ``str(value)``.

        The hash is unsalted, so equal inputs always give equal digests.
        """
        df = df.copy()
        for col in columns:
            df[col] = df[col].apply(
                lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        return df

    def _anonymize_columns(self, df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Return a copy of ``df`` whose ``columns`` hold independent random 10-character strings.

        Each cell gets its own draw from upper-case letters and digits, so equal
        inputs do not map to equal outputs.
        """
        df = df.copy()
        for col in columns:
            df[col] = df[col].apply(lambda x: ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=10)))
        return df

    def _self_destruct(self, hours: int, minutes: int, seconds: int) -> None:
        """Block until the given delay has elapsed, then delete ``<file_name>.zip``.

        A missing archive (already removed by the user or another timer) is not an error.
        """
        df_zip = f"{self.file_name}.zip"
        deadline = time.time() + hours * 60 * 60 + minutes * 60 + seconds
        while True:
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            # Wake at least every 5 seconds, but never sleep past the deadline.
            time.sleep(min(5, remaining))
        try:
            os.remove(df_zip)
        except FileNotFoundError:
            pass


class Unzipndel:
    """Decrypt ``<file_name>.zip``, load its content with pandas and delete the extracted file."""

    def __init__(self, file_name='df', file_format='csv'):
        """Store the archive member name and the pandas reader suffix (``read_<file_format>``)."""
        self.file_name = file_name
        self.file_format = file_format

    def unzipit(self):
        """Prompt for the password, extract the member, read it and remove the plaintext.

        Returns:
            The object returned by ``pandas.read_<file_format>`` for the extracted file.

        Raises:
            AttributeError: If pandas has no ``read_<file_format>`` function.
            Exception: Whatever pyzipper or pandas raise for a bad password, a missing
                archive or unparsable content; the extracted file is removed first.
        """
        # Fail on an unknown format before asking for a password or touching the disk.
        read_func = getattr(pd, f'read_{self.file_format}')
        password = getpass.getpass('Password: ')

        with pyzipper.AESZipFile(f"{self.file_name}.zip") as zf:
            zf.setpassword(password.encode())
            try:
                zf.extract(self.file_name)
                df = read_func(self.file_name)
            finally:
                # The decrypted file must not survive a failed extraction or parse.
                if os.path.exists(self.file_name):
                    os.remove(self.file_name)

        return df

In [14]:
Separate and modify the functions so that zipminator can be used as a package imported by the main module zipit.py, giving us:
zipminator/zipit.py, zipminator/mask_columns.py, zipminator/anonymize_columns.py, zipminator/self_destruct.py,
zipminator/audit_trail.py, zipminator/compliance_check.py, zipminator/file_name.py, zipminator/file_format.py

# zipminator/compliance_check.py
class ComplianceCheck:
    """Hold a list of rule callables and evaluate them against a DataFrame.

    A rule is any callable taking the DataFrame and returning a truthy value
    when the data is compliant.
    """

    def __init__(self, rules=None):
        """Start with a private copy of ``rules`` (or no rules when ``None``)."""
        # Copy so that add_rule() never mutates the caller's list.
        self.rules = [] if rules is None else list(rules)

    # The constructor used to be misspelled ``init`` and therefore never ran;
    # keep the old name so code that called ``obj.init()`` by hand still works.
    init = __init__

    def add_rule(self, rule):
        """Append one rule callable."""
        self.rules.append(rule)

    def check(self, df):
        """Return ``True`` when every rule accepts ``df``, ``False`` at the first rejection.

        With no rules registered the result is ``True``.
        """
        return all(rule(df) for rule in self.rules)

# zipminator/audit_trail.py
class AuditTrail:
    """Collect log entries in memory and write them to a text file, one per line."""

    def __init__(self):
        """Start with an empty list of entries."""
        self.audit_trail = []

    # The constructor used to be misspelled ``init`` and therefore never ran;
    # keep the old name so code that called ``obj.init()`` by hand still works.
    init = __init__

    def add_log(self, log):
        """Append one entry to the in-memory trail."""
        self.audit_trail.append(log)

    def save_logs(self, file_name):
        """Overwrite ``file_name`` with all entries, each followed by a newline.

        Entries are formatted with ``str()``, so non-string entries are accepted.
        The file is written as UTF-8 rather than in the platform's locale encoding.
        """
        with open(file_name, 'w', encoding='utf-8') as f:
            f.writelines(f"{log}\n" for log in self.audit_trail)

# zipminator/zipit.py

import pandas as pd
import pyzipper
import time
import threading
import getpass
import os
import hashlib
import random
import string
from typing import List


class Zipndel:
    """Class for compressing and encrypting Pandas DataFrames and deleting the original file.

    Attributes:
        file_name (str): The name of the file to be written, default is 'df'.
        file_format (str): The file format of the file to be written, default is 'csv'.
        self_destruct_time (tuple): A tuple of (hours, minutes, seconds) until self-destruct, default is (672, 0, 0).
            A falsy value disables self-destruction.
        password (str): The password to use for the zip file, default is None (prompted for in ``zipit``).
        encryption_algorithm (str): Suffix of the pyzipper constant ``WZ_<encryption_algorithm>``, default is 'AES'.
        mask_columns (list): The list of columns to mask, default is None.
        anonymize_columns (list): The list of columns to anonymize, default is None.
        compliance_check (bool): Stored on the object; not read by any method of this class.
        audit_trail (bool): Stored on the object; not read by any method of this class.

    Methods:
        zipit(df: pd.DataFrame) -> None:
            Write the input DataFrame to a file, create a zip file with the written file, set a password for the zip file,
            and delete the written file.

        _mask_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
            Mask sensitive data in the specified DataFrame columns by applying a SHA-256 hash function.

        _anonymize_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
            Anonymize sensitive data in the specified DataFrame columns by replacing it with random characters.

        _self_destruct(hours: int, minutes: int, seconds: int) -> None:
            Delete the compressed and encrypted file after a specified amount of time.

    Example:
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
        >>> Zipndel(file_name='my_file', password='my_password', mask_columns=['B'], anonymize_columns=['C']).zipit(df)
    """
    def __init__(self, file_name='df', file_format='csv', self_destruct_time=(672, 0, 0), password=None, encryption_algorithm='AES', mask_columns=None, anonymize_columns=None, compliance_check=False, audit_trail=False):
        """
        Initialize the Zipndel object.

        Args:
            file_name (str): The name of the file to be written, default is 'df'.
            file_format (str): The file format of the file to be written, default is 'csv'.
            self_destruct_time (tuple): A tuple of (hours, minutes, seconds) until self-destruct, default is (672, 0, 0).
            password (str): The password to use for the zip file, default is None.
            encryption_algorithm (str): The encryption algorithm to use for the zip file, default is 'AES'.
            mask_columns (list): The list of columns to mask, default is None.
            anonymize_columns (list): The list of columns to anonymize, default is None.
            compliance_check (bool): Whether to perform a compliance check on the data, default is False (stored only).
            audit_trail (bool): Whether to keep an audit trail, default is False (stored only).
        """
        self.file_name = file_name
        self.file_format = file_format
        self.self_destruct_time = self_destruct_time
        self.password = password
        self.encryption_algorithm = encryption_algorithm
        self.mask_columns = mask_columns
        self.anonymize_columns = anonymize_columns
        self.compliance_check = compliance_check
        self.audit_trail = audit_trail

    def zipit(self, df: pd.DataFrame) -> None:
        """
        Write the input DataFrame to a file, create a zip file with the written file, set a password for the zip file,
        and delete the written file.

        The plaintext file is removed even when writing or zipping fails.

        Args:
            df (pandas.DataFrame): The DataFrame to compress and encrypt. It is not modified.

        Returns:
            None

        Raises:
            AttributeError: If ``file_format`` or ``encryption_algorithm`` does not map to an
                existing ``DataFrame.to_*`` method / ``pyzipper.WZ_*`` constant.
        """
        if self.mask_columns is not None:
            df = self._mask_columns(df, self.mask_columns)

        if self.anonymize_columns is not None:
            df = self._anonymize_columns(df, self.anonymize_columns)

        # Resolve everything that can fail or block *before* plaintext touches the disk.
        write_func = getattr(df, f'to_{self.file_format}')
        encryption = getattr(pyzipper, f'WZ_{self.encryption_algorithm}')
        if self.password is None:
            self.password = getpass.getpass('Enter password: ')

        df_zip = f"{self.file_name}.zip"
        try:
            write_func(self.file_name, index=False)
            with pyzipper.AESZipFile(df_zip, 'w', compression=pyzipper.ZIP_DEFLATED, encryption=encryption) as zf:
                zf.setpassword(self.password.encode('utf-8'))
                zf.write(self.file_name)
        finally:
            # Never leave the unencrypted file behind, even when writing or zipping failed.
            if os.path.exists(self.file_name):
                os.remove(self.file_name)

        if self.self_destruct_time:
            # NOTE(review): this is a non-daemon thread, so a plain script will not exit
            # until the timer has fired — confirm that this is intended.
            t = threading.Thread(target=self._self_destruct, args=self.self_destruct_time)
            t.start()

    def _mask_columns(self, df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Mask sensitive data in the specified DataFrame columns by applying a SHA-256 hash function.

        The hash is applied to ``str(value)`` without a salt, so equal inputs give equal digests.

        Args:
            df (pandas.DataFrame): The DataFrame to mask sensitive data in.
            columns (list): A list of strings specifying the names of the columns to mask.

        Returns:
            pandas.DataFrame: A copy of the input DataFrame with the specified columns masked.
        """
        df = df.copy()
        for col in columns:
            df[col] = df[col].apply(
                lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        return df

    def _anonymize_columns(self, df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Anonymize sensitive data in the specified DataFrame columns by replacing it with random characters.

        Every cell gets an independent 10-character string of upper-case letters and digits,
        so equal inputs do not map to equal outputs.

        Args:
            df (pandas.DataFrame): The DataFrame to anonymize sensitive data in.
            columns (list): A list of strings specifying the names of the columns to anonymize.

        Returns:
            pandas.DataFrame: A copy of the input DataFrame with the specified columns anonymized.
        """
        df = df.copy()
        for col in columns:
            df[col] = df[col].apply(lambda x: ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=10)))
        return df

    def _self_destruct(self, hours: int, minutes: int, seconds: int) -> None:
        """Delete the compressed and encrypted file after a specified amount of time has elapsed.

        Blocks the calling thread until the deadline. An archive that no longer
        exists at that point is not treated as an error.

        Args:
            hours (int): The number of hours until file deletion.
            minutes (int): The number of minutes until file deletion.
            seconds (int): The number of seconds until file deletion.

        Returns:
            None
        """
        df_zip = f"{self.file_name}.zip"
        deadline = time.time() + hours * 60 * 60 + minutes * 60 + seconds
        while True:
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            # Wake at least every 5 seconds, but never sleep past the deadline.
            time.sleep(min(5, remaining))
        try:
            os.remove(df_zip)
        except FileNotFoundError:
            pass

# zipminator/unzipit.py

class Unzipndel:
    """Decrypt ``<file_name>.zip``, load its content with pandas and delete the extracted file."""

    def __init__(self, file_name='df', file_format='csv'):
        """Store the archive member name and the pandas reader suffix (``read_<file_format>``)."""
        self.file_name = file_name
        self.file_format = file_format

    def unzipit(self):
        """Unzip the file, read it using pandas, and delete the unzipped file.

        The unzipped file is deleted even when extraction or parsing fails.

        Returns:
            pd.DataFrame: A pandas dataframe containing the unzipped and read data.

        Raises:
            AttributeError: If pandas has no ``read_<file_format>`` function.
            RuntimeError: If the password is incorrect or the file cannot be unzipped
                (as raised by pyzipper — exact exception type not verified here).
        """
        # Fail on an unknown format before asking for a password or touching the disk.
        read_func = getattr(pd, f'read_{self.file_format}')
        password = getpass.getpass('Password: ')

        with pyzipper.AESZipFile(f"{self.file_name}.zip") as zf:
            zf.setpassword(password.encode())
            try:
                zf.extract(self.file_name)
                df = read_func(self.file_name)
            finally:
                # The decrypted file must not survive a failed extraction or parse.
                if os.path.exists(self.file_name):
                    os.remove(self.file_name)

        return df

Zipminator is a versatile and powerful Python package designed to simplify and streamline file compression and decompression tasks, while also enhancing security and privacy protection. With Zipminator, users have a wide range of advanced features at their disposal, including multiple encryption algorithms such as AES, Blowfish, and RSA, as well as masking capabilities to hide sensitive information in files. In addition, the package provides anonymization techniques, including hashing, pseudonymization, and data suppression, to remove personal identifying information from files.

Zipminator is user-friendly, efficient, and highly customizable, making it suitable for a wide range of use cases. It was originally written by, and intended for, data scientists at the Norwegian Labour and Welfare Administration (NAV), but it is equally useful for developers and IT professionals dealing with large volumes of data that need to be compressed and transmitted efficiently. Zipminator is also useful for anyone who needs to send or receive large files over email or other file-sharing services. The package includes compliance checks to ensure that files comply with GDPR, CCPA, and HIPAA regulations. It scans files for personal data and flags any data that may violate regulations, and an audit trail is kept to track who has accessed compressed files and when.

Zipminator offers a range of features that simplify and streamline file compression and decompression tasks. Its capabilities include creating zip files from single or multiple files or directories, extracting files from zip files, and deleting extracted files after use. Password protection for zip files ensures that sensitive data is kept secure during transmission or storage.

Zipminator is a powerful and flexible tool for protecting sensitive data and ensuring compliance with data privacy regulations. The package can be used as a Python library or from the command line, making it easy to integrate with other Python projects or automate the compression and decompression of files. It supports various compression algorithms, including ZIP_DEFLATED, LZMA, and BZIP2, and multiple encryption algorithms, including AES, Blowfish, and RSA.

Zipminator includes features for masking sensitive columns, enabling users to hide specific information in a file, such as social security numbers, email addresses, or phone numbers. The package also supports anonymization of sensitive data, allowing users to remove personal identifying information from a file using techniques such as hashing, pseudonymization, and data suppression.

Finally, Zipminator includes a self-destruct feature that automatically deletes compressed files after a specified period. This ensures that files are only available for a limited time, providing an extra layer of security.

SyntaxError: invalid syntax (3647192573.py, line 1)

In [12]:
import pandas as pd

# Create a DataFrame with information about employees
df = pd.DataFrame({
    'ident': [150706, 150707, 150708, 150708, 150709],
    'navn': ['Mo', 'Kari', 'Ola', 'Olga', 'Ali'],
    'email': ['mo@nav.no', 'kari@nav.no', 'ola@nav.no', 'olga@nav.no', 'ali@nav.no'],
    'phone': ['+4798079896', '+4798765434', '+4799887766', '+4745983421', '+4745989933'],
    'addresse': ['Moveien 9', 'Kari gate 34', 'Ola parken 17', 'Olga smuget 55', 'Ali alé 99'],
    'by': ['Oslo', 'Bærum', 'Asker', 'Oslo', 'Oslo'],
    'fylke': ['Oslo', 'Viken', 'Viken', 'Oslo', 'Oslo'],
    'zip': ['0456', '1550', '1750', '0954', '0134'],
    'ansatt_siden': ['2020-01-01', '2019-01-02', '2017-01-03', '2022-01-04', '2022-01-05'],
    'lonn': [780000, 820000, 915000, 825000, 917580]
})

# Initialize a Zipndel object with custom values
zipper = Zipndel(
    file_name='df',
    file_format='csv',
    self_destruct_time=(0, 5, 0), # (hours, minutes, seconds): 5 minutes until self-destruct
    #password='my_password',
    encryption_algorithm='AES',
    mask_columns=['email', 'phone'],
    anonymize_columns=['addresse', 'lonn'],
    compliance_check=True,
    audit_trail=True
)

# No password was passed above, so this call prompts for one interactively.
zipper.zipit(df)

Enter password:  ········


In [13]:
# Prompts for the password and reads df.zip back; note that this rebinds `df`
# to the stored (masked/anonymized) data, replacing the original frame.
df = Unzipndel().unzipit()
df

Password:  ········


Unnamed: 0,ident,navn,email,phone,addresse,by,fylke,zip,ansatt_siden,lonn
0,150706,Mo,337fbccd2639a442e418a8468f502088f76760685e7937...,91fab6f7870a67a1e0d7df582fa093651e89b928f04298...,U9DR9SS8HT,Oslo,Oslo,456,2020-01-01,JLJJVF7H1C
1,150707,Kari,b394a6c9d8bc8757e0b5c5ae8fc4c9367f491ab277b6b6...,81f45b5a8e44c620ae29a1b7cce316e3a1b9421441d6e9...,H8910I93VM,Bærum,Viken,1550,2019-01-02,NUZE8N3DT7
2,150708,Ola,4ea56fd6aa5e00ac9dbb227ae6340be5124e697a384b75...,1172e3725453f484dcbc0e508d4a46682cf4569cbe75a5...,UUSQMBBSHI,Asker,Viken,1750,2017-01-03,P7J1FTLAUT
3,150708,Olga,7599e434b49a98898e394b7dc8bf3d4cad961520a5abe4...,4351ade48e331857e87812f6da4ff25ffad291ac3a41b2...,06KSJ300XY,Oslo,Oslo,954,2022-01-04,Q73HJR6INI
4,150709,Ali,9844050e343edbbc791c393e11e2ce52dd00740749fb7f...,d95e8ccbe88c159fed81ad29cd480970a856449155c6e8...,81XBUNLV5R,Oslo,Oslo,134,2022-01-05,PBZD6QPTWW


In [8]:
# Initialize a Zipndel object with custom values
zipper = Zipndel(
    file_name='df',
    file_format='csv',
    self_destruct_time=(0, 5, 0), # (hours, minutes, seconds): 5 minutes until self-destruct
    #password='my_password',
    encryption_algorithm='AES',
    #mask_columns=['Email', 'Phone'],
    #anonymize_columns=['Address', 'Lonn'],
    compliance_check=True,
    audit_trail=True
)

# Uses whatever `df` currently holds in the kernel; prompts for a password.
zipper.zipit(df)

Enter password:  ········


In [16]:
# Prompts for the password, reads df.zip back and rebinds `df` to the result.
df = Unzipndel().unzipit()
df

Password:  ········


Unnamed: 0,ident,navn,email,phone,addresse,by,fylke,zip,ansatt_siden,lonn
0,150706,Mo,337fbccd2639a442e418a8468f502088f76760685e7937...,91fab6f7870a67a1e0d7df582fa093651e89b928f04298...,2DXXPD0OLT,Oslo,Oslo,456,2020-01-01,YU0TA3LYPQ
1,150707,Kari,b394a6c9d8bc8757e0b5c5ae8fc4c9367f491ab277b6b6...,81f45b5a8e44c620ae29a1b7cce316e3a1b9421441d6e9...,1X0CCDN27S,Bærum,Viken,1550,2019-01-02,6KV9ATVR0W
2,150708,Ola,4ea56fd6aa5e00ac9dbb227ae6340be5124e697a384b75...,1172e3725453f484dcbc0e508d4a46682cf4569cbe75a5...,KLK2HG2ZDS,Asker,Viken,1750,2017-01-03,76JGU9JD0K
3,150708,Olga,7599e434b49a98898e394b7dc8bf3d4cad961520a5abe4...,4351ade48e331857e87812f6da4ff25ffad291ac3a41b2...,QACWUKKK11,Oslo,Oslo,954,2022-01-04,YKD35QLXMI
4,150709,Ali,9844050e343edbbc791c393e11e2ce52dd00740749fb7f...,d95e8ccbe88c159fed81ad29cd480970a856449155c6e8...,DZI62R0EPZ,Oslo,Oslo,134,2022-01-05,IWS3K3R7SV
