In [10]:
import pandas as pd
import json
import re
import sys
from io import StringIO

# Regex patterns used to detect each standalone PII type
REGEX_MAP = {
    category: re.compile(expression)
    for category, expression in (
        # Exactly ten digits delimited by word boundaries.
        ('phone', r'\b\d{10}\b'),
        # Twelve digits in three groups of four, each optionally separated
        # by a single whitespace character.
        ('aadhar', r'\b\d{4}\s?\d{4}\s?\d{4}\b'),
        # One uppercase ASCII letter followed by seven digits.
        ('passport', r'\b[A-Z]\d{7}\b'),
        # Either "<handle>@<provider>" or "<ten digits>@<word>".
        ('upi_id', r'\b[\w\d._%+-]+@[\w\d.-]+\b|\b\d{10}@\w+\b'),
    )
}

# Mask a sensitive value according to its PII category
def mask_sensitive(info, category):
    """Return a masked representation of a PII value.

    Args:
        info: The raw value to mask. It is converted with ``str()`` first, so
            numeric values (e.g. a phone number parsed as an int) are accepted.
        category: The PII category. ``'phone'``, ``'aadhar'``, ``'passport'``,
            ``'upi_id'`` and ``'name'`` get a category-specific mask; any
            other category is replaced entirely.

    Returns:
        str: The masked value:
            * phone    -> first two and last two characters kept
            * aadhar   -> only the last four characters kept
            * passport -> only the first character kept
            * upi_id   -> only the part after the last ``@`` kept
            * name     -> initials of the first and last word kept; a name
              with fewer than two words becomes ``'[REDACTED_PII]'``
            * other    -> ``'[REDACTED_PII]'``
    """
    text = str(info)

    if category == 'phone':
        return text[:2] + 'XXXXXX' + text[-2:]
    if category == 'aadhar':
        return 'XXXX XXXX ' + text[-4:]
    if category == 'passport':
        # Slice instead of indexing so an empty value cannot raise IndexError.
        return text[:1] + 'XXXXXXX'
    if category == 'upi_id':
        return 'XXXX@' + text.split('@')[-1]
    if category == 'name':
        # split() without an argument discards leading/trailing/repeated
        # whitespace, so no word is ever empty and taking word[0] is safe.
        words = text.split()
        if len(words) >= 2:
            return words[0][0] + 'XXX ' + words[-1][0] + 'XXXX'

    return '[REDACTED_PII]'

# PII detection and redaction
def check_and_redact(record_json):
    """Detect and redact PII inside one JSON-encoded record.

    Two kinds of PII are handled:

    * Combinatorial fields (``name``, ``email``, ``address``, ``device_id``,
      ``ip_address``) are redacted only when at least two of them are present
      with non-empty values. ``name`` is reduced to initials via
      ``mask_sensitive``; the others become ``'[REDACTED_PII]'``.
    * Standalone fields (``phone``, ``aadhar``, ``passport``, ``upi_id``) are
      masked whenever their value matches the corresponding ``REGEX_MAP``
      pattern.

    Args:
        record_json: A JSON document expected to encode an object.

    Returns:
        tuple: ``(redacted_json, pii_detected)``. If the input cannot be
        parsed, or parses to something other than a JSON object, it is
        returned unchanged together with ``False``.
    """
    try:
        record_data = json.loads(record_json)
    except (TypeError, ValueError, RecursionError):
        # TypeError: input is not str/bytes (e.g. NaN for an empty CSV cell).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        # RecursionError: pathologically nested document.
        return record_json, False

    # Valid JSON that is not an object (number, string, null, list) has no
    # fields to inspect; the key lookups below would raise on such values.
    if not isinstance(record_data, dict):
        return record_json, False

    pii_detected = False

    # Combinatorial PII: only sensitive when two or more appear together.
    combinatorial_keys = ['name', 'email', 'address', 'device_id', 'ip_address']
    present_combinatorial = [k for k in combinatorial_keys if record_data.get(k)]
    if len(present_combinatorial) >= 2:
        for key in present_combinatorial:
            value = record_data[key]
            if key == 'name' and isinstance(value, str):
                record_data[key] = mask_sensitive(value, 'name')
            else:
                # Non-name fields, and names that are not plain strings,
                # are replaced wholesale.
                record_data[key] = '[REDACTED_PII]'
        pii_detected = True

    # Standalone PII: sensitive on its own when the value matches its pattern.
    for key in ('phone', 'aadhar', 'passport', 'upi_id'):
        value = record_data.get(key)
        if not value:
            continue
        value_str = str(value)  # numbers may have been parsed as int
        if REGEX_MAP[key].search(value_str):
            record_data[key] = mask_sensitive(value_str, key)
            pii_detected = True

    return json.dumps(record_data), pii_detected

# input CSV processing
def process_csv(file_content, output_file='redacted_output_uploaded.csv'):
    """Redact PII in every record of a CSV and write the result to disk.

    The CSV must contain a ``record_id`` column and one column whose name
    contains ``json`` (case-insensitive); the first such column is used.
    Each cell of that column is passed through ``check_and_redact``.

    Args:
        file_content: The CSV document, either as ``str`` or as a bytes-like
            object (``bytes``, ``bytearray``, ``memoryview``) that is decoded
            with the default codec.
        output_file: Path of the CSV to write. Defaults to
            ``'redacted_output_uploaded.csv'``.

    Returns:
        The path the redacted CSV was written to.

    Raises:
        ValueError: If no column name contains ``json``.
        KeyError: If the ``record_id`` column is missing.
    """
    if isinstance(file_content, str):
        csv_text = file_content
    else:
        # bytes() normalises any bytes-like object (some upload widgets hand
        # over a memoryview, which has no .decode() of its own).
        csv_text = bytes(file_content).decode()

    df = pd.read_csv(StringIO(csv_text))
    df.columns = [c.strip() for c in df.columns]  # Normalize column names

    # Use the first column whose name mentions "json".
    json_col = next((col for col in df.columns if 'json' in col.lower()), None)
    if json_col is None:
        raise ValueError("No JSON column found in CSV")

    # Fail before doing any work rather than after every row was processed.
    if 'record_id' not in df.columns:
        raise KeyError("No 'record_id' column found in CSV")

    # Iterate the column directly; iterrows() would build a Series per row
    # only to read a single cell from it.
    results = [check_and_redact(cell) for cell in df[json_col]]
    df['redacted_data_json'] = [redacted for redacted, _ in results]
    df['is_pii'] = [flag for _, flag in results]

    df[['record_id', 'redacted_data_json', 'is_pii']].to_csv(output_file, index=False)
    print(f'Redacted output saved to {output_file}')
    return output_file

# input handling process
def _in_ipython_session():
    """Return True when running inside an IPython shell or Jupyter kernel."""
    try:
        from IPython import get_ipython
    except ImportError:
        return False
    return get_ipython() is not None


def main():
    """Obtain a CSV from the user and run it through ``process_csv``.

    The input source depends on the environment:

    1. Google Colab: the Colab file-upload dialog; every uploaded file is
       processed.
    2. Another IPython/Jupyter session with ipywidgets available: a
       ``FileUpload`` widget is displayed and the file is processed when the
       widget's value changes.
    3. Otherwise: the path given as the first command-line argument is read;
       a usage message is printed if it is missing.
    """
    # Keep the try body to the import alone so that an ImportError raised
    # while *processing* is not mistaken for "not running in Colab".
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        for content in files.upload().values():
            process_csv(content)
        return

    # Only offer the widget inside an interactive session: in a plain script
    # ipywidgets may be importable but nothing could render or use it, and
    # the command-line path below would never be reached.
    if _in_ipython_session():
        try:
            from ipywidgets import FileUpload
            from IPython.display import display
        except ImportError:
            pass  # no widget support; fall through to the command line
        else:
            uploader = FileUpload(accept='.csv', multiple=False)
            display(uploader)

            def on_upload_change(change):
                value = uploader.value
                if not value:
                    return
                # ipywidgets 7 exposes {filename: file_info}; ipywidgets 8
                # exposes a tuple of file_info mappings.
                entries = list(value.values()) if isinstance(value, dict) else list(value)
                # The content may be a memoryview; normalise it to bytes.
                process_csv(bytes(entries[0]['content']))

            uploader.observe(on_upload_change, names='value')
            return

    # Plain Python script invoked from the command line.
    if len(sys.argv) < 2:
        print("Usage: python detector_full_pranavi.py <csv_file>")
        return
    file_path = sys.argv[1]
    with open(file_path, 'rb') as f:
        process_csv(f.read())

# Entry point: run main() when this code is executed directly (as a script or
# notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()


Saving iscp_pii_dataset_-_Sheet1.csv to iscp_pii_dataset_-_Sheet1 (3).csv
Redacted output saved to redacted_output_uploaded.csv


In [11]:
# Colab-only: hand the redacted CSV written by process_csv() to the Colab
# download helper so it can be saved locally.
from google.colab import files
files.download('redacted_output_uploaded.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>