In [2]:
import os

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Resolve the directory holding the fine-tuned model. Set the PII_MODEL_PATH
# environment variable to point at your own copy; when it is unset the original
# local checkout path is used, so existing setups keep working unchanged.
model_path = os.environ.get(
    "PII_MODEL_PATH",
    r'D:\Projects\SIH\RE-DACT\redact\app\services\deberta_finetuned_pii',
)

# Load the tokenizer and the token-classification model from that directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Create the pipeline with the local model. `gen` is used by later cells.
# aggregation_strategy="first" makes the pipeline return grouped entities
# (dicts with 'entity_group', 'score', 'word', 'start', 'end') instead of
# one prediction per sub-word token.
gen = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="first")

# Smoke test on a short sentence; `output` is reused by the next cell.
text = "My name is John and I live in California."
output = gen(text)

print(output)

[{'entity_group': 'FIRSTNAME', 'score': 0.95468575, 'word': ' John', 'start': 10, 'end': 15}, {'entity_group': 'STATE', 'score': 0.98806274, 'word': ' California.', 'start': 29, 'end': 41}]




In [4]:
def tag_model_output(model_output):
    """
    Attach a sensitivity level to every entity in ``model_output``.

    Each entity's 'entity_group' label is looked up in a fixed table of
    levels (1, 2 or 3); labels that are not in the table get the sentinel
    value 999.

    Args:
    model_output (list): Entity dictionaries, each with an 'entity_group' key.

    Returns:
    list: The same list object, whose dictionaries were updated in place with
    an additional 'tag' key indicating the level.
    """
    # Sentinel for labels that belong to none of the three levels.
    unclassified = 999

    labels_by_level = {
        1: (
            "SSN", "CREDITCARDNUMBER", "CREDITCARDCVV", "PASSWORD", "IP", "MAC",
            "BITCOINADDRESS", "ETHEREUMADDRESS", "LITECOINADDRESS", "ACCOUNTNUMBER",
            "IBAN", "BIC",
        ),
        2: (
            "FIRSTNAME", "LASTNAME", "FULLNAME", "NAME", "EMAIL", "PHONE_NUMBER",
            "STREETADDRESS", "CITY", "ZIPCODE", "STATE", "COUNTRY", "JOBTITLE",
            "COMPANY_NAME", "USERNAME",
        ),
        3: (
            "PREFIX", "MIDDLENAME", "SUFFIX", "JOBDESCRIPTOR", "JOBAREA",
            "SECONDARYADDRESS", "COUNTY", "CURRENCY", "CURRENCYSYMBOL",
            "CURRENCYCODE", "USERAGENT", "SEX", "GENDER", "NEARBYGPSCOORDINATE",
            "DISPLAYNAME", "SEXTYPE", "ORDINALDIRECTION",
        ),
    }

    # Flatten into a single label -> level lookup (the groups are disjoint).
    level_of = {
        label: lvl
        for lvl, labels in labels_by_level.items()
        for label in labels
    }

    for item in model_output:
        item['tag'] = level_of.get(item['entity_group'], unclassified)

    return model_output

# Example usage: tag the entities returned by the pipeline call in the first cell.
# tag_model_output adds the 'tag' key to the dicts in `output` in place and
# returns that same list, so `tagged_output` and `output` are the same object.
tagged_output = tag_model_output(output)
print(tagged_output)

[{'entity_group': 'FIRSTNAME', 'score': 0.95468575, 'word': ' John', 'start': 10, 'end': 15, 'tag': 2}, {'entity_group': 'STATE', 'score': 0.98806274, 'word': ' California.', 'start': 29, 'end': 41, 'tag': 2}]


In [5]:
def classify_and_tag_text(input_text):
    """
    Run the PII pipeline on ``input_text`` and map each detected word to a tag.

    Args:
    input_text (str): The text to classify.

    Returns:
    dict: Detected entity text (surrounding whitespace stripped) as keys and
    the level assigned by ``tag_model_output`` as values, in order of first
    appearance.
    """
    classified_entities = gen(input_text)
    tagged_entities = tag_model_output(classified_entities)

    word_tag_dict = {}
    for entity in tagged_entities:
        word = entity['word'].strip()
        if not word:
            # An empty key can never match a token of the text, so it is dropped.
            continue
        tag = entity['tag']
        # The same surface form may be detected more than once with different
        # labels. Keep the lowest (strictest) level so a later, less sensitive
        # detection cannot downgrade an earlier one and cause under-redaction.
        if word not in word_tag_dict or tag < word_tag_dict[word]:
            word_tag_dict[word] = tag
    return word_tag_dict

# Example usage: a longer sample containing a name, a birth date, a street
# address, an employer, an email address and a phone number.
# `example_text` and `word_tag_mapping` are reused by later cells.
example_text = "Mark Davis, born on July 22, 1984, currently resides at 456 Oakwood Avenue, Dallas, TX 75201. He works as a marketing manager at Crescent Media Group. His email is mark.davis@crescentmedia.com, and his phone number is (214) 987-6543. "
word_tag_mapping = classify_and_tag_text(example_text)
print(word_tag_mapping)

{'Mark': 2, 'Davis,': 3, 'July 22, 1984,': 999, '456 Oakwood Avenue,': 2, 'Dallas,': 2, 'TX': 2, '75201.': 2, 'Crescent Media Group.': 2, 'mark.davis@crescentmedia.com,': 2, '(214) 987-6543.': 2}


In [6]:
def redact_text(input_text, level):
    """
    Mask every whitespace-separated token whose tag is at or below ``level``.

    The lookup is done one token at a time, so only mapping keys that consist
    of a single token can match; multi-word entities are left unmasked by
    this version. A later cell in this notebook redefines ``redact_text``.

    Args:
    input_text (str): The text to be classified and redacted.
    level (int): The classification level (1, 2, or 3).

    Returns:
    str: The tokens joined by single spaces, with matching tokens replaced
    by the mask.
    """
    tags_by_word = classify_and_tag_text(input_text)

    masked_tokens = []
    for token in input_text.split():
        # Tokens missing from the mapping fall back to 4, which is above
        # every documented level and therefore leaves them untouched.
        if tags_by_word.get(token.strip(), 4) <= level:
            masked_tokens.append('■■■■■')
        else:
            masked_tokens.append(token)

    return ' '.join(masked_tokens)

# Example usage: redact the sample text at level 2, then at level 3.
redacted_text_level_2 = redact_text(example_text, 2)
# `word_tag_mapping` is the notebook-level dict computed in an earlier cell;
# it is printed here only as a reference for reading the redacted output.
print("Word Tag Mapping:", word_tag_mapping)
print(redacted_text_level_2)

redacted_text_level_3 = redact_text(example_text, 3)
print(redacted_text_level_3)

Word Tag Mapping: {'Mark': 2, 'Davis,': 3, 'July 22, 1984,': 999, '456 Oakwood Avenue,': 2, 'Dallas,': 2, 'TX': 2, '75201.': 2, 'Crescent Media Group.': 2, 'mark.davis@crescentmedia.com,': 2, '(214) 987-6543.': 2}
■■■■■ Davis, born on July 22, 1984, currently resides at 456 Oakwood Avenue, ■■■■■ ■■■■■ ■■■■■ He works as a marketing manager at Crescent Media Group. His email is ■■■■■ and his phone number is (214) 987-6543.
■■■■■ ■■■■■ born on July 22, 1984, currently resides at 456 Oakwood Avenue, ■■■■■ ■■■■■ ■■■■■ He works as a marketing manager at Crescent Media Group. His email is ■■■■■ and his phone number is (214) 987-6543.


In [7]:
def redact_text(input_text, level):
    """
    Redact detected entities, including multi-word ones, at or below ``level``.

    The text is split on whitespace and scanned left to right. At each
    position the entities from ``classify_and_tag_text`` are tried in mapping
    order; the first one whose tokens equal the upcoming tokens is consumed.
    A consumed entity is replaced by a single run of mask characters (one
    5-character mask per token, with no spaces) when its tag is <= ``level``,
    and copied through otherwise. Tokens that belong to no entity are always
    kept as they are.

    Args:
    input_text (str): The text to be classified and redacted.
    level (int): The classification level (1, 2, or 3).

    Returns:
    str: The processed tokens joined by single spaces.
    """
    mask = '■■■■■'

    # Classify and tag the text
    word_tag_mapping = classify_and_tag_text(input_text)

    # Tokenise every entity once, up front. Entities with no tokens (empty or
    # whitespace-only text) are skipped: they would match at every position
    # without consuming a token, and the scan below would never advance.
    entity_patterns = []
    for entity, tag in word_tag_mapping.items():
        entity_words = entity.split()
        if entity_words:
            entity_patterns.append((entity_words, tag))

    words = input_text.split()
    redacted_words = []
    i = 0

    while i < len(words):
        for entity_words, tag in entity_patterns:
            span = len(entity_words)
            if words[i:i + span] == entity_words:
                if tag <= level:
                    redacted_words.append(mask * span)
                else:
                    redacted_words.extend(entity_words)
                i += span
                break
        else:
            # No entity starts here. A token that is itself a mapping key
            # would already have matched above, so it is plain text and is
            # kept regardless of the requested level.
            redacted_words.append(words[i])
            i += 1

    # Join the redacted words back into a single string
    return ' '.join(redacted_words)

# Example usage: redact the sample text at levels 2, 3 and 1 with the
# multi-word-aware redact_text defined above.
redacted_text_level_2 = redact_text(example_text, 2)
# The mapping is recomputed here for display only; it is not passed to redact_text.
print("Word Tag Mapping:", classify_and_tag_text(example_text))
print(redacted_text_level_2)

redacted_text_level_3 = redact_text(example_text, 3)
print(redacted_text_level_3)

# None of the sample's entities is tagged 1 in the recorded mapping, so the
# level-1 result in the recorded output is unmasked.
redacted_text_level_1 = redact_text(example_text, 1)
print(redacted_text_level_1)

Word Tag Mapping: {'Mark': 2, 'Davis,': 3, 'July 22, 1984,': 999, '456 Oakwood Avenue,': 2, 'Dallas,': 2, 'TX': 2, '75201.': 2, 'Crescent Media Group.': 2, 'mark.davis@crescentmedia.com,': 2, '(214) 987-6543.': 2}
■■■■■ Davis, born on July 22, 1984, currently resides at ■■■■■■■■■■■■■■■ ■■■■■ ■■■■■ ■■■■■ He works as a marketing manager at ■■■■■■■■■■■■■■■ His email is ■■■■■ and his phone number is ■■■■■■■■■■
■■■■■ ■■■■■ born on July 22, 1984, currently resides at ■■■■■■■■■■■■■■■ ■■■■■ ■■■■■ ■■■■■ He works as a marketing manager at ■■■■■■■■■■■■■■■ His email is ■■■■■ and his phone number is ■■■■■■■■■■
Mark Davis, born on July 22, 1984, currently resides at 456 Oakwood Avenue, Dallas, TX 75201. He works as a marketing manager at Crescent Media Group. His email is mark.davis@crescentmedia.com, and his phone number is (214) 987-6543.
