In [109]:
from transformers import pipeline
import re



In [110]:
# Method 1: word-list lookup with regex masking

def load_profanity_word_lists(file_paths):
    """Build a set of profanity words (plus simple variants) from word-list files.

    Each file is read as one entry per line, with surrounding whitespace
    stripped. For every entry the result contains the entry as written, its
    lowercase, UPPERCASE and Capitalized forms, and each of those forms with
    the suffixes ``s``, ``es``, ``ed`` and ``ing`` appended (uppercased
    suffixes for the UPPERCASE form).

    Args:
        file_paths: Iterable of paths to the word-list files.

    Returns:
        set[str]: All entries and their generated variants.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    suffixes = ("s", "es", "ed", "ing")
    profanity_words = set()
    for file_path in file_paths:
        with open(file_path, "r") as f:
            for line in f:
                word = line.strip()
                if not word:
                    # A blank line would otherwise register "", "s", "es",
                    # "ed" and "ing" themselves as profanity.
                    continue
                for base in (word, word.lower(), word.capitalize()):
                    profanity_words.add(base)
                    profanity_words.update(base + suffix for suffix in suffixes)
                # The all-caps form takes all-caps suffixes.
                shouted = word.upper()
                profanity_words.add(shouted)
                profanity_words.update(shouted + suffix.upper() for suffix in suffixes)
    return profanity_words



In [111]:
# Word-list files that together make up the profanity vocabulary.
file_paths = [
    "bad words_1.txt",
    "crime_words.txt",
    "negative-words.txt",
]

# Load the profanity word lists into a single set.
profanity_words = load_profanity_word_lists(file_paths)


In [112]:
def filter_profanity(text):
    """Return ``text`` normalized, with every profanity entry masked by asterisks.

    The text is lowercased and each run of non-word characters is replaced by
    a single space (so trailing punctuation leaves a trailing space). Every
    entry of the module-level ``profanity_words`` that occurs in the result as
    a whole word is then replaced by as many ``*`` as the entry has characters.

    Args:
        text: The input string.

    Returns:
        str: The normalized, masked text.
    """
    clean_text = re.sub(r'\W+', ' ', text.lower())

    # Replace profanity words with asterisks
    for word in profanity_words:
        # Cheap substring pre-check: the regex below can only match if the
        # entry occurs literally in the text. Skipping the rest avoids
        # compiling one pattern per vocabulary entry on every call (the
        # vocabulary is far larger than re's compiled-pattern cache).
        if not word or word not in clean_text:
            continue
        clean_text = re.sub(r'\b{}\b'.format(re.escape(word)), '*' * len(word), clean_text)

    return clean_text

def contains_profanity(text):
    """Tell whether any word of ``text`` is in the module-level ``profanity_words``.

    The text is lowercased and split into ``\\w+`` tokens; each token is
    looked up as-is.

    Args:
        text: The input string.

    Returns:
        bool: True if at least one token is a profanity entry, else False.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return any(token in profanity_words for token in tokens)

def detect_profanity_in_texts(input_texts):
    """Run the profanity filter and check over each text.

    Args:
        input_texts: Iterable of strings to examine.

    Returns:
        list[dict]: One dict per text, in input order, with the keys
        ``"text"`` (the original string), ``"filtered_text"`` (the result of
        ``filter_profanity``) and ``"ethical"`` (True when
        ``contains_profanity`` reports no match).
    """
    return [
        {
            "text": sample,
            "filtered_text": filter_profanity(sample),
            "ethical": not contains_profanity(sample),
        }
        for sample in input_texts
    ]

In [113]:
# Example usage: run the detector over several sample texts and print each verdict.
input_texts = [
    "How can I murder someone?",
    "He was murdered last night.",
    "I love you",
    "You bitch",
    "I killed a cat",
    "ass",
    "you are so beautiful",
    "mutherfucker",
    "theft",
]
results = detect_profanity_in_texts(input_texts)
for result in results:
    print("Text:", result["text"])
    print("Filtered Text:", result["filtered_text"])
    print("Ethical:", result["ethical"])
    print("-" * 50)  # separator line between entries

Text: How can I murder someone?
Filtered Text: how can i ****** someone 
Ethical: False
--------------------------------------------------
Text: He was murdered last night.
Filtered Text: he was ******** last night 
Ethical: False
--------------------------------------------------
Text: I love you
Filtered Text: i love you
Ethical: True
--------------------------------------------------
Text: You bitch
Filtered Text: you *****
Ethical: False
--------------------------------------------------
Text: I killed a cat
Filtered Text: i ****** a cat
Ethical: False
--------------------------------------------------
Text: ass
Filtered Text: ***
Ethical: False
--------------------------------------------------
Text: you are so beautiful
Filtered Text: you are so beautiful
Ethical: True
--------------------------------------------------
Text: mutherfucker
Filtered Text: ************
Ethical: False
--------------------------------------------------
Text: theft
Filtered Text: *****
Ethical: False
--

In [114]:
#Using bag of words
import re
from sklearn.feature_extraction.text import CountVectorizer





In [115]:
def load_profanity_word_lists(file_paths):
    """Read word-list files and return the entries together with simple variants.

    Every non-blank line (whitespace-stripped) is one entry. The returned set
    holds, for each entry: the entry as written, its lowercase, UPPERCASE and
    Capitalized forms, and each form extended with ``s``/``es``/``ed``/``ing``
    (``S``/``ES``/``ED``/``ING`` for the UPPERCASE form).

    Args:
        file_paths: Iterable of paths to the word-list files.

    Returns:
        set[str]: The entries and all generated variants.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    lower_suffixes = ('', 's', 'es', 'ed', 'ing')
    upper_suffixes = ('', 'S', 'ES', 'ED', 'ING')
    profanity_words = set()
    for file_path in file_paths:
        with open(file_path, "r") as f:
            # Drop blank lines: an empty entry would add the bare suffixes
            # ("s", "es", "ed", "ing") to the vocabulary as if they were
            # profanity.
            words = [stripped for stripped in (line.strip() for line in f) if stripped]
        for word in words:
            # Add variations of profanity words, with and without suffixes
            for stem in (word, word.lower(), word.capitalize()):
                profanity_words.update(stem + suffix for suffix in lower_suffixes)
            profanity_words.update(word.upper() + suffix for suffix in upper_suffixes)
    return profanity_words



In [116]:
# Word-list files that together make up the profanity vocabulary.
file_paths = [
    "bad words_1.txt",
    "crime_words.txt",
    "negative-words.txt",
]

# Module-level vocabulary set, built from the files above.
profanity_words = load_profanity_word_lists(file_paths)

def filter_profanity(text):
    """Return the words of ``text``, lowercased and space-joined, with profanity masked.

    The text is lowercased and split into ``\\w+`` tokens (punctuation is
    dropped). Each token found in the module-level ``profanity_words`` is
    replaced by as many ``*`` as it has characters, and the tokens are joined
    with single spaces.

    No ``CountVectorizer`` is built here: a bag-of-words matrix computed per
    call would not be read by the masking step, and constructing one fails on
    an empty vocabulary.

    Args:
        text: The input string.

    Returns:
        str: The normalized, masked text.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    # Replace profanity words with asterisks, leave every other token as-is
    masked = ['*' * len(token) if token in profanity_words else token for token in tokens]
    return ' '.join(masked)

def contains_profanity(text):
    """Report whether ``text`` has at least one word listed in ``profanity_words``.

    Args:
        text: The input string; it is lowercased and split into ``\\w+``
            tokens before the lookup.

    Returns:
        bool: True as soon as one token is found in the module-level
        ``profanity_words``, otherwise False.
    """
    for candidate in re.findall(r'\b\w+\b', text.lower()):
        if candidate in profanity_words:
            return True
    return False

def detect_profanity_in_texts(input_texts):
    """Produce a per-text report of masked text and profanity verdict.

    Args:
        input_texts: Iterable of strings to examine.

    Returns:
        list[dict]: For each text, in order, a dict with ``"text"`` (the
        original), ``"filtered_text"`` (from ``filter_profanity``) and
        ``"ethical"`` (the negation of ``contains_profanity``).
    """
    report = []
    for sample in input_texts:
        masked = filter_profanity(sample)
        is_clean = not contains_profanity(sample)
        report.append(dict(text=sample, filtered_text=masked, ethical=is_clean))
    return report


In [117]:
# Example usage: run the detector over several sample texts and print each verdict.
input_texts = [
    "How can I murder someone?",
    "He was murdered last night.",
    "I love you",
    "You bitch",
    "I killed a cat",
    "ass",
    "you are so beautiful",
    "mutherfucker",
    "theft",
]
results = detect_profanity_in_texts(input_texts)
for result in results:
    print("Text:", result["text"])
    print("Filtered Text:", result["filtered_text"])
    print("Ethical:", result["ethical"])
    print("-" * 50)  # separator line between entries

Text: How can I murder someone?
Filtered Text: how can i ****** someone
Ethical: False
--------------------------------------------------
Text: He was murdered last night.
Filtered Text: he was ******** last night
Ethical: False
--------------------------------------------------
Text: I love you
Filtered Text: i love you
Ethical: True
--------------------------------------------------
Text: You bitch
Filtered Text: you *****
Ethical: False
--------------------------------------------------
Text: I killed a cat
Filtered Text: i ****** a cat
Ethical: False
--------------------------------------------------
Text: ass
Filtered Text: ***
Ethical: False
--------------------------------------------------
Text: you are so beautiful
Filtered Text: you are so beautiful
Ethical: True
--------------------------------------------------
Text: mutherfucker
Filtered Text: ************
Ethical: False
--------------------------------------------------
Text: theft
Filtered Text: *****
Ethical: False
----