In [None]:
import difflib
import re
import pandas as pd

# Load the labelled dataset from CSV.
# NOTE(review): the path is hard-coded to one user's Downloads folder, so the
# script only runs unmodified on that machine — confirm before reuse.
df = pd.read_csv(r"C:\Users\SUTAMA SARKAR\Downloads\Bengali hate speech .csv")

# Build (sentence, label) pairs from the 'sentence' and 'hate' columns.
# Sentences are coerced to str; the detection code below treats a label equal
# to 1 as offensive.
sentence_list = df['sentence'].astype(str).tolist()
hate_labels = df['hate'].tolist()
dataset = list(zip(sentence_list, hate_labels))

# Offensive base words used as fuzzy-match targets when trying to map an
# obfuscated token back to a known root.
offensive_base_words = [
    "শালা", "হারামি", "চুদ", "মাদারচোদ", "লুচ্চা", "বদমাশ", "চরিত্রহীন",
    "খানকি", "রেন্ডি", "চোদ", "মাগি", "তোর মায়ের", "বেশ্যা", "নষ্টা", "চুদি"
]

# Try to reverse obfuscation to root words
def guess_offensive_root(word, candidates=offensive_base_words):
    """Return the candidate root closest to ``word``, or ``None``.

    Every character outside the Bengali Unicode block (U+0980-U+09FF) is
    removed from ``word`` first, so symbols or Latin letters inserted to
    disguise a word do not take part in the comparison.

    Args:
        word: A single token, possibly obfuscated.
        candidates: Root words to match against; defaults to the module-level
            ``offensive_base_words`` list.

    Returns:
        The best match from ``candidates`` whose difflib similarity to the
        stripped token is at least 0.5, otherwise ``None``.
    """
    stripped = re.sub(r"[^\u0980-\u09FF]", "", word)
    closest = difflib.get_close_matches(stripped, candidates, n=1, cutoff=0.5)
    if not closest:
        return None
    return closest[0]

# Convert an obfuscated sentence to possible root words
def reverse_transform_sentence(sentence):
    """Replace each token of ``sentence`` with its guessed offensive root.

    The sentence is split on whitespace. A token for which
    ``guess_offensive_root`` finds a match is replaced by that root; every
    other token is kept unchanged. The result is re-joined with single spaces.

    Args:
        sentence: The raw input sentence.

    Returns:
        The space-joined sentence after substitution.
    """
    return " ".join(
        guess_offensive_root(token) or token for token in sentence.split()
    )

# Compare input with dataset and return top similar matches
def match_probability(transformed_input, dataset, top_n=3):
    """Rank dataset sentences by similarity to ``transformed_input``.

    Args:
        transformed_input: The sentence to compare against the dataset.
        dataset: Iterable of ``(sentence, label)`` pairs.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(sentence, label, ratio)`` tuples ordered from most to
        least similar, where ``ratio`` is ``difflib.SequenceMatcher.ratio()``
        with ``transformed_input`` as the first sequence. Entries with equal
        ratios keep their dataset order.
    """
    ranked = []
    for candidate, tag in dataset:
        similarity = difflib.SequenceMatcher(
            None, transformed_input, candidate
        ).ratio()
        ranked.append((candidate, tag, similarity))
    # sorted() is stable, so ties stay in dataset order even with reverse=True.
    return sorted(ranked, key=lambda entry: entry[2], reverse=True)[:top_n]

# Main detection function
def detect_offensive_bengali_input(user_input):
    """Print the closest dataset sentences for ``user_input`` and a verdict.

    The input is first passed through ``reverse_transform_sentence`` and then
    compared against the module-level ``dataset`` with ``match_probability``.
    Each top match is printed with its similarity and label, followed by a
    final verdict (rounded mean of the matched labels) and a confidence value
    (mean similarity of the matches).

    Assumes dataset labels are numeric, with 1 meaning offensive.

    Args:
        user_input: The sentence to analyse.

    Returns:
        None. All results are written to stdout.
    """
    transformed = reverse_transform_sentence(user_input)
    top_matches = match_probability(transformed, dataset)

    # With an empty dataset there is nothing to average; bail out instead of
    # raising ZeroDivisionError below.
    if not top_matches:
        print("\n No reference sentences available; cannot produce a verdict.")
        return

    for match, label, score in top_matches:
        print(f"\n Match: {match}")
        print(f" Similarity: {score:.2f}")
        print(f" Offensive: {'Yes' if label == 1 else 'No'}")

    match_count = len(top_matches)
    avg_score = sum(score for _, _, score in top_matches) / match_count
    # Majority vote over the matched labels. An exact 0.5 tie rounds to 0
    # (Python rounds half to even), i.e. "Not Offensive".
    avg_label = round(sum(label for _, label, _ in top_matches) / match_count)

    print(f"\n Final Verdict: {'Offensive' if avg_label == 1 else 'Not Offensive'}")
    print(f"📈 Confidence (avg similarity): {avg_score:.2f}")


if __name__ == "__main__":
    # Script entry point: read one sentence from stdin and print its analysis.
    user_text = input(" Enter a Bengali sentence to analyze: ")
    detect_offensive_bengali_input(user_text)


 Enter a Bengali sentence to analyze:  তুই একটা বোকাচোদা
