In [2]:
import difflib
import re
import pandas as pd

# === Load Dataset ===
# Read the labelled tweets, keep only the text and class columns, and discard
# any row where either of the two is missing.
df = pd.read_csv(r"C:\Users\SUTAMA SARKAR\Downloads\twitter.csv")[['tweet', 'class']].dropna()  # Use uploaded dataset path

# Numeric class id -> human-readable label.
label_map = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither",
}
df = df.assign(label_text=df['class'].map(label_map))

# Prepare dataset
# `dataset` pairs every tweet (coerced to str) with its numeric class id.
sentence_list = df['tweet'].astype(str).tolist()
labels = df['class'].tolist()
dataset = list(zip(sentence_list, labels))

# === Define Offensive Root Words ===
# Canonical spellings that obfuscated or misspelled tokens are mapped back to.
offensive_base_words = [
    "bitch",
    "bastard",
    "slut",
    "whore",
    "dumb",
    "idiot",
    "fool",
    "moron",
    "asshole",
    "fuck",
    "fucking",
    "shit",
    "dick",
    "pussy",
    "jerk",
    "loser",
    "cunt",
    "retard",
    "sex",
    "sexy",
    "bloody",
    "motherfucker",
    "hoe",
]

# === Normalize words ===
# Compiled once: the function below runs for every token of every input.
_NON_LETTER_RE = re.compile(r'[^a-z]')
_REPEATED_CHAR_RE = re.compile(r'(.)\1{2,}')


def guess_offensive_root(word, candidates=None, cutoff=0.6):
    """Return the candidate root that ``word`` most closely resembles.

    The token is lower-cased, stripped of everything except ``a-z`` (so
    masking characters such as ``*`` or ``$`` disappear), and every run of
    three or more identical letters is collapsed to a single letter. The
    cleaned token is then fuzzy-matched against ``candidates``.

    Args:
        word: A single token from the input sentence.
        candidates: Root words to match against. ``None`` (the default) uses
            the module-level ``offensive_base_words``, looked up at call time
            so that later changes to that list are honoured.
        cutoff: Minimum ``difflib`` similarity in ``[0, 1]`` for a candidate
            to be accepted. Raise it to reduce false positives.

    Returns:
        The best-matching candidate, or ``None`` when the token contains no
        letters or no candidate reaches ``cutoff``.

    Raises:
        ValueError: If ``cutoff`` is outside ``[0, 1]`` (raised by
            ``difflib.get_close_matches``).

    Note:
        Matching is purely character-based, so ordinary short words can reach
        the default cutoff (e.g. "the" scores about 0.67 against "hoe").
    """
    if candidates is None:
        candidates = offensive_base_words

    cleaned = _NON_LETTER_RE.sub('', word.lower())  # drop *, $, digits, etc.
    # fuuuck -> fuck; bllllooooody -> blody, which the fuzzy match below
    # still resolves to "bloody".
    cleaned = _REPEATED_CHAR_RE.sub(r'\1', cleaned)
    if not cleaned:
        # Nothing but symbols/digits: there is no text left to compare.
        return None

    matches = difflib.get_close_matches(cleaned, candidates, n=1, cutoff=cutoff)
    return matches[0] if matches else None

def reverse_transform_sentence(sentence):
    """Rewrite ``sentence`` with recognised tokens replaced by their root word.

    The sentence is split on whitespace. Each token for which
    ``guess_offensive_root`` finds a root is swapped for that root; all other
    tokens are kept exactly as typed. Tokens are re-joined with single spaces.
    """
    return " ".join(
        guess_offensive_root(token) or token
        for token in sentence.split()
    )

# === Fuzzy match to dataset ===
def match_probability(transformed_input, dataset, top_n=3, min_similarity=0.5):
    """Return the dataset sentences most similar to ``transformed_input``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` computed on the
    lower-cased input and the lower-cased sentence.

    Args:
        transformed_input: The (already normalised) sentence to look up.
        dataset: Iterable of ``(sentence, label)`` pairs.
        top_n: Maximum number of matches to return.
        min_similarity: Matches scoring below this ratio are discarded.

    Returns:
        A list of at most ``top_n`` ``(sentence, label, ratio)`` tuples sorted
        by descending ratio; equally scored sentences keep their dataset
        order. Empty when nothing reaches ``min_similarity``.
    """
    # The query does not change during the scan, so normalise it only once.
    needle = transformed_input.lower()

    scored = []
    for sent, label in dataset:
        ratio = difflib.SequenceMatcher(None, needle, sent.lower()).ratio()
        if ratio >= min_similarity:
            scored.append((sent, label, ratio))

    # list.sort is stable, so ties stay in dataset order even with reverse=True.
    scored.sort(key=lambda item: item[2], reverse=True)
    return scored[:top_n]




# Offensive Detection Function
def detect_offensive_english_input(user_input):
    """Classify ``user_input`` and print the verdict.

    Steps:
      1. Normalise the sentence with ``reverse_transform_sentence``.
      2. Collect the tokens that are exact entries of ``offensive_base_words``.
      3. Fetch the closest tweets from ``dataset`` via ``match_probability``.
      4. Print each matched tweet, then a final label and confidence score.

    The final label is chosen by a similarity-weighted vote over the matched
    tweets. Class ids are categories, not quantities, so they are never
    averaged: averaging would turn one "Hate Speech" (0) match plus one
    "Neither" (2) match into "Offensive Language" (1).

    Args:
        user_input: The raw sentence typed by the user.

    Returns:
        None. All results are written to stdout.
    """
    transformed = reverse_transform_sentence(user_input)

    # Check for presence of actual offensive root words
    roots_detected = [w for w in transformed.split() if w in offensive_base_words]

    top_matches = match_probability(transformed, dataset)

    if not top_matches and not roots_detected:
        print("\n Final Prediction: Not Offensive")
        print(" Confidence Score: Very Low (no offensive words or matches)")
        return

    for match, label, score in top_matches:
        print(f"\nMatched Tweet: {match}")
        print(f"Label: {label_map[label]} | Similarity: {score:.2f}")

    if top_matches:
        avg_score = sum(score for _, _, score in top_matches) / len(top_matches)

        # Each match votes for its own label with a weight equal to its
        # similarity. Dicts keep insertion order and max() returns the first
        # maximum, so a tie goes to the label of the best-ranked match.
        votes = {}
        for _, label, score in top_matches:
            votes[label] = votes.get(label, 0.0) + score
        predicted_label = max(votes, key=votes.get)
    else:
        # Reaching this branch means a root word was detected (otherwise the
        # early return above would have fired), so the input cannot be
        # reported as 'Neither'.
        avg_score = 0
        predicted_label = 1  # 'Offensive Language'

    # Smarter final decision: if no offensive roots and only weak matches
    if not roots_detected and predicted_label != 0 and avg_score < 0.65:
        print("\n Final Prediction: Not Offensive")
        print(f" Confidence Score: {avg_score:.2f} (no strong match or root word)")
        return

    print("\n Final Prediction:")
    print(f"Label: {label_map[predicted_label]}")
    print(f" Confidence Score (average similarity): {avg_score:.2f}")
if __name__ == "__main__":
    # Script entry point: prompt for one sentence and print its analysis.
    detect_offensive_english_input(
        input("Enter an English sentence to analyze: ")
    )


Enter an English sentence to analyze:  you are a kind person



Matched Tweet: You a hoe and errbody know
Label: Offensive Language | Similarity: 0.64

Matched Tweet: you're all niggers
Label: Hate Speech | Similarity: 0.62

Matched Tweet: You're a stupid hoe
Label: Offensive Language | Similarity: 0.60

 Final Prediction: Not Offensive
 Confidence Score: 0.62 (no strong match or root word)
