In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Load the labelled dataset; the rest of the script reads its 'text' and
# 'label' columns.
train_data = pd.read_csv('2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv')

# Shared resources for preprocess_text().
# The NLTK corpora are not shipped with the nltk package itself, so a fresh
# environment raises LookupError on first access. Download only when that
# happens, so machines that already have the data stay offline.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
try:
    # WordNet is loaded lazily on the first lemmatize() call; trigger it now so
    # a missing corpus is detected here rather than in the middle of training.
    lemmatizer.lemmatize('probe')
except LookupError:
    nltk.download('wordnet', quiet=True)
    # Some NLTK releases also need the Open Multilingual WordNet data.
    nltk.download('omw-1.4', quiet=True)

def preprocess_text(text):
    """Normalize a post before it is handed to the TF-IDF vectorizer.

    The text is lower-cased, words of one or two characters are removed, the
    remainder is split on whitespace, English stop words are dropped and each
    surviving token is lemmatized. Punctuation is not stripped, so it stays
    attached to the token it touches.

    Args:
        text: Raw post text. Any non-string value (``None``, or the float NaN
            pandas produces for an empty CSV cell) is treated as an empty post.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when nothing
        is left.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN and would crash on .lower().
        return ''

    text = text.lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Collapse the gaps the removal leaves behind
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Clean every post in place so training and the GUI see the same text form.
train_data['text'] = train_data['text'].apply(preprocess_text)

# Features are the cleaned posts, targets are the dataset labels.
X, y = train_data['text'], train_data['label']

# Hold out half of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# TF-IDF features (capped at 1000 terms) feeding a logistic-regression
# classifier. fit() returns the pipeline itself, so it can be chained.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('logreg', LogisticRegression()),
    ]
).fit(X_train, y_train)

# Report per-class metrics and the confusion matrix on the held-out half.
y_pred = pipeline.predict(X_test)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

# GUI dependencies (Tkinter)
import tkinter as tk
from tkinter import messagebox
from tkinter import scrolledtext

# Function to predict offensive or non-offensive
def classify_post():
    """Classify the post typed into the input box and display the verdict.

    Reads the text from ``text_input``, runs it through ``preprocess_text``
    and the trained ``pipeline``, then updates ``result_label``. If the box is
    empty or holds only whitespace, a warning dialog is shown instead and
    nothing is classified.
    """
    new_post = text_input.get("1.0", 'end-1c')
    if not new_post.strip():
        messagebox.showwarning("Warning", "Please enter a post to classify.")
        return

    new_post_preprocessed = preprocess_text(new_post)  # Same cleaning as training
    # predict() returns a one-element array; pull out the scalar label.
    predicted_label = str(pipeline.predict([new_post_preprocessed])[0])
    # The evaluation report for this model lists the classes 'hate' and
    # 'nothate', so comparing against the integer 1 never matched and every
    # post was shown as non-offensive. "1" is still accepted in case the label
    # column is numerically encoded.
    if predicted_label in ("hate", "1"):
        result_label.config(text="Offensive post detected.", fg="red")
    else:
        result_label.config(text="Non-offensive post.", fg="green")

# Main application window
root = tk.Tk()
root.title("Offensive Post Detector")

# Scrollable, word-wrapped box where the user types the post
text_input = scrolledtext.ScrolledText(
    root,
    font=("Arial", 12),
    width=40,
    height=10,
    wrap=tk.WORD,
)
text_input.pack(pady=10)

# Button that runs classify_post on the current contents of the box
classify_button = tk.Button(
    root,
    text="Classify",
    command=classify_post,
    width=10,
    height=2,
)
classify_button.pack()

# Label that classify_post updates with the verdict (starts blank)
result_label = tk.Label(
    root,
    text="",
    fg="black",
    font=("Arial", 14, "bold"),
)
result_label.pack(pady=10)

# Enter the Tk event loop
root.mainloop()


              precision    recall  f1-score   support

        hate       0.68      0.74      0.71     11048
     nothate       0.66      0.59      0.62      9264

    accuracy                           0.67     20312
   macro avg       0.67      0.67      0.67     20312
weighted avg       0.67      0.67      0.67     20312

[[8181 2867]
 [3792 5472]]
