In [1]:
import pandas as pd

# Loading full dataset 
df = pd.read_csv("C://Users//Bhargavi//Downloads//IR_DATASET.csv")

# Taking a random sample of 10,000 rows 
df_subset = df.sample(n=10000, random_state=42)

# Checking the distribution of target classes
print(df_subset['toxic'].value_counts())


toxic
0    9058
1     942
Name: count, dtype: int64


In [2]:
import re
import nltk
from nltk.corpus import stopwords

# Downloading stopwords 
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Removing special characters, digits, and extra spaces
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    
    # Tokenizing and removing stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Applying preprocessing to the 'comment_text' column
df_subset['comment_text'] = df_subset['comment_text'].apply(preprocess_text)

# Checking if preprocessing is applied correctly
print(df_subset['comment_text'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhargavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


119105    geez forgetful already discussed marx anarchis...
131631    carioca rfa thanks support request adminship f...
125326                 birthday worries enjoy ur day talk e
111256    pseudoscience category assuming article pseudo...
83590     phrase exists would provided search engine eve...
Name: comment_text, dtype: object


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing TF-IDF Vectorizer 
vectorizer = TfidfVectorizer(max_features=10000)

# Transforming the comments into TF-IDF features
X = vectorizer.fit_transform(df_subset['comment_text'])

# Defining the target variable 
y = df_subset['toxic']


In [4]:
from sklearn.model_selection import train_test_split

# Spliting the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the sizes of the splits
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(8000, 10000) (2000, 10000) (8000,) (2000,)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Initializing the Logistic Regression model
log_reg_model = LogisticRegression(solver='liblinear')

# Defining the hyperparameter grid for tuning
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# Using GridSearchCV for hyperparameter tuning (5-fold cross-validation)
grid_search = GridSearchCV(log_reg_model, param_grid, cv=5, n_jobs=-1, verbose=1)

# Training the model using grid search
grid_search.fit(X_train, y_train)

# To get the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Using the best model from grid search to make predictions
best_model = grid_search.best_estimator_

# Making predictions on the test set
y_pred_log_reg = best_model.predict(X_test)

# Evaluating the accuracy and performance
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'C': 10, 'penalty': 'l1'}
Logistic Regression Accuracy: 0.944
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      1801
           1       0.79      0.60      0.68       199

    accuracy                           0.94      2000
   macro avg       0.87      0.79      0.82      2000
weighted avg       0.94      0.94      0.94      2000



In [6]:
from sklearn.naive_bayes import MultinomialNB

# Initializing and training Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluate the Naive Bayes model
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.9185
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      1801
           1       1.00      0.18      0.31       199

    accuracy                           0.92      2000
   macro avg       0.96      0.59      0.63      2000
weighted avg       0.93      0.92      0.89      2000



In [None]:
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk  # For handling images
import os

# Function to preprocess the input text
def preprocess_text(text):
    # Adding the logic here, such as lowercasing, removing special characters, etc.
    text = text.lower()
    return text

# Function to predict toxicity of the input text
def predict_toxicity():
    input_text = entry.get("1.0", "end-1c").strip()  # Get text from the text box
    if input_text == "":
        messagebox.showwarning("Input Error", "Please enter a comment.")
        return
    
    preprocessed_text = preprocess_text(input_text)  # Preprocess the input text
    text_vector = vectorizer.transform([preprocessed_text])  # Transform to TF-IDF
    prediction = best_model.predict(text_vector)  # Make prediction
    
    if prediction[0] == 1:
        messagebox.showinfo("Prediction Result", "The comment is Toxic.")
    else:
        messagebox.showinfo("Prediction Result", "The comment is Non-Toxic.")

# Setting up the GUI
root = tk.Tk()
root.title("Toxicity Classifier")

# Making the window full screen
root.attributes('-fullscreen', True)

# Loading the background image
image_path = "C://Users//Bhargavi//Downloads//image_toxic.PNG"  
if os.path.exists(image_path):
    try:
        background_image = Image.open(image_path)
        # Resizing the image to cover the full screen
        screen_width = root.winfo_screenwidth()
        screen_height = root.winfo_screenheight()
        background_image = background_image.resize((screen_width, screen_height), Image.Resampling.LANCZOS)
        bg_image = ImageTk.PhotoImage(background_image)

        # Create a label to display the background image
        background_label = tk.Label(root, image=bg_image)
        background_label.place(x=0, y=0, relwidth=1, relheight=1)

        # Keep a reference to avoid garbage collection
        background_label.image = bg_image
    except Exception as e:
        print(f"Error loading image: {e}")
else:
    print("Image not found")

# Creating a frame in the center for text box and buttons
center_frame = tk.Frame(root, bg="#008080", padx=20, pady=20)
center_frame.place(relx=0.5, rely=0.3, anchor="center")

# Creating a label inside the frame
label = tk.Label(center_frame, text="Enter your comment:", bg="#e6f2ff", font=("Arial", 16))
label.pack(pady=10)

# Creating a text box for input
entry = tk.Text(center_frame, height=8, width=50, font=("Arial", 14))
entry.pack(pady=10)

# Creating a button to trigger prediction
button = tk.Button(center_frame, text="Check Toxicity", command=predict_toxicity, bg="#66b3ff", font=("Arial", 12))
button.pack(pady=10)

# Creating a button to close the application
def close_app():
    root.destroy()

close_button = tk.Button(center_frame, text="Close", command=close_app, bg="red", font=("Arial", 12))
close_button.pack(pady=10)

# Centering the messagebox dialog
root.eval('tk::PlaceWindow . center')

# Run the application
root.mainloop()
