In [1]:
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Download required NLTK resources
NLTK_RESOURCES = ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger_eng')

# nltk.download reports a failed download through its boolean return value,
# so collect the failures and stop here instead of letting a later cell fail
# on a missing corpus.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    raise RuntimeError(f"Could not download NLTK resources: {failed_resources}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Teena\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Teena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Teena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Teena\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
# Read the training CSV, then replace every missing cell with a single space.
train_raw = pd.read_csv('data/train.csv')
train_data = train_raw.fillna(' ')

In [4]:
# Ordered (pattern, replacement) rewrite rules applied by cleanData.
# Order matters: contractions are expanded before the bare-apostrophe rule
# strips what is left, and the multi-token rules (" e g ", " 9 11 ") rely on
# the punctuation rules above them having turned "." and "/" into spaces.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Anything outside this whitelist (including ":") becomes a space.
        (r"[^A-Za-z0-9^,!.\/'+\-=]", " "),
        # Contractions.
        (r"what's", "what is "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
        # Punctuation: dropped or padded so it becomes its own token.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"-", " - "),
        (r"=", " = "),
        (r"'", " "),
        # "5k" -> "5000"; the word boundary keeps "5kg" from becoming "5000g".
        (r"(\d+)k\b", r"\g<1>000"),
        # Abbreviations that the punctuation rules split apart.
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990". A literal "0" is required here: "\0" in a regex
        # is an octal escape for the NUL character, not the digit zero.
        (r"0s\b", "0"),
        # Keep the surrounding spaces so "911" does not fuse with its neighbours.
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        # Only join a standalone "j k", not the end/start of adjacent words.
        (r"\bj k\b", "jk"),
        # Collapse the whitespace runs produced by the rules above.
        (r"\s{2,}", " "),
    )
)


def cleanData(text):
    """Normalise a raw comment string for downstream tokenisation.

    The text is lower-cased, whitespace-normalised, and passed through the
    ordered rewrite rules in ``_CLEAN_RULES`` (character whitelist,
    contraction expansion, punctuation handling, a few abbreviation fixes).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Cleaned, lower-case text with single spaces between tokens and no
        leading or trailing whitespace.
    """
    text = " ".join(text.lower().split())
    for pattern, replacement in _CLEAN_RULES:
        text = pattern.sub(replacement, text)
    # Some replacements end in a space; trim so the result has no stray padding.
    return text.strip()

In [5]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Clean every comment (missing values are replaced by "_na_" first).
cleaned_comments = train_data['comment_text'].fillna("_na_").apply(cleanData)
train_data['comment_text'] = cleaned_comments

# Add a 'none' column: 1 when no label column is set for the row, else 0.
any_label_set = train_data[label_cols].max(axis=1)
train_data['none'] = 1 - any_label_set

In [6]:
# POS tag mapping
# Words are tagged one at a time (no sentence context), so the answer for a
# given word never changes; remember it to avoid re-running the tagger for
# every repeated occurrence across the corpus.
_WORDNET_POS_CACHE = {}


def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged in isolation with ``pos_tag`` and the first letter of
    the resulting tag is mapped to a WordNet constant (J -> ADJ, N -> NOUN,
    V -> VERB, R -> ADV). Any other tag falls back to ``wordnet.NOUN``.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        One of the ``wordnet`` POS constants, suitable as the ``pos``
        argument of ``WordNetLemmatizer.lemmatize``.
    """
    try:
        return _WORDNET_POS_CACHE[word]
    except KeyError:
        pass

    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }
    wordnet_pos = tag_dict.get(tag, wordnet.NOUN)
    _WORDNET_POS_CACHE[word] = wordnet_pos
    return wordnet_pos

In [7]:
# Regex patterns
special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
replace_numbers = re.compile(r'\d+')

# Holds the stopword set and lemmatizer once they have been built, so they
# are created a single time instead of once per comment.
_WORDLIST_RESOURCES = {}


def _wordlist_resources():
    """Return (stopword set, lemmatizer), building both on first use."""
    if not _WORDLIST_RESOURCES:
        _WORDLIST_RESOURCES["stops"] = frozenset(stopwords.words("english"))
        _WORDLIST_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _WORDLIST_RESOURCES["stops"], _WORDLIST_RESOURCES["lemmatizer"]


def text_to_wordlist(text):
    """Turn a cleaned comment into a space-joined string of lemmas.

    Steps, in order:
      1. delete every character that is not a letter, digit or space;
      2. replace each run of digits with the token ``_num_``;
      3. split on whitespace and drop English stopwords;
      4. lemmatize each remaining token using the POS from ``get_wordnet_pos``.

    Parameters
    ----------
    text : str
        Comment text (normally the output of ``cleanData``).

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Remove special characters
    text = special_character_removal.sub('', text)
    # Replace numbers
    text = replace_numbers.sub('_num_', text)

    stops, lemmatizer = _wordlist_resources()

    # Remove stopwords
    tokens = [w for w in text.split() if w not in stops]

    # Lemmatize with a per-word POS hint
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
    return " ".join(lemmas)

In [None]:
# Run every cleaned training comment through text_to_wordlist; `x` ends up as
# a plain list of processed strings.
x = list(map(text_to_wordlist, train_data['comment_text']))
x[:10]

['explanation edits make username hardcore metallica fan revert vandalism closure gas vote new york doll fac please remove template talk page since retire _num_ _num_ _num_ _num_',
 'aww match background colour seemingly stuck thanks talk _num_ _num_ january _num_ _num_ utc',
 'hey man really try edit war guy constantly remove relevant information talk edits instead talk page seem care format actual info',
 'cannot make real suggestion improvement wonder section statistic later subsection type accident think reference may need tidy exact format ie date format etc later one else first preference format style reference want please let know appear backlog article review guess may delay reviewer turn list relevant form eg wikipedia good article nomination transport',
 'sir hero chance remember page',
 'congratulation well use tool well talk',
 'cocksucker piss around work',
 'vandalism matt shirvington article revert please ban',
 'sorry word nonsense offensive anyway intend write anything

In [10]:
# Targets: one column per label in label_cols.
y_train = train_data[label_cols]

# Fit the TF-IDF vocabulary (capped at 10000 features) on the processed
# training comments in `x` and build the training matrix in one step.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(x)

In [None]:
# Load the test CSV and apply the same cleaning -> wordlist -> TF-IDF steps.
test_data = pd.read_csv('data/test.csv').fillna(' ')

# NOTE(review): the training text column is 'comment_text' but this reads
# 'content' - confirm that is the actual column name in test.csv.
cleaned_test = test_data['content'].apply(cleanData)
test_data['content'] = cleaned_test

x_test = list(map(text_to_wordlist, test_data['content']))

# Only transform: the vocabulary must stay the one fitted on the training set.
X_test = vectorizer.transform(x_test)

In [None]:

# Train one independent binary logistic-regression classifier per label.
# `model` is deliberately a plain loop variable: it stays in the notebook
# namespace after the loop, bound to the classifier for the last label.
models = dict.fromkeys(label_cols)
for label in models:
    model = LogisticRegression(max_iter=1000)
    models[label] = model.fit(X_train, y_train[label])


In [None]:
from sklearn.metrics import accuracy_score

# Predict every label on the test matrix and keep all of the results,
# one column per label, rather than retaining only the final label's output.
test_preds = pd.DataFrame(
    {label: models[label].predict(X_test) for label in label_cols}
)

# `preds` keeps its earlier meaning: the predictions for the last label in
# label_cols, as a NumPy array.
preds = test_preds[label_cols[-1]].to_numpy()

test_preds.head()


In [None]:
# Score a single hand-written comment with the same preprocessing pipeline
# used for the training data.
custom_comment = "i will kill you"
custom_cleaned = cleanData(custom_comment)
processed = text_to_wordlist(custom_cleaned)
custom_vectorized = vectorizer.transform([processed])

# One prediction per label; [0] unwraps the single-row result.
custom_preds = {
    label: models[label].predict(custom_vectorized)[0]
    for label in label_cols
}

custom_preds

In [None]:
# Names of the labels whose classifier predicted 1 for the custom comment.
active_labels = [name for name in custom_preds if custom_preds[name] == 1]

if not active_labels:
    print(f'{custom_comment} is not toxic')
else:
    label_str = ', '.join(active_labels)
    print(f"{custom_comment} is predicted as: {label_str}")

In [20]:
import pickle

# Persist the classifier for the 'toxic' label explicitly. The bare name
# `model` is only the training loop's leftover variable, i.e. the classifier
# for whichever label was fitted last, which is not what this file name says.
# NOTE(review): if the backend needs every label, dump the whole `models`
# dict instead and update the loader accordingly - confirm with the backend code.
with open("backend/toxic_model.pkl", "wb") as f:
    pickle.dump(models['toxic'], f)

# The fitted vectorizer is saved alongside so inference uses the same vocabulary.
with open("backend/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)