In [1]:
import pandas as pd

In [2]:
# Load the three CSV files this notebook works with, in the same order as
# before: training data, test comments, test labels.
dftrain, dftest, test_labels_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'test_labels.csv')
)

In [6]:
# Preview the first rows of the training frame to check its column layout.
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Data Cleaning

In [8]:
# Count missing values in every column of the training frame.
dftrain.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [9]:
# Count missing values in every column of the test frame.
dftest.isna().sum()

id              0
comment_text    0
dtype: int64

In [10]:
# Count missing values in every column of the test-label frame.
test_labels_df.isna().sum()

id               0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

# Data Preprocessing

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [14]:
# Normalise the training comments: lowercase them, then remove every character
# that is neither a word character nor whitespace (i.e. punctuation).
# `regex=True` must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a silent no-op.
# The pattern is a raw string so `\w` / `\s` are not parsed as string escapes.
dftrain['comment_text'] = (
    dftrain['comment_text']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [15]:
# Bag-of-words encoder: drops scikit-learn's built-in English stop words and
# caps the vocabulary at 10,000 terms.
vectorizer = CountVectorizer(
    max_features=10000,
    stop_words='english',
)

In [16]:
# Echo the vectorizer object to inspect its configuration.
vectorizer

In [18]:
# Learn the vocabulary from the training comments and encode them as a
# document-term count matrix.
# NOTE(review): the vocabulary is fitted on all of dftrain before the
# train/validation split that follows, so validation comments influence which
# terms are kept — confirm this is acceptable.
X = vectorizer.fit_transform(dftrain['comment_text'])

In [19]:
# Multi-label target: one 0/1 column per toxicity category.
y = dftrain.loc[
    :,
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
]

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier

In [22]:
# Base estimator: multinomial Naive Bayes with its default hyperparameters.
model = MultinomialNB()

In [24]:
# Wrap the base estimator so it can be trained against several label columns
# at once; n_jobs=-1 requests all available cores for the per-label fits.
multi_target_nb = MultiOutputClassifier(estimator=model, n_jobs=-1)

In [25]:
# Train the multi-output classifier on the training split.
multi_target_nb.fit(X_train, y_train)

# Model Testing

In [26]:
from sklearn.metrics import classification_report

In [27]:
# Predict every label for the held-out validation rows.
y_pred = multi_target_nb.predict(X_val)

In [28]:
# Print one report per label. Transposing the prediction matrix lets each label
# name be paired directly with its own column of predictions.
for target, predicted_column in zip(
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    y_pred.T,
):
    print(f"Classification report for {target}:")
    print(classification_report(y_val[target], predicted_column))

Classification report for toxic:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     28859
           1       0.74      0.70      0.72      3056

    accuracy                           0.95     31915
   macro avg       0.86      0.84      0.85     31915
weighted avg       0.95      0.95      0.95     31915

Classification report for severe_toxic:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     31594
           1       0.35      0.70      0.47       321

    accuracy                           0.98     31915
   macro avg       0.67      0.84      0.73     31915
weighted avg       0.99      0.98      0.99     31915

Classification report for obscene:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     30200
           1       0.69      0.77      0.73      1715

    accuracy                           0.97     31915
   macro avg       0

In [30]:
# Normalise the test comments: lowercase them, then remove every character
# that is neither a word character nor whitespace (i.e. punctuation).
# `regex=True` must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a silent no-op.
dftest['comment_text'] = (
    dftest['comment_text']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)
# Encode with the vocabulary `vectorizer` has already learned (transform only,
# no refit).
X_test = vectorizer.transform(dftest['comment_text'])

In [31]:
# Predict every label for the vectorized test comments.
test_predictions = multi_target_nb.predict(X_test)

In [32]:
# Put the raw prediction matrix into a frame whose columns carry the label names.
test_pred_df = pd.DataFrame(
    data=test_predictions,
    columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
)

In [33]:
# Display the test-set predictions frame.
test_pred_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,1,1,1,1,1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
153159,0,0,0,0,0,0
153160,0,0,0,0,0,0
153161,0,0,0,0,0,0
153162,0,0,0,0,0,0


In [79]:
def preprocess_input(text):
    """Clean a single piece of text and vectorize it for prediction.

    The text is lowercased and every character that is neither a word
    character nor whitespace is removed. The cleaned string is then encoded
    with the notebook-level ``vectorizer``.

    Args:
        text: The raw input string.

    Returns:
        The result of ``vectorizer.transform`` applied to a one-element list
        containing the cleaned text.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # Clean the text (lowercase, remove punctuation).
    # str.replace() only substitutes literal substrings, so it cannot apply a
    # character-class pattern; re.sub() is what actually strips punctuation.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Vectorize the text
    return vectorizer.transform([cleaned])

# Read one comment from the user and turn it into a feature row.
# NOTE(review): the prompt string ends with a stray apostrophe, which is echoed
# to the user before their input — confirm whether that is intended.
input_text = input("Enter Input Text \n'")
processed_input = preprocess_input(input_text)

# Predict every label for the single input row
prediction = multi_target_nb.predict(processed_input)

# Pair each label name with the value predicted for it
prediction_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predicted_classes = dict(zip(prediction_labels, prediction[0]))

# Report the input and a Yes/No verdict per label
print("Input Text:", input_text)
print("Predicted Classes:")
for label, value in predicted_classes.items():
    if value == 1:
        print(f"{label}: Yes")
    else:
        print(f"{label}: No")


Enter Input Text 
' a nigga


Input Text: a nigga
Predicted Classes:
toxic: Yes
severe_toxic: No
obscene: Yes
threat: No
insult: Yes
identity_hate: Yes


# Functional Testing
