In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [21]:
df = pd.read_csv('CommentToxicity-main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv')
X = df['comment_text']
y = df[df.columns[2:]]

In [22]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [23]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [24]:
# Splitting the dataset into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)


In [25]:
# Vectorizing the text data
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)


In [26]:
# Creating and training Naive Bayes models for each column
models = {}
for column in y_train.columns:
    model = MultinomialNB()
    model.fit(X_train_vec, y_train[column])
    models[column] = model


In [27]:

# Making predictions
input_text = ["you black fatso"]
input_text_vec = vectorizer.transform(input_text)

predictions = {}
for column, model in models.items():
    predictions[column] = model.predict(input_text_vec)

print("Predictions:",predictions)

Predictions: {'toxic': array([1], dtype=int64), 'severe_toxic': array([0], dtype=int64), 'obscene': array([1], dtype=int64), 'threat': array([0], dtype=int64), 'insult': array([1], dtype=int64), 'identity_hate': array([0], dtype=int64)}


In [31]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np

# Predictions and ground truth labels for all columns
y_val_pred_concat = np.hstack([model.predict(X_val_vec).reshape(-1, 1) for model in models.values()])
y_val_true_concat = y_val.values

# Evaluate the concatenated predictions
precision = precision_score(y_val_true_concat.flatten(), y_val_pred_concat.flatten(), average='micro')
recall = recall_score(y_val_true_concat.flatten(), y_val_pred_concat.flatten(), average='micro')
accuracy = accuracy_score(y_val_true_concat.flatten(), y_val_pred_concat.flatten())

# Print evaluation metrics
print(f"Overall Model Evaluation:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

Overall Model Evaluation:
Precision: 0.9750820650552074
Recall: 0.9750820650552074
Accuracy: 0.9750820650552074
