In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

# Load the CSV file
file_path = 'D:/git/ToxicCommentClassification/train.csv'  # Change to the actual path
data = pd.read_csv(file_path)

# Function to clean the text
def clean_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub(r'\r\n', ' ', text)  # Replace new line characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean the comment_text column
data['comment_text'] = data['comment_text'].apply(clean_text)

# Prepare the data for training
X = data['comment_text'].values
y = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
max_length = 150
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='sigmoid'))  # 6 output neurons for multilabel classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

# Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f'Test Accuracy: {accuracy:.4f}')

# Make predictions
predictions = model.predict(X_test_pad)
print(predictions)

Epoch 1/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 82ms/step - accuracy: 0.9501 - loss: 0.1222 - val_accuracy: 0.9943 - val_loss: 0.0517
Epoch 2/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 81ms/step - accuracy: 0.9940 - loss: 0.0494 - val_accuracy: 0.9943 - val_loss: 0.0497
Epoch 3/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 81ms/step - accuracy: 0.9931 - loss: 0.0434 - val_accuracy: 0.9919 - val_loss: 0.0492
Epoch 4/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 81ms/step - accuracy: 0.9880 - loss: 0.0385 - val_accuracy: 0.9488 - val_loss: 0.0511
Epoch 5/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 82ms/step - accuracy: 0.9697 - loss: 0.0343 - val_accuracy: 0.9042 - val_loss: 0.0537
Epoch 5: early stopping
Test Accuracy: 0.9014
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step
[[2.89549828e-01 2.30289856e-03 1.03545794e-02 

In [2]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the CSV file


# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=20000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the SVM model
svm = LinearSVC()
multi_target_svm = MultiOutputClassifier(svm, n_jobs=-1)
multi_target_svm.fit(X_train_tfidf, y_train)

# Evaluate the model on the test set
y_pred = multi_target_svm.predict(X_test_tfidf)

# Print classification report
for i, column in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    print(f"Classification report for {column}:\n")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("\n")

# Calculate and print overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.4f}')

Classification report for toxic:

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28859
           1       0.87      0.71      0.78      3056

    accuracy                           0.96     31915
   macro avg       0.92      0.85      0.88     31915
weighted avg       0.96      0.96      0.96     31915



Classification report for severe_toxic:

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31594
           1       0.53      0.25      0.34       321

    accuracy                           0.99     31915
   macro avg       0.76      0.63      0.67     31915
weighted avg       0.99      0.99      0.99     31915



Classification report for obscene:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30200
           1       0.89      0.69      0.78      1715

    accuracy                           0.98     31915
   macro avg 

In [3]:

from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_forest = MultiOutputClassifier(random_forest, n_jobs=-1)
multi_target_forest.fit(X_train_tfidf, y_train)

# Evaluate the model on the test set
y_pred = multi_target_forest.predict(X_test_tfidf)

# Print classification report
for i, column in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    print(f"Classification report for {column}:\n")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("\n")


Classification report for toxic:

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     28859
           1       0.94      0.53      0.68      3056

    accuracy                           0.95     31915
   macro avg       0.94      0.76      0.82     31915
weighted avg       0.95      0.95      0.95     31915



Classification report for severe_toxic:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     31594
           1       0.46      0.06      0.10       321

    accuracy                           0.99     31915
   macro avg       0.73      0.53      0.55     31915
weighted avg       0.99      0.99      0.99     31915



Classification report for obscene:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     30200
           1       0.92      0.61      0.73      1715

    accuracy                           0.98     31915
   macro avg 