In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences





In [2]:
# Load data
train_data = pd.read_csv("jigsaw-toxic-comment-classification-challenge/train.csv")
train_data["comment_text"] = train_data["comment_text"].str.lower()

# Preprocessing data
def cleaning(data):
    clean_column = re.sub('<.*?>', ' ', str(data))
    clean_column = re.sub('[^a-zA-Z0-9.]+', ' ', clean_column)
    tokenized_column = word_tokenize(clean_column)
    return tokenized_column

train_data["cleaned"] = train_data["comment_text"].apply(cleaning)

# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatizing(data):
    lemmatized_list = [lemmatizer.lemmatize(word) for word in data]
    return lemmatized_list

train_data["lemmatized"] = train_data["cleaned"].apply(lemmatizing)
train_data["comment_text"] = train_data["lemmatized"].apply(lambda x: ' '.join(x))

X = train_data["comment_text"]
y = train_data[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [5]:
# Label Binarizer
mlb = LabelBinarizer()
y = mlb.fit_transform(y)

# Tokenize text data
max_words = 20000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)




In [7]:
# Define the CNN model
inputs = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = GlobalMaxPooling1D()(conv2)
dense1 = Dense(128, activation='relu')(pool2)
dropout = Dropout(0.5)(dense1)
outputs = Dense(6, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define callbacks
checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation = model.evaluate(X_test, y_test)
test_accuracy = evaluation[1]
print("Test Accuracy:", test_accuracy)

Epoch 1/10
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.6388 - loss: 0.1453
Epoch 1: val_loss improved from inf to 0.05223, saving model to best_model.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.6390 - loss: 0.1452 - val_accuracy: 0.9943 - val_loss: 0.0522
Epoch 2/10
[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.9822 - loss: 0.0489
Epoch 2: val_loss did not improve from 0.05223
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 32ms/step - accuracy: 0.9822 - loss: 0.0489 - val_accuracy: 0.9943 - val_loss: 0.0529
Epoch 3/10
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9862 - loss: 0.0415
Epoch 3: val_loss did not improve from 0.05223
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.9862 - loss: 0.0415 - val_accuracy: 0.9943 - val_loss: 0.0590
Epo

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

In [None]:


# Output results to file
with open("output_cnn.txt", "w") as f:
    print("CNN Cross-Validation Accuracies:", file=f)
    for i, acc in enumerate(cnn_accuracies, 1):
        print(f"Fold {i} Accuracy: {acc}", file=f)
    print(f"\nAverage CNN Accuracy: {np.mean(cnn_accuracies)}", file=f)
