In [31]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Bidirectional, GlobalMaxPool1D, LSTM

from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

from prepare_data import DataPreprocessor



In [2]:
prepData = DataPreprocessor("jigsaw-toxic-comment-classification-challenge/train.csv")

prepData.load_data()
prepData.preprocess_data()
X, y = prepData.get_XY()


                      id                                       comment_text  \
0       0000997932d777bf  explanation\nwhy the edits made under my usern...   
1       000103f0d9cfb60f  d'aww! he matches this background colour i'm s...   
2       000113f07ec002fd  hey man, i'm really not trying to edit war. it...   
3       0001b41b1c6bb37e  "\nmore\ni can't make any real suggestions on ...   
4       0001d958c54c6e35  you, sir, are my hero. any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::and for the second time of asking, when ...   
159567  ffea4adeee384e90  you should be ashamed of yourself \n\nthat is ...   
159568  ffee36eab5c267c9  spitzer \n\numm, theres no actual article for ...   
159569  fff125370e4aaaf3  and it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nand ... i really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

In [11]:
# Zmniana wartości klas w binarne

mlb = LabelBinarizer()
y_binary = mlb.fit_transform(y)

X_notoken = X

# Tokenizacja danych tekstowych
max_words = 20000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Podział metodą train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.2, random_state=42)

print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)


X_train (127656, 150)
X_test (31915, 150)
y_train (127656, 6)
y_test (31915, 6)


In [35]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# LogisticRegression - toxic

param_distributions = {
    'C': uniform(loc=0.01, scale=10),  # Search C values between 0.01 and 10
    'penalty': ['l2']  # Search over l1 and l2 regularization
}

# Create a random search object
random_search = RandomizedSearchCV(
    estimator=LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True),
    param_distributions=param_distributions,
    n_iter=4,  # Number of random parameter settings to try
    cv=3,  # Number of cross-validation folds
    random_state=42,
    scoring='f1'  # Optimize for F1 score
)



X_train_tx, X_test_tx, y_train_tx, y_test_tx = train_test_split(X_padded, y[["toxic"]], test_size=0.2, random_state=42)



# Convert tokenized sequences to lists of strings
X_train_tx = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_tx]
X_test_tx = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_tx]

tfidf_vec = TfidfVectorizer(max_df=0.7)
X_train_vec = tfidf_vec.fit_transform(X_train_tx)
X_test_vec = tfidf_vec.transform(X_test_tx)

# Fit the random search object
random_search.fit(X_train_vec, y_train_tx.values.ravel())

# Get the best estimator and its parameters
best_estimator = random_search.best_estimator_
print('Best parameters:', best_estimator.get_params())

# log_toxic = LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True)
# log_toxic.fit(X_train_vec,y_train_tx.values.ravel())

# Make predictions on the test set
predictions = best_estimator.predict(X_test_vec)
print(confusion_matrix(y_test_tx, predictions))
print(classification_report(y_test_tx, predictions))

Best parameters: {'C': 9.51714306409916, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': True, 'warm_start': False}
[[28528   331]
 [  962  2094]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28859
           1       0.86      0.69      0.76      3056

    accuracy                           0.96     31915
   macro avg       0.92      0.84      0.87     31915
weighted avg       0.96      0.96      0.96     31915


In [27]:
# LogisticRegression - severe toxic

X_train_stx, X_test_stx, y_train_stx, y_test_stx = train_test_split(X_padded, y[["severe_toxic"]], test_size=0.2, random_state=42)

# Convert tokenized sequences to lists of strings
X_train_stx = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_stx]
X_test_stx = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_stx]

tfidf_vec = TfidfVectorizer(max_df=0.7)
X_train_vec = tfidf_vec.fit_transform(X_train_stx)
X_test_vec = tfidf_vec.transform(X_test_stx)

log_toxic = LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True)
log_toxic.fit(X_train_vec,y_train_stx.values.ravel())

predictions = log_toxic.predict(X_test_vec)
print(confusion_matrix(y_test_stx,predictions))
print(classification_report(y_test_stx,predictions))

[[31534    60]
 [  237    84]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31594
           1       0.58      0.26      0.36       321

    accuracy                           0.99     31915
   macro avg       0.79      0.63      0.68     31915
weighted avg       0.99      0.99      0.99     31915


In [28]:
# LogisticRegression - obscene

X_train_obs, X_test_obs, y_train_obs, y_test_obs = train_test_split(X_padded, y[["toxic"]], test_size=0.2, random_state=42)

# Convert tokenized sequences to lists of strings
X_train_obs = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_obs]
X_test_obs = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_obs]

tfidf_vec = TfidfVectorizer(max_df=0.7)
X_train_vec = tfidf_vec.fit_transform(X_train_obs)
X_test_vec = tfidf_vec.transform(X_test_obs)

log_toxic = LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True)
log_toxic.fit(X_train_vec,y_train_obs.values.ravel())

predictions = log_toxic.predict(X_test_vec)
print(confusion_matrix(y_test_obs,predictions))
print(classification_report(y_test_obs,predictions))

[[28689   170]
 [ 1210  1846]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1       0.92      0.60      0.73      3056

    accuracy                           0.96     31915
   macro avg       0.94      0.80      0.85     31915
weighted avg       0.96      0.96      0.95     31915


In [32]:
# Naive Bayes

naive_bayes = MultinomialNB().fit(X_train_vec, y_train_tx.values.ravel())
predictions = naive_bayes.predict(X_test_vec)
print(confusion_matrix(y_test_tx,predictions))
print(classification_report(y_test_tx,predictions))

[[28760    99]
 [ 1627  1429]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     28859
           1       0.94      0.47      0.62      3056

    accuracy                           0.95     31915
   macro avg       0.94      0.73      0.80     31915
weighted avg       0.95      0.95      0.94     31915


In [36]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Define the hyperparameter search space
space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'filters': hp.choice('filters', [64, 128, 256]),
    'kernel_size': hp.choice('kernel_size', [3, 5, 7]),
    'dense_units': hp.choice('dense_units', [64, 128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.5),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'epochs': hp.choice('epochs', [10, 15, 20]),
}

# Define the objective function to be minimized
def objective(params):
    # Create the model
    inputs = Input(shape=(max_len,))
    embedding = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    conv1 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(embedding)
    pool1 = MaxPooling1D(pool_size=2)(conv1)
    conv2 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(pool1)
    pool2 = GlobalMaxPooling1D()(conv2)
    dense1 = Dense(params['dense_units'], activation='relu')(pool2)
    dropout = Dropout(params['dropout_rate'])(dense1)
    outputs = Dense(6, activation='sigmoid')(dropout)
    model = Model(inputs=inputs, outputs=outputs)

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Define callbacks
    checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

    # Train the model
    history = model.fit(X_train, y_train, epochs=params['epochs'], batch_size=params['batch_size'], validation_split=0.2, callbacks=[checkpoint, early_stop])

    # Evaluate the model on the validation set
    val_loss, val_acc = model.evaluate(X_test, y_test)

    # Return the validation loss as the objective to be minimized
    return {'loss': val_loss, 'status': STATUS_OK}

# Initialize trials object
trials = Trials()

# Perform random search
best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=3, trials=trials)

# Print the best hyperparameters
print('Best hyperparameters:', best)

Epoch 1/10                                           

   1/3192 [..............................] - ETA: 2:01:12 - loss: 0.6920 - accuracy: 0.0000e+00
   2/3192 [..............................] - ETA: 3:52 - loss: 0.6862 - accuracy: 0.0000e+00   
   3/3192 [..............................] - ETA: 3:25 - loss: 0.6806 - accuracy: 0.0000e+00
   5/3192 [..............................] - ETA: 2:43 - loss: 0.6679 - accuracy: 0.0000e+00
   7/3192 [..............................] - ETA: 2:28 - loss: 0.6484 - accuracy: 0.0000e+00
   9/3192 [..............................] - ETA: 2:19 - loss: 0.6224 - accuracy: 0.0035    
  11/3192 [..............................] - ETA: 2:16 - loss: 0.5853 - accuracy: 0.0028
  13/3192 [..............................] - ETA: 2:12 - loss: 0.5434 - accuracy: 0.0024
  15/3192 [..............................] - ETA: 2:09 - loss: 0.5153 - accuracy: 0.0063
  17/3192 [..............................] - ETA: 2:07 - loss: 0.4789 - accuracy: 0.0074
  19/3192 [..............

In [4]:
# CNN
inputs = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = GlobalMaxPooling1D()(conv2)
dense1 = Dense(128, activation='relu')(pool2)
dropout = Dropout(0.5)(dense1)
outputs = Dense(6, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define callbacks
checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

# Train the model
history_cnn = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_cnn = model.evaluate(X_test, y_test)
test_accuracy_cnn = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_cnn)

Epoch 1/10




[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.6754 - loss: 0.1451
Epoch 1: val_loss improved from inf to 0.05243, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 34ms/step - accuracy: 0.6758 - loss: 0.1450 - val_accuracy: 0.9943 - val_loss: 0.0524
Epoch 2/10
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9891 - loss: 0.0504
Epoch 2: val_loss improved from 0.05243 to 0.05160, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.9891 - loss: 0.0504 - val_accuracy: 0.9943 - val_loss: 0.0516
Epoch 3/10
[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.9867 - loss: 0.0409
Epoch 3: val_loss did not improve from 0.05160
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.9867 - loss: 0.0409 - val_accu

In [7]:
print(y_train.shape)

(127656, 6)


In [8]:
inp = Input(shape = (max_len, ))
layer = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inp)
layer = Bidirectional(LSTM(50, return_sequences = True, recurrent_dropout = 0.15))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dropout(0.2)(layer)
layer = Dense(50, activation = 'relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(6, activation = 'sigmoid')(layer)
model = Model(inputs = inp, outputs = layer)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
#model.summary()

file_path = 'save_best_model_lstm.keras'
checkpoint = ModelCheckpoint(file_path, monitor = 'val_loss', verbose = 1, save_best_only=True)
early_stop = EarlyStopping(monitor = 'val_loss', patience = 1)

history_lstm = model.fit(X_train, y_train, batch_size = 512, epochs = 10, validation_split = 0.2, callbacks = [checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_lstm = model.evaluate(X_test, y_test)
test_accuracy_lstm = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_lstm)





Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step - accuracy: 0.4203 - loss: 0.2584
Epoch 1: val_loss improved from inf to 0.05852, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 273ms/step - accuracy: 0.4212 - loss: 0.2578 - val_accuracy: 0.9943 - val_loss: 0.0585
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.8603 - loss: 0.0590
Epoch 2: val_loss improved from 0.05852 to 0.04991, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 267ms/step - accuracy: 0.8604 - loss: 0.0590 - val_accuracy: 0.9943 - val_loss: 0.0499
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.9052 - loss: 0.0485
Epoch 3: val_loss improved from 0.04991 to 0.04937, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [33]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, Dense
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Define the hyperparameter search space
space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'lstm_units': hp.choice('lstm_units', [32, 64, 128]),
    'recurrent_dropout': hp.uniform('recurrent_dropout', 0.1, 0.3),
    'dense_units': hp.choice('dense_units', [32, 64, 128]),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.4),
    'batch_size': hp.choice('batch_size', [64, 128, 256]),
    'epochs': hp.choice('epochs', [10, 15, 20]),
}

# Define the objective function to be minimized
def objective(params):
    inputs = Input(shape=(max_len,))
    layer = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    layer = Bidirectional(LSTM(params['lstm_units'], return_sequences=True, recurrent_dropout=params['recurrent_dropout']))(layer)
    layer = GlobalMaxPool1D()(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(params['dense_units'], activation='relu')(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(6, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    file_path = 'save_best_model_lstm.keras'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True)
    early_stop = EarlyStopping(monitor='val_loss', patience=1)

    history = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['epochs'], validation_split=0.2, callbacks=[checkpoint, early_stop])

    val_loss, val_acc = model.evaluate(X_train, y_test)

    return {'loss': val_loss, 'status': STATUS_OK}

# Initialize trials object
trials = Trials()

# Perform random search
best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=3, trials=trials)

# Print the best hyperparameters
print('Best hyperparameters:', best)

Epoch 1/20                                           

  1/399 [..............................] - ETA: 45:46 - loss: 0.6953 - accuracy: 0.0039
  2/399 [..............................] - ETA: 9:48 - loss: 0.6912 - accuracy: 0.0020 
  3/399 [..............................] - ETA: 9:42 - loss: 0.6871 - accuracy: 0.0013
  4/399 [..............................] - ETA: 9:35 - loss: 0.6834 - accuracy: 9.7656e-04
  5/399 [..............................] - ETA: 9:26 - loss: 0.6791 - accuracy: 7.8125e-04
  6/399 [..............................] - ETA: 9:19 - loss: 0.6749 - accuracy: 0.0013    
  7/399 [..............................] - ETA: 9:16 - loss: 0.6707 - accuracy: 0.0017
  8/399 [..............................] - ETA: 9:13 - loss: 0.6661 - accuracy: 0.0015
  9/399 [..............................] - ETA: 9:07 - loss: 0.6609 - accuracy: 0.0013
 10/399 [..............................] - ETA: 9:02 - loss: 0.6548 - accuracy: 0.0012
 11/399 [..............................] - ETA: 9:03 - loss: 

KeyboardInterrupt: 