In [1]:
from keras import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Bidirectional, GlobalMaxPool1D, LSTM

from prepare_data import DataPreprocessor

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
prepData = DataPreprocessor("jigsaw-toxic-comment-classification-challenge/train.csv")

prepData.load_data()
prepData.preprocess_data()
X, y = prepData.get_XY()


                      id                                       comment_text  \
0       0000997932d777bf  explanation\nwhy the edits made under my usern...   
1       000103f0d9cfb60f  d'aww! he matches this background colour i'm s...   
2       000113f07ec002fd  hey man, i'm really not trying to edit war. it...   
3       0001b41b1c6bb37e  "\nmore\ni can't make any real suggestions on ...   
4       0001d958c54c6e35  you, sir, are my hero. any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::and for the second time of asking, when ...   
159567  ffea4adeee384e90  you should be ashamed of yourself \n\nthat is ...   
159568  ffee36eab5c267c9  spitzer \n\numm, theres no actual article for ...   
159569  fff125370e4aaaf3  and it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nand ... i really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

In [3]:
# Zmniana wartości klas w binarne

mlb = LabelBinarizer()
y_binary = mlb.fit_transform(y)

X_notoken = X

# Tokenizacja danych tekstowych
max_words = 20000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Podział metodą train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.2, random_state=42)

print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)


X_train (127656, 150)
X_test (31915, 150)
y_train (127656, 6)
y_test (31915, 6)


In [39]:
# LR preparation

from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'C': uniform(loc=0.01, scale=10),  # Search C values between 0.01 and 10
    'penalty': ['l2']
}

random_search_LR = RandomizedSearchCV(
    estimator=LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True),
    param_distributions=param_distributions,
    n_iter=4,  # Number of random parameter settings to try
    cv=3,  # Number of cross-validation folds
    random_state=42,
    scoring='f1'  # Optimize for F1 score
)

def LR_Training(y_label):
    X_train_LR, X_test_LR, y_train_LR, y_test_LR = train_test_split(X_padded, y_label, test_size=0.2, random_state=42)
    
    X_train_LR = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_LR]
    X_test_LR = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_LR]
    
    tfidf_vec = TfidfVectorizer(max_df=0.7)
    X_train_LR_vec = tfidf_vec.fit_transform(X_train_LR)
    X_test_LR_vec = tfidf_vec.transform(X_test_LR)
    
    random_search_LR.fit(X_train_LR_vec, y_train_LR.values.ravel())
    
    best_estimator = random_search_LR.best_estimator_
    print('Best parameters:', best_estimator.get_params())
    
    predictions = best_estimator.predict(X_test_LR_vec)
    print(confusion_matrix(y_test_LR, predictions))
    print(classification_report(y_test_LR, predictions))

In [41]:
# LogisticRegression

print("Logistic Regression Training")
print("Toxic comments:")
LR_Training(y[["toxic"]])
print("Severe toxic comments:")
LR_Training(y[["severe_toxic"]])
print("Obscene comments:")
LR_Training(y[["obscene"]])
print("Threat comments:")
LR_Training(y[["threat"]])
print("Insult comments:")
LR_Training(y[["insult"]])
print("Identity hate comments:")
LR_Training(y[["identity_hate"]])

Logistic Regression Training
Toxic comments:
Best parameters: {'C': 9.51714306409916, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': True, 'warm_start': False}
[[28528   331]
 [  962  2094]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28859
           1       0.86      0.69      0.76      3056

    accuracy                           0.96     31915
   macro avg       0.92      0.84      0.87     31915
weighted avg       0.96      0.96      0.96     31915

Severe toxic comments:
Best parameters: {'C': 9.51714306409916, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol'

In [44]:
# NB Preparation

param_distributions = {
    'alpha': uniform(loc=0, scale=1),  # Search alpha values between 0 and 1
    'fit_prior': [True, False]  # Whether to learn class prior probabilities or not
}

random_search_NB = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=param_distributions,
    n_iter=4,  # Number of random parameter settings to try
    cv=3,  # Number of cross-validation folds
    random_state=42,
    scoring='f1'  # Optimize for F1 score
)

def NB_Training(y_label):
    X_train_NB, X_test_NB, y_train_NB, y_test_NB = train_test_split(X_padded, y_label, test_size=0.2, random_state=42)
    
    X_train_NB = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_NB]
    X_test_NB = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_NB]
    
    tfidf_vec = TfidfVectorizer(max_df=0.7)
    X_train_NB_vec = tfidf_vec.fit_transform(X_train_NB)
    X_test_NB_vec = tfidf_vec.transform(X_test_NB)
    
    random_search_NB.fit(X_train_NB_vec, y_train_NB.values.ravel())
    
    best_estimator = random_search_NB.best_estimator_
    print('Best parameters:', best_estimator.get_params())
    
    predictions = best_estimator.predict(X_test_NB_vec)
    print(confusion_matrix(y_test_NB, predictions))
    print(classification_report(y_test_NB, predictions))

In [45]:
# Naive Bayes

print("Naive Bayes Training")
print("Toxic comments:")
NB_Training(y[["toxic"]])
print("Severe toxic comments:")
NB_Training(y[["severe_toxic"]])
print("Obscene comments:")
NB_Training(y[["obscene"]])
print("Threat comments:")
NB_Training(y[["threat"]])
print("Insult comments:")
NB_Training(y[["insult"]])
print("Identity hate comments:")
NB_Training(y[["identity_hate"]])

Naive Bayes Training
Toxic comments:
Best parameters: {'C': 9.51714306409916, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': True, 'warm_start': False}
[[28854     5]
 [ 2947   109]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     28859
           1       0.96      0.04      0.07      3056

    accuracy                           0.91     31915
   macro avg       0.93      0.52      0.51     31915
weighted avg       0.91      0.91      0.87     31915

Severe toxic comments:
Best parameters: {'C': 9.51714306409916, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001

In [6]:
# CNN Parameter Optimization

space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'filters': hp.choice('filters', [64, 128, 256]),
    'kernel_size': hp.choice('kernel_size', [3, 5, 7]),
    'dense_units': hp.choice('dense_units', [64, 128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.5),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'epochs': hp.choice('epochs', [10, 15, 20]),
}

def objective(params):
    inputs = Input(shape=(max_len,))
    embedding = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    conv1 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(embedding)
    pool1 = MaxPooling1D(pool_size=2)(conv1)
    conv2 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(pool1)
    pool2 = GlobalMaxPooling1D()(conv2)
    dense1 = Dense(params['dense_units'], activation='relu')(pool2)
    dropout = Dropout(params['dropout_rate'])(dense1)
    outputs = Dense(6, activation='sigmoid')(dropout)
    model = Model(inputs=inputs, outputs=outputs)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    checkpoint = ModelCheckpoint('best_model_cnn.tf', monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_format='tf')
    early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

    history = model.fit(X_train, y_train, epochs=params['epochs'], batch_size=params['batch_size'], validation_split=0.2, callbacks=[checkpoint, early_stop])

    loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)

    # Return the validation loss as the objective to be minimized
    return {'loss': f1_score, 'status': STATUS_OK}

trials = Trials()

best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=3, trials=trials)

# Print the best hyperparameters
print('Best hyperparameters:', best)

Epoch 1/15                                           

   1/1596 [..............................] - ETA: 27:26 - loss: 0.6834 - accuracy: 0.0000e+00
   2/1596 [..............................] - ETA: 2:13 - loss: 0.6712 - accuracy: 0.0156     
   3/1596 [..............................] - ETA: 2:14 - loss: 0.6581 - accuracy: 0.0104
   4/1596 [..............................] - ETA: 2:14 - loss: 0.6433 - accuracy: 0.0078
   5/1596 [..............................] - ETA: 2:19 - loss: 0.6300 - accuracy: 0.0094
   6/1596 [..............................] - ETA: 2:18 - loss: 0.6129 - accuracy: 0.0104
   7/1596 [..............................] - ETA: 2:17 - loss: 0.5941 - accuracy: 0.0112
   8/1596 [..............................] - ETA: 2:18 - loss: 0.5733 - accuracy: 0.0117
   9/1596 [..............................] - ETA: 2:18 - loss: 0.5483 - accuracy: 0.0122
  10/1596 [..............................] - ETA: 2:16 - loss: 0.5270 - accuracy: 0.0125
  11/1596 [..............................] - 

INFO:tensorflow:Assets written to: best_model_cnn.tf\assets




Epoch 2/15                                           

   1/1596 [..............................] - ETA: 2:02 - loss: 0.0692 - accuracy: 0.9844
   2/1596 [..............................] - ETA: 2:05 - loss: 0.0398 - accuracy: 0.9688
   3/1596 [..............................] - ETA: 2:09 - loss: 0.0435 - accuracy: 0.9688
   4/1596 [..............................] - ETA: 2:08 - loss: 0.0520 - accuracy: 0.9688
   5/1596 [..............................] - ETA: 2:08 - loss: 0.0500 - accuracy: 0.9719
   6/1596 [..............................] - ETA: 2:06 - loss: 0.0483 - accuracy: 0.9740
   7/1596 [..............................] - ETA: 2:05 - loss: 0.0460 - accuracy: 0.9777
   8/1596 [..............................] - ETA: 2:04 - loss: 0.0457 - accuracy: 0.9805
   9/1596 [..............................] - ETA: 2:05 - loss: 0.0469 - accuracy: 0.9809
  10/1596 [..............................] - ETA: 2:04 - loss: 0.0465 - accuracy: 0.9828
  11/1596 [..............................] - ETA: 2:05

INFO:tensorflow:Assets written to: best_model_cnn.tf\assets




Epoch 3/15                                           

   1/1596 [..............................] - ETA: 2:01 - loss: 0.0292 - accuracy: 1.0000
   2/1596 [..............................] - ETA: 2:01 - loss: 0.0329 - accuracy: 1.0000
   3/1596 [..............................] - ETA: 2:01 - loss: 0.0298 - accuracy: 1.0000
   4/1596 [..............................] - ETA: 2:05 - loss: 0.0329 - accuracy: 0.9961
   5/1596 [..............................] - ETA: 2:04 - loss: 0.0362 - accuracy: 0.9937
   6/1596 [..............................] - ETA: 2:04 - loss: 0.0337 - accuracy: 0.9922
   7/1596 [..............................] - ETA: 2:04 - loss: 0.0337 - accuracy: 0.9933
   8/1596 [..............................] - ETA: 2:03 - loss: 0.0345 - accuracy: 0.9902
   9/1596 [..............................] - ETA: 2:03 - loss: 0.0337 - accuracy: 0.9913
  10/1596 [..............................] - ETA: 2:03 - loss: 0.0323 - accuracy: 0.9922
  11/1596 [..............................] - ETA: 2:04

ERROR:hyperopt.fmin:job exception: not enough values to unpack (expected 5, got 2)



  0%|          | 0/3 [09:04<?, ?trial/s, best loss=?]


ValueError: not enough values to unpack (expected 5, got 2)

In [4]:
# CNN
inputs = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = GlobalMaxPooling1D()(conv2)
dense1 = Dense(128, activation='relu')(pool2)
dropout = Dropout(0.5)(dense1)
outputs = Dense(6, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

history_cnn = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_cnn = model.evaluate(X_test, y_test)
test_accuracy_cnn = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_cnn)

Epoch 1/10




[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.6754 - loss: 0.1451
Epoch 1: val_loss improved from inf to 0.05243, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 34ms/step - accuracy: 0.6758 - loss: 0.1450 - val_accuracy: 0.9943 - val_loss: 0.0524
Epoch 2/10
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9891 - loss: 0.0504
Epoch 2: val_loss improved from 0.05243 to 0.05160, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.9891 - loss: 0.0504 - val_accuracy: 0.9943 - val_loss: 0.0516
Epoch 3/10
[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.9867 - loss: 0.0409
Epoch 3: val_loss did not improve from 0.05160
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.9867 - loss: 0.0409 - val_accu

In [None]:
# LSTM

inp = Input(shape = (max_len, ))
layer = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inp)
layer = Bidirectional(LSTM(50, return_sequences = True, recurrent_dropout = 0.15))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dropout(0.2)(layer)
layer = Dense(50, activation = 'relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(6, activation = 'sigmoid')(layer)
model = Model(inputs = inp, outputs = layer)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
#model.summary()

file_path = 'save_best_model_lstm.keras'
checkpoint = ModelCheckpoint(file_path, monitor = 'val_loss', verbose = 1, save_best_only=True)
early_stop = EarlyStopping(monitor = 'val_loss', patience = 1)

history_lstm = model.fit(X_train, y_train, batch_size = 512, epochs = 10, validation_split = 0.2, callbacks = [checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_lstm = model.evaluate(X_test, y_test)
test_accuracy_lstm = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_lstm)

In [None]:
# LSTM Parameter Optimization

space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'lstm_units': hp.choice('lstm_units', [32, 64, 128]),
    'recurrent_dropout': hp.uniform('recurrent_dropout', 0.1, 0.3),
    'dense_units': hp.choice('dense_units', [32, 64, 128]),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.4),
    'batch_size': hp.choice('batch_size', [64, 128, 256]),
    'epochs': hp.choice('epochs', [10, 15, 20]),
}

def objective(params):
    inputs = Input(shape=(max_len,))
    layer = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    layer = Bidirectional(LSTM(params['lstm_units'], return_sequences=True, recurrent_dropout=params['recurrent_dropout']))(layer)
    layer = GlobalMaxPool1D()(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(params['dense_units'], activation='relu')(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(6, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    file_path = 'save_best_model_lstm.tf'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, save_format='tf')
    early_stop = EarlyStopping(monitor='val_loss', patience=1)

    history = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['epochs'], validation_split=0.2, callbacks=[checkpoint, early_stop])

    #val_loss, val_acc = model.evaluate(X_test, y_test)
    loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)

    # Return the validation loss as the objective to be minimized
    return {'loss': f1_score, 'status': STATUS_OK}

trials = Trials()

best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=3, trials=trials)

print('Best hyperparameters:', best)