In [20]:
import numpy as np
from keras import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Bidirectional, GlobalMaxPool1D, LSTM

import prepare_data as prep
from prepare_data import DataPreprocessor
from importlib import reload

reload(prep)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'prepare_data' from 'd:\\Projects\\toxic-comment-classification\\prepare_data.py'>

In [2]:
prepData = DataPreprocessor("jigsaw-toxic-comment-classification-challenge/train.csv")

prepData.load_data()
prepData.preprocess_data()
X, y = prepData.get_XY()

                      id                                       comment_text  \
0       0000997932d777bf  explanation\nwhy the edits made under my usern...   
1       000103f0d9cfb60f  d'aww! he matches this background colour i'm s...   
2       000113f07ec002fd  hey man, i'm really not trying to edit war. it...   
3       0001b41b1c6bb37e  "\nmore\ni can't make any real suggestions on ...   
4       0001d958c54c6e35  you, sir, are my hero. any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::and for the second time of asking, when ...   
159567  ffea4adeee384e90  you should be ashamed of yourself \n\nthat is ...   
159568  ffee36eab5c267c9  spitzer \n\numm, theres no actual article for ...   
159569  fff125370e4aaaf3  and it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nand ... i really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

In [3]:
# Zmniana wartości klas w binarne

mlb = LabelBinarizer()
y_binary = mlb.fit_transform(y)

X_notoken = X

# Tokenizacja danych tekstowych
max_words = 20000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Podział metodą train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_binary, test_size=0.2, random_state=42)

print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)


X_train (127656, 150)
X_test (31915, 150)
y_train (127656, 6)
y_test (31915, 6)


In [4]:
# LR preparation

# Dodać walidację krzyżową do każdego modelu oraz testy statystyczne

from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'C': uniform(loc=0.01, scale=10),
    'penalty': ['l2']
}

random_search_LR = RandomizedSearchCV(
    estimator=LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True),
    param_distributions=param_distributions,
    n_iter=4,
    cv=3,
    random_state=42,
    scoring='f1'
)

def LR_Training_CV(y_label, X_padded, tokenizer, k=5):
    # Perform parameter search on the entire dataset
    X_text = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_padded]
    
    tfidf_vec = TfidfVectorizer(max_df=0.7)
    X_vec = tfidf_vec.fit_transform(X_text)
    
    random_search_LR.fit(X_vec, y_label.values.ravel())
    best_params = random_search_LR.best_params_
    print('Best parameters from RandomizedSearchCV:', best_params)
    
    # Use the best parameters for cross-validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    
    all_predictions = []
    all_y_tests = []
    
    for train_index, test_index in kf.split(X_padded):
        X_train_LR, X_test_LR = X_padded[train_index], X_padded[test_index]
        y_train_LR, y_test_LR = y_label.iloc[train_index], y_label.iloc[test_index]
        
        X_train_LR = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_LR]
        X_test_LR = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_LR]
        
        tfidf_vec = TfidfVectorizer(max_df=0.7)
        X_train_LR_vec = tfidf_vec.fit_transform(X_train_LR)
        X_test_LR_vec = tfidf_vec.transform(X_test_LR)
        
        model = LogisticRegression(solver='lbfgs', max_iter=1000, verbose=True, **best_params)
        model.fit(X_train_LR_vec, y_train_LR.values.ravel())
        
        predictions = model.predict(X_test_LR_vec)
        
        all_predictions.extend(predictions)
        all_y_tests.extend(y_test_LR.values)
    
    print(confusion_matrix(all_y_tests, all_predictions))
    print(classification_report(all_y_tests, all_predictions))

In [5]:
# LogisticRegression

print("Logistic Regression Training with Cross-Validation")
print("Toxic comments:")
LR_Training_CV(y[["toxic"]], X_padded, tokenizer)
print("Severe toxic comments:")
LR_Training_CV(y[["severe_toxic"]], X_padded, tokenizer)
print("Obscene comments:")
LR_Training_CV(y[["obscene"]], X_padded, tokenizer)
print("Threat comments:")
LR_Training_CV(y[["threat"]], X_padded, tokenizer)
print("Insult comments:")
LR_Training_CV(y[["insult"]], X_padded, tokenizer)
print("Identity hate comments:")
LR_Training_CV(y[["identity_hate"]], X_padded, tokenizer)

Logistic Regression Training with Cross-Validation
Toxic comments:
Best parameters from RandomizedSearchCV: {'C': 7.3299394181140505, 'penalty': 'l2'}
[[142621   1656]
 [  4909  10385]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98    144277
           1       0.86      0.68      0.76     15294

    accuracy                           0.96    159571
   macro avg       0.91      0.83      0.87    159571
weighted avg       0.96      0.96      0.96    159571

Severe toxic comments:
Best parameters from RandomizedSearchCV: {'C': 9.51714306409916, 'penalty': 'l2'}
[[157524    452]
 [  1114    481]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    157976
           1       0.52      0.30      0.38      1595

    accuracy                           0.99    159571
   macro avg       0.75      0.65      0.69    159571
weighted avg       0.99      0.99      0.99    159571

Obscene comments:
Be

In [6]:
# NB Preparation

param_distributions = {
    'alpha': uniform(loc=0, scale=1),
    'fit_prior': [True, False]
}

random_search_NB = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=param_distributions,
    n_iter=4,
    cv=3,
    random_state=42,
    scoring='f1'
)


def NB_Training_CV(y_label, X_padded, tokenizer, k=5):
    # Perform parameter search on the entire dataset
    X_text = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_padded]

    tfidf_vec = TfidfVectorizer(max_df=0.7)
    X_vec = tfidf_vec.fit_transform(X_text)

    random_search_NB.fit(X_vec, y_label.values.ravel())
    best_params = random_search_NB.best_params_
    print('Best parameters from RandomizedSearchCV:', best_params)

    # Use the best parameters for cross-validation
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    all_predictions = []
    all_y_tests = []

    for train_index, test_index in kf.split(X_padded):
        X_train_NB, X_test_NB = X_padded[train_index], X_padded[test_index]
        y_train_NB, y_test_NB = y_label.iloc[train_index], y_label.iloc[test_index]

        X_train_NB = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_train_NB]
        X_test_NB = [' '.join(tokenizer.sequences_to_texts([comment])[0].split()) for comment in X_test_NB]

        tfidf_vec = TfidfVectorizer(max_df=0.7)
        X_train_NB_vec = tfidf_vec.fit_transform(X_train_NB)
        X_test_NB_vec = tfidf_vec.transform(X_test_NB)

        model = MultinomialNB(**best_params)
        model.fit(X_train_NB_vec, y_train_NB.values.ravel())

        predictions = model.predict(X_test_NB_vec)

        all_predictions.extend(predictions)
        all_y_tests.extend(y_test_NB.values)

    print(confusion_matrix(all_y_tests, all_predictions))
    print(classification_report(all_y_tests, all_predictions))


In [7]:
# Naive Bayes

print("Naive Bayes Training with Cross-Validation")
print("Toxic comments:")
NB_Training_CV(y[["toxic"]], X_padded, tokenizer)
print("Severe toxic comments:")
NB_Training_CV(y[["severe_toxic"]], X_padded, tokenizer)
print("Obscene comments:")
NB_Training_CV(y[["obscene"]], X_padded, tokenizer)
print("Threat comments:")
NB_Training_CV(y[["threat"]], X_padded, tokenizer)
print("Insult comments:")
NB_Training_CV(y[["insult"]], X_padded, tokenizer)
print("Identity hate comments:")
NB_Training_CV(y[["identity_hate"]], X_padded, tokenizer)

Naive Bayes Training with Cross-Validation
Toxic comments:
Best parameters from RandomizedSearchCV: {'alpha': 0.3745401188473625, 'fit_prior': True}
[[143447    830]
 [  7270   8024]]
              precision    recall  f1-score   support

           0       0.95      0.99      0.97    144277
           1       0.91      0.52      0.66     15294

    accuracy                           0.95    159571
   macro avg       0.93      0.76      0.82    159571
weighted avg       0.95      0.95      0.94    159571

Severe toxic comments:
Best parameters from RandomizedSearchCV: {'alpha': 0.1834347898661638, 'fit_prior': False}
[[146137  11839]
 [   122   1473]]
              precision    recall  f1-score   support

           0       1.00      0.93      0.96    157976
           1       0.11      0.92      0.20      1595

    accuracy                           0.93    159571
   macro avg       0.55      0.92      0.58    159571
weighted avg       0.99      0.93      0.95    159571

Obscene comme

In [8]:
import tensorflow as tf

def f1_score(y_true, y_pred):
    y_pred = tf.round(tf.clip_by_value(y_pred, 0, 1))
    true_positives = tf.reduce_sum(y_true * y_pred)
    false_positives = tf.reduce_sum((1 - y_true) * y_pred)
    false_negatives = tf.reduce_sum(y_true * (1 - y_pred))

    precision = true_positives / (true_positives + false_positives + tf.keras.backend.epsilon())
    recall = true_positives / (true_positives + false_negatives + tf.keras.backend.epsilon())
    
    f1_score = 2 * (precision * recall) / (precision + recall + tf.keras.backend.epsilon())

    return f1_score

In [22]:
# Define the hyperparameter space
space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'filters': hp.choice('filters', [64, 128, 256]),
    'kernel_size': hp.choice('kernel_size', [3, 5, 7]),
    'dense_units': hp.choice('dense_units', [64, 128, 256]),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.5),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'epochs': hp.choice('epochs', [3]),
}

def pad_sequences_custom(sequences, max_len):
    return np.array([np.pad(seq, (0, max_len - len(seq)), mode='constant')[:max_len] for seq in sequences])

def objective(params):
    # Cross-validation setup
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    val_scores = []
    
    # Define the CNN model
    inputs = Input(shape=(params['max_len'],))
    embedding = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    conv1 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(embedding)
    pool1 = MaxPooling1D(pool_size=2)(conv1)
    conv2 = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], activation='relu')(pool1)
    pool2 = GlobalMaxPooling1D()(conv2)
    dense1 = Dense(params['dense_units'], activation='relu')(pool2)
    dropout = Dropout(params['dropout_rate'])(dense1)
    outputs = Dense(6, activation='sigmoid')(dropout)
    model = Model(inputs=inputs, outputs=outputs)
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    for train_index, val_index in kf.split(X_train):
        print(train_index)
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Adjust the input data to match max_len
        X_train_fold = pad_sequences_custom(X_train_fold, params['max_len'])
        X_val_fold = pad_sequences_custom(X_val_fold, params['max_len'])
        
        # Define callbacks
        checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

        # Train the model
        history = model.fit(X_train_fold, y_train_fold, epochs=params['epochs'], batch_size=params['batch_size'], validation_data=(X_val_fold, y_val_fold), callbacks=[checkpoint, early_stop])

        # Evaluate the model
        val_preds = model.predict(X_val_fold)
        val_preds_binary = (val_preds > 0.5).astype(int)
        f1 = f1_score(y_val_fold, val_preds_binary, average='macro')
        val_scores.append(f1)

    # Calculate the average F1 score over all folds
    avg_f1 = np.mean(val_scores)
    
    # Return the negative average F1 score to minimize
    return {'loss': -avg_f1, 'status': STATUS_OK}

# Initialize Trials object to keep track of results
trials = Trials()

# Run hyperparameter optimization
best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=2, trials=trials)

# Print the best hyperparameters
print('Best hyperparameters:', best)

[     1      2      4 ... 127652 127653 127655]      
Epoch 1/3                                            

[1m  1/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m29:57[0m 3s/step - accuracy: 0.5781 - loss: 0.6951
[1m  2/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:05[0m 280ms/step - accuracy: 0.6367 - loss: 0.6794
[1m  3/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:00[0m 273ms/step - accuracy: 0.6354 - loss: 0.6622
[1m  4/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:05[0m 280ms/step - accuracy: 0.6245 - loss: 0.6387
[1m  5/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:08[0m 286ms/step - accuracy: 0.6146 - loss: 0.6120
[1m  6/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:04[0m 280ms/step - accuracy: 0.6109 - loss: 0.5842
[1m  7/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:04[0m 281ms/step - accuracy: 0.6136 - loss: 0.5593
[1m  8/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:03[0m 280ms/step - accuracy: 0.6195 - loss: 0.5393
[1m  9/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:02[0m 27




Epoch 1/3                                                                         

[1m  1/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m26:17[0m 2s/step - accuracy: 0.1328 - loss: 0.6983
[1m  2/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 98ms/step - accuracy: 0.1445 - loss: 0.6901
[1m  3/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:05[0m 99ms/step - accuracy: 0.1545 - loss: 0.6827
[1m  4/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 98ms/step - accuracy: 0.1628 - loss: 0.6760
[1m  5/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:05[0m 99ms/step - accuracy: 0.1683 - loss: 0.6692
[1m  6/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 98ms/step - accuracy: 0.1737 - loss: 0.6615
[1m  7/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 98ms/step - accuracy: 0.1785 - loss: 0.6525
[1m  8/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:04[0m 97ms/step - accuracy: 0.1814 - loss: 0.6420
[1m  9/665[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:03[0m 97ms/step - accuracy: 0.1841 - los

In [None]:
# CNN
inputs = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = GlobalMaxPooling1D()(conv2)
dense1 = Dense(128, activation='relu')(pool2)
dropout = Dropout(0.5)(dense1)
outputs = Dense(6, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_score])
checkpoint = ModelCheckpoint('best_model_cnn.tf', monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_format='tf')
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

history_cnn = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[checkpoint, early_stop])

evaluation_cnn = model.evaluate(X_test, y_test)
test_accuracy_cnn = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_cnn)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# LSTM

inp = Input(shape = (max_len, ))
layer = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inp)
layer = Bidirectional(LSTM(50, return_sequences = True, recurrent_dropout = 0.15))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dropout(0.2)(layer)
layer = Dense(50, activation = 'relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(6, activation = 'sigmoid')(layer)
model = Model(inputs = inp, outputs = layer)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
#model.summary()

file_path = 'save_best_model_lstm.tf'
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, save_format='tf')
early_stop = EarlyStopping(monitor='val_loss', patience=1)

history_lstm = model.fit(X_train, y_train, batch_size = 512, epochs = 10, validation_split = 0.2, callbacks = [checkpoint, early_stop])

evaluation_lstm = model.evaluate(X_test, y_test)
test_accuracy_lstm = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_lstm)

In [14]:
# LSTM Parameter Optimization

space = {
    'max_len': hp.choice('max_len', [100, 200, 300]),
    'embed_dim': hp.choice('embed_dim', [64, 128, 256]),
    'lstm_units': hp.choice('lstm_units', [32, 64, 128]),
    'recurrent_dropout': hp.uniform('recurrent_dropout', 0.1, 0.3),
    'dense_units': hp.choice('dense_units', [32, 64, 128]),
    'dropout_rate': hp.uniform('dropout_rate', 0.1, 0.4),
    'batch_size': hp.choice('batch_size', [64, 128, 256]),
    'epochs': hp.choice('epochs', [10, 15, 20]),
}

def objective(params):
    inputs = Input(shape=(max_len,))
    layer = Embedding(input_dim=max_words, output_dim=params['embed_dim'], input_length=params['max_len'])(inputs)
    layer = Bidirectional(LSTM(params['lstm_units'], return_sequences=True, recurrent_dropout=params['recurrent_dropout']))(layer)
    layer = GlobalMaxPool1D()(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(params['dense_units'], activation='relu')(layer)
    layer = Dropout(params['dropout_rate'])(layer)
    layer = Dense(6, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_score])

    file_path = 'save_best_model_lstm.keras'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, save_format='tf')
    early_stop = EarlyStopping(monitor='val_loss', patience=1)

    history = model.fit(X_train, y_train, batch_size=params['batch_size'], epochs=params['epochs'], validation_split=0.2, callbacks=[checkpoint, early_stop])

    #val_loss, val_acc = model.evaluate(X_test, y_test)
    loss, accuracy, f1 = model.evaluate(X_test, y_test, verbose=0)

    return {'loss': -f1, 'status': STATUS_OK}

trials = Trials()

best = fmin(objective, space, algo=tpe.rand.suggest, max_evals=3, trials=trials)

print('Best hyperparameters:', best)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

job exception: ModelCheckpoint.__init__() got an unexpected keyword argument 'save_format'



  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]


TypeError: ModelCheckpoint.__init__() got an unexpected keyword argument 'save_format'