## Подключение диска и установка keras-tuner

In [None]:
# Mount Google Drive at /content/drive (Colab-only module).
# Later cells read the dataset via the relative path 'drive/MyDrive/...';
# NOTE(review): that presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install -U keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


# Инициализация

In [None]:
%tensorflow_version 2.x
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPooling1D, Dropout, LSTM, Bidirectional, SpatialDropout1D, Flatten, Conv1D, GlobalMaxPooling1D, Dropout, LSTM, GRU, SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from kerastuner.tuners import BayesianOptimization
from tensorflow import keras
from keras_tuner import Objective
import pandas as pd

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# Maximum number of words (passed to the Tokenizer and used as the Embedding input_dim)
num_words = 10_000
# Maximum comment length in tokens (sequences are padded/truncated to this length)
max_comment_len = 50
# Number of classes (one output unit per label column)
num_classes = 6

# Загрузка тренировочных данных

In [None]:
# Load the training set from the dataset folder on the mounted Drive.
train = pd.read_csv('drive/MyDrive/toxicCommentsDataSet/toxicCommentsTrain.csv')

In [None]:
# Extract the comment texts
comments = train['comment_text']

In [None]:
# Training targets: the six label columns, one per model output unit.
# NOTE(review): presumably each column holds 0/1 flags — confirm against the CSV.
y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Токенизация

In [None]:
# Keras tokenizer whose vocabulary is capped through num_words.
tokenizer = Tokenizer(num_words=num_words)

In [None]:
# Fit the tokenizer on the training comments
tokenizer.fit_on_texts(comments)

In [None]:
# Convert every training comment into a sequence of integer token indices.
train_sequences = tokenizer.texts_to_sequences(comments)

In [None]:
# Pad or truncate each sequence to max_comment_len tokens so the model input is rectangular.
x_train = pad_sequences(train_sequences, maxlen=max_comment_len)

# Оптимизация гиперпараметров

In [None]:
def build_model(hp):
    """Build and compile a SimpleRNN multi-label classifier for one tuner trial.

    Hyperparameters are requested from ``hp`` in a fixed order: activation,
    optimizer, layers_rnn_num, neurons_num, embedding_num,
    embeddings_regularizer, kernel_regularizer.

    Architecture: Embedding -> ``layers_rnn_num`` SimpleRNN layers that
    return full sequences -> one final SimpleRNN -> Dense(num_classes)
    with sigmoid activation.

    Args:
        hp: Hyperparameter container supplied by the tuner; only its
            ``Choice`` and ``Int`` methods are used here.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, with an
        AUC metric named "auc" and accuracy as metrics).
    """
    def l2_from_choice(hp_name):
        # Registers a Choice over the same three L2 strengths under hp_name.
        strength = hp.Choice(hp_name, values=[1e-2, 1e-3, 1e-4], default=1e-3)
        return keras.regularizers.l2(strength)

    # Search space; the registration order is kept stable on purpose.
    rnn_activation = hp.Choice('activation', values=['sigmoid', 'tanh', 'relu', 'elu', 'selu'])
    optimizer_name = hp.Choice('optimizer', values=['SGD', 'adam', 'adagrad', 'adadelta', 'rmsprop'])
    stacked_rnn_count = hp.Int('layers_rnn_num', 0, 2)
    rnn_units = hp.Int('neurons_num', min_value=4, max_value=50, step=8)
    embedding_dim = hp.Int('embedding_num', min_value=4, max_value=128, step=16)
    embedding_l2 = l2_from_choice('embeddings_regularizer')
    kernel_l2 = l2_from_choice('kernel_regularizer')

    # Settings shared by every recurrent layer in the stack.
    rnn_kwargs = dict(
        units=rnn_units,
        activation=rnn_activation,
        kernel_regularizer=kernel_l2,
    )

    model = Sequential()
    model.add(
        Embedding(
            input_dim=num_words,
            output_dim=embedding_dim,
            input_length=max_comment_len,
            embeddings_regularizer=embedding_l2,
        )
    )
    # Intermediate recurrent layers must emit the whole sequence so the next
    # recurrent layer can consume it.
    for _ in range(stacked_rnn_count):
        model.add(SimpleRNN(return_sequences=True, **rnn_kwargs))
    # The last recurrent layer emits only its final state for the classifier head.
    model.add(SimpleRNN(**rnn_kwargs))
    model.add(Dense(num_classes, activation='sigmoid'))

    model.compile(
        optimizer=optimizer_name,
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC(name="auc"), "accuracy"],
    )

    return model

In [None]:
# Bayesian-optimisation tuner over the search space declared in build_model.
# The objective 'val_auc' is the validation value of the metric compiled with
# name="auc" and is maximised; at most 50 trials are run and the trial
# results are written under the 'rnn_toxic_dir' directory.
tuner = BayesianOptimization(
    build_model,
    objective=Objective('val_auc', 'max'),
    max_trials=50,
    directory='rnn_toxic_dir'
)

In [None]:
# Print the hyperparameters registered by build_model and their ranges.
tuner.search_space_summary()

Search space summary
Default search space size: 7
activation (Choice)
{'default': 'sigmoid', 'conditions': [], 'values': ['sigmoid', 'tanh', 'relu', 'elu', 'selu'], 'ordered': False}
optimizer (Choice)
{'default': 'SGD', 'conditions': [], 'values': ['SGD', 'adam', 'adagrad', 'adadelta', 'rmsprop'], 'ordered': False}
layers_rnn_num (Int)
{'default': None, 'conditions': [], 'min_value': 0, 'max_value': 2, 'step': 1, 'sampling': 'linear'}
neurons_num (Int)
{'default': None, 'conditions': [], 'min_value': 4, 'max_value': 50, 'step': 8, 'sampling': 'linear'}
embedding_num (Int)
{'default': None, 'conditions': [], 'min_value': 4, 'max_value': 128, 'step': 16, 'sampling': 'linear'}
embeddings_regularizer (Choice)
{'default': 0.001, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}
kernel_regularizer (Choice)
{'default': 0.001, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [28]:
# Run the hyperparameter search: every trial trains for 5 epochs with a batch
# size of 1024, holding out 20% of the training data for validation.
tuner.search(x_train, y_train, batch_size=1024, epochs=5, validation_split=0.2)

Trial 50 Complete [00h 01m 42s]
val_auc: 0.9589354991912842

Best val_auc So Far: 0.9631543159484863
Total elapsed time: 01h 18m 30s


In [29]:
# Print the best trials (up to 50) with their hyperparameters and objective score.
tuner.results_summary(num_trials=50)

Results summary
Results in rnn_toxic_dir/untitled_project
Showing 50 best trials
Objective(name="val_auc", direction="max")

Trial 32 summary
Hyperparameters:
activation: relu
optimizer: adam
layers_rnn_num: 2
neurons_num: 36
embedding_num: 68
embeddings_regularizer: 0.001
kernel_regularizer: 0.001
Score: 0.9631543159484863

Trial 30 summary
Hyperparameters:
activation: relu
optimizer: adam
layers_rnn_num: 2
neurons_num: 44
embedding_num: 68
embeddings_regularizer: 0.001
kernel_regularizer: 0.001
Score: 0.9627551436424255

Trial 11 summary
Hyperparameters:
activation: elu
optimizer: adam
layers_rnn_num: 1
neurons_num: 28
embedding_num: 52
embeddings_regularizer: 0.001
kernel_regularizer: 0.0001
Score: 0.9623773097991943

Trial 44 summary
Hyperparameters:
activation: relu
optimizer: rmsprop
layers_rnn_num: 1
neurons_num: 28
embedding_num: 20
embeddings_regularizer: 0.01
kernel_regularizer: 0.0001
Score: 0.9591561555862427

Trial 49 summary
Hyperparameters:
activation: relu
optimizer: rm

# Загрузка тестовых данных

In [30]:
# Load the test comments from the dataset folder on the mounted Drive.
test_comments = pd.read_csv('drive/MyDrive/toxicCommentsDataSet/toxicCommentsTest.csv')

In [31]:
# Load the test labels, which are stored in a separate CSV file.
test_labels = pd.read_csv('drive/MyDrive/toxicCommentsDataSet/toxicCommentsTestLabels.csv')

In [32]:
# Join comments and labels on the shared 'id' column.
test_full = pd.merge(test_comments, test_labels, on='id')

In [33]:
# Keep only the rows whose 'toxic' value is not -1.
# NOTE(review): -1 presumably marks test rows without a usable label — confirm.
test = test_full[test_full['toxic'] != -1]

# Токенизация тестовых данных

In [34]:
# Encode the test comments with the tokenizer that was fitted on the training comments.
test_sequences = tokenizer.texts_to_sequences(test['comment_text'])

In [35]:
# Test targets: the same six label columns, in the same order, as y_train.
y_test = test[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [36]:
# Pad or truncate the test sequences to the same length used for training.
x_test = pad_sequences(test_sequences, maxlen=max_comment_len)

# Проверка лучшей модели

In [37]:
# Retrieve the best model found by the search. Despite its singular name,
# `model` holds what get_best_models returns for num_models=1 and is
# indexed with [0] by the cells that use it.
model = tuner.get_best_models(num_models=1)

In [38]:
# Evaluate the best model on the test set. The returned values follow the
# compile() configuration in build_model: loss, then "auc", then "accuracy".
model[0].evaluate(x_test, y_test)



[0.1153348758816719, 0.9521059989929199, 0.9976085424423218]

# Сохранение модели

In [None]:
# Persist the best model to the mounted Drive as an .h5 file.
model[0].save('drive/MyDrive/toxicCommentsDataSet/toxicCommentRNNModel.h5')