Это вариация LSTM, с помощью которой были получены одни из лучших (~97%) результатов в задаче toxic comment classification. Сама архитектура несложная, но эмбеддинги устроены довольно замороченно (плюс, понятно, она воспроизведена не 1:1, т. к. у меня свои ad-hoc фичи).

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.3.0


In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
os.chdir('drive/My Drive/lab_1')

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from utils import train_dev_test, plot_train_acc, plot_train_loss, classifier_out

In [6]:
train = pd.read_csv('data/train_non_lemmatized.csv')
test = pd.read_csv('data/test_non_lemmatized.csv')
train_features = pd.read_csv('preproc_files/train_features_16.csv')
test_features = pd.read_csv('preproc_files/test_features_16.csv')

In [7]:
y = train['target']
X_f = normalize(train_features)
X_test_f = normalize(test_features)

In [8]:
# Vocabulary cap for the tokenizer: equals the number of unique tokens found in the training texts
dict_size = 352514
# Every text is padded/truncated to at most 900 tokens
max_words = 900
# Width of the combined embedding: 300 (fastText) + 200 (GloVe Twitter) = 500
embed_dim = 500

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
tokenizer = Tokenizer(num_words=dict_size, lower=True)

In [11]:
tokenizer.fit_on_texts(train['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 352514 unique tokens.


In [12]:
X = tokenizer.texts_to_sequences(train['text'].values)
X = pad_sequences(X, maxlen=max_words)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (48000, 900)


In [13]:
X_train_e, X_val_e, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state=42)

In [14]:
# Split the ad-hoc features with the same test_size and random_state as the
# token-sequence split, so that (presumably) both splits select the same rows.
# The label halves are discarded because y_train / y_val already exist.
X_train_f, X_val_f, _, _ = train_test_split(X_f, y, test_size = 0.1, random_state=42)

#### Pretrained block

Здесь в комбинации используются уже привычные эмбеддинги GloVe (обученные на твиттере, 200 измерений) и fastText (300 измерений): для каждого слова два вектора склеиваются в один 500-мерный. Сюда можно было бы ещё к каждому слову прикрутить какие-нибудь POS-фичи, или sentiment, или вежливость/невежливость (например, автор оригинального решения добавляет 501-ю позицию, чтобы фиксировать, написано слово капсом или нет), но к тому моменту я уже очень устал и не стал этого делать, так что у меня это учитывается просто в ad-hoc фичах как доля таких слов.

In [15]:
glove =  'embeddings/glove.twitter.27B.200d.txt'
fasttext = 'embeddings/wiki-news-300d-1M.vec'

In [16]:
def load_embed(file):
    """Load a text-format word-embedding file into a ``{word: vector}`` dict.

    Every parsed line is expected to look like ``word v1 v2 ... vN`` with
    single-space separators.  Trailing whitespace (newline, stray space) is
    stripped before splitting so it cannot end up as a bogus coefficient.

    Args:
        file: Path to the embedding file.  For the fastText file
            ``embeddings/wiki-news-300d-1M.vec`` the platform default
            encoding is used and lines of 100 characters or fewer are
            skipped (presumably to drop the short header line); any other
            file is decoded as Latin-1 and every line is parsed.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.  If a word
        occurs more than once, the last occurrence wins.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    is_fasttext = file == 'embeddings/wiki-news-300d-1M.vec'
    open_kwargs = {} if is_fasttext else {'encoding': 'latin'}

    embeddings_index = {}
    # The context manager guarantees the (multi-gigabyte) file is closed even
    # if a malformed line raises while parsing.
    with open(file, **open_kwargs) as handle:
        for line in handle:
            # The length filter is applied to the raw line, newline included.
            if is_fasttext and len(line) <= 100:
                continue
            word, coefs = get_coefs(*line.rstrip().split(" "))
            embeddings_index[word] = coefs

    return embeddings_index

In [17]:
embeddings_index_tw = load_embed(glove)

In [18]:
embeddings_index_ft = load_embed(fasttext)

In [19]:
word_index = tokenizer.word_index
# One row more than the vocabulary size — presumably because tokenizer indices
# start at 1, which leaves row 0 for the padding index (TODO confirm).
nb_words = min(dict_size, len(word_index))+1

# Rows start as zeros; the embedding loop further down fills them in.
embedding_matrix = np.zeros((nb_words, embed_dim))

Эмбеддинг слова «something» («что-то») в каждой из моделей: если слово не находится в fastText (даже в вариантах с заглавной буквы и капсом), вся его строка в матрице эмбеддингов заполняется вектором слова «something».

In [21]:
something_tw = embeddings_index_tw.get("something")
something_ft = embeddings_index_ft.get("something")

In [22]:
# Fallback row for out-of-vocabulary words: the fastText vector of "something"
# in the first 300 columns, followed by its GloVe vector in the last 200.
something = np.zeros(500)
something[:300] = something_ft
something[300:] = something_tw

In [23]:
def embed_word(embedding_matrix, i, word):
    """Write the combined fastText + GloVe vector of `word` into row `i`.

    Columns 0-299 receive the fastText vector and columns 300-499 the GloVe
    vector.  The row is left untouched when fastText does not know the word;
    the GloVe part is left untouched when only GloVe is missing it.  The
    matrix is modified in place and nothing is returned.
    """
    ft_vector = embeddings_index_ft.get(word)
    if ft_vector is None:
        return
    embedding_matrix[i, :300] = ft_vector

    tw_vector = embeddings_index_tw.get(word)
    if tw_vector is None:
        return
    embedding_matrix[i, 300:500] = tw_vector

In [24]:
# fastText drives the lookup: a word found there gets its fastText vector and,
# when available, its GloVe vector as well; a GloVe-only hit is never used.
# The tokenizer lower-cases the vocabulary, so a word missing from fastText is
# retried in Title Case and then UPPER CASE before falling back to `something`.

for word, i in word_index.items():

    if i >= dict_size:
        continue

    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix, i, word)
        continue

    # Tokens longer than 20 characters skip the case variants entirely.
    matched = None
    if len(word) <= 20:
        for candidate in (word.title(), word.upper()):
            if embeddings_index_ft.get(candidate) is not None:
                matched = candidate
                break

    if matched is None:
        embedding_matrix[i] = something
    else:
        embed_word(embedding_matrix, i, matched)

#### Model block

In [25]:
from tensorflow.keras import Model, regularizers, optimizers
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GRU, GlobalMaxPooling1D, GlobalAveragePooling1D,\
 Dense, BatchNormalization, Dropout, SpatialDropout1D, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [26]:
misc_size = X_train_f.shape[1]

In [27]:
def multimodal_lstm(max_words=max_words, misc_input_length=misc_size,
                    dict_size=nb_words, embed_dim=500,
                    dropout_rate=0.2, num_classes=3):
    """Build the two-input classifier: text sequence plus ad-hoc features.

    The text branch runs frozen pretrained embeddings through spatial
    dropout, a bidirectional LSTM and a bidirectional GRU.  The GRU sequence
    output is max- and average-pooled; both pooled vectors and the GRU's
    forward final hidden state are concatenated with the ad-hoc feature
    vector and fed to a softmax layer.

    Args:
        max_words: Length of the padded token sequences.
        misc_input_length: Number of ad-hoc features per sample.
        dict_size: Number of rows of the embedding matrix.
        embed_dim: Width of the embedding matrix; must match the module-level
            `embedding_matrix` that is used as the initial weights.
        dropout_rate: Currently unused; kept for interface compatibility
            (the dense head that would consume it is disabled).
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras `Model` taking `[token_ids, misc_features]`.
    """
    # Define inputs
    emb_input = Input(shape=(max_words,), name='post_body_input')
    # `shape` has to be a tuple: `(n)` is just the int n, `(n,)` is the 1-tuple.
    misc_input = Input(shape=(misc_input_length,), name='misc_features_input')

    # Embedding branch: pretrained vectors, not updated during training
    x_e = Embedding(input_dim=dict_size,
                    output_dim=embed_dim,
                    weights=[embedding_matrix],
                    input_length=max_words,  # X.shape[1]
                    trainable=False,
                    name='post_body_embedding')(emb_input)
    x_e = SpatialDropout1D(0.5)(x_e)

    x_e = Bidirectional(LSTM(units=40,
                             return_sequences=True))(x_e)

    # With return_state=True a Bidirectional GRU returns the sequence output
    # followed by the forward and the backward final hidden states (a GRU has
    # no cell state).  Only the forward state is used below; adding the
    # backward one would change the Dense input width and make previously
    # saved checkpoints unloadable.
    x_e, forward_h, _backward_h = Bidirectional(GRU(units=40,
                                                    return_sequences=True,
                                                    return_state=True))(x_e)

    maxpool = GlobalMaxPooling1D()(x_e)
    avgpool = GlobalAveragePooling1D()(x_e)

    # Text representation: max pool, average pool, forward hidden state
    text_repr = Concatenate()([maxpool, avgpool, forward_h])

    # Append the ad-hoc features to the three text vectors
    z = Concatenate()([text_repr, misc_input])
    z = Dense(num_classes, activation='softmax')(z)

    return Model(inputs=[emb_input, misc_input], outputs=z)

In [1]:
adam = optimizers.Adam(clipvalue=1)  # clipping gradient values at 1 improved the results (oops, re-ran this cell by reflex)

NameError: name 'optimizers' is not defined

In [29]:
lstm_model = multimodal_lstm()
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
lstm_model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
post_body_input (InputLayer)    [(None, 900)]        0                                            
__________________________________________________________________________________________________
post_body_embedding (Embedding) (None, 900, 500)     176257500   post_body_input[0][0]            
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 900, 500)     0           post_body_embedding[0][0]        
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 900, 80)      173120      spatial_dropout1d[0][0]          
_______________________________________________________________________________________

In [30]:
#epochs = 100
epochs = 30
batch_size = 512

In [31]:
# Overwrite checkpoints/best_lstm.h5 only when val_loss improves (save_best_only=True).
mc = ModelCheckpoint('checkpoints/best_lstm.h5', monitor='val_loss', mode='auto', save_best_only=True)
# Defined but not passed to fit() below: the callbacks list that includes it is commented out.
earlystop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
# Scale the learning rate by 0.4 after 2 epochs without val_accuracy improvement.
# NOTE(review): this monitors val_accuracy while the other two callbacks monitor val_loss — confirm this is intended.
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience = 2, verbose=1,factor=0.4, min_lr=0.0000001)

In [32]:
history = lstm_model.fit([X_train_e, X_train_f], y_train,
                         batch_size=batch_size,
                         validation_data=([X_val_e, X_val_f], y_val),
                         epochs=epochs,
                         callbacks=[mc, lr_reduction])
                         #callbacks=[mc, earlystop, lr_reduction])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0004000000189989805.
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00016000000759959222.
Epoch 17/30
Epoch 18/30
Epoch 00018: ReduceLROnPlateau reducing learning rate to 6.40000042039901e-05.
Epoch 19/30
Epoch 20/30
Epoch 00020: ReduceLROnPlateau reducing learning rate to 2.560000284574926e-05.
Epoch 21/30
Epoch 22/30
Epoch 00022: ReduceLROnPlateau reducing learning rate to 1.0240000847261399e-05.
Epoch 23/30
Epoch 24/30
Epoch 00024: ReduceLROnPlateau reducing learning rate to 4.09600033890456e-06.
Epoch 25/30
Epoch 26/30

KeyboardInterrupt: ignored

In [41]:
lstm_model.load_weights('checkpoints/best_lstm.h5')

In [34]:
X_test = tokenizer.texts_to_sequences(test['text'].values)
X_test = pad_sequences(X_test, maxlen=max_words)

In [35]:
predictions = lstm_model.predict([X_test, X_test_f])

In [None]:
predictions = [np.argmax(p) for p in predictions]

In [None]:
from utils import classifier_out

In [None]:
classifier_out(predictions, '20.fucking_retard_v2')