In [1]:
from pymystem3 import Mystem
import re
import numpy as np
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import pandas as pd
mystem = Mystem()

In [40]:
import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from gensim.models import KeyedVectors

In [58]:
#training params
batch_size = 256 
num_epochs = 8 

#model parameters
num_filters = 64 
embed_dim = 300 
weight_decay = 1e-4

In [2]:
df_full_data = pd.read_json("full_data.json")

In [3]:
with open("stop.txt") as f:
    stop = set([w.strip() for w in f.readlines()])

In [4]:
df_full_data.mark.value_counts()

1    25951
2     5649
3     2129
5     1901
4      489
Name: mark, dtype: int64

In [5]:
def lemmatized_text(s):
    return " ".join([w.lower() for w in mystem.lemmatize(s) if (re.match(r"[а-яА-ЯЁёЙй]{2,}", w))])

In [6]:
df_full_data["concat"] = df_full_data["Title"] + " " + df_full_data["text"]
#df_full_data["hour"] = df_full_data["time"].apply(lambda x: x.hour)
df_full_data["prep"] = df_full_data["concat"].apply(lemmatized_text)

In [8]:
df_full_data["len1"] = df_full_data["concat"].apply(len)
df_full_data["len2"] = df_full_data["prep"].apply(lambda x: len(x.split(" ")))

In [9]:
df_full_data["len2"].mean(), df_full_data["len2"].max()

(214.50164733242892, 4286)

In [15]:
w2v = KeyedVectors.load_word2vec_format("ft_native_300_ru_wiki_lenta_lemmatize.vec", binary=False)

In [20]:
Y = pd.get_dummies(df_full_data.mark)

In [23]:
X_test = df_full_data.prep.iloc[:500].values
X_train = df_full_data.prep.iloc[500:].values
Y_test = Y.iloc[:500].values
Y_train = Y.iloc[500:].values

In [26]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [29]:
word_seq_train = tokenizer.texts_to_sequences(X_train)
word_seq_test = tokenizer.texts_to_sequences(X_test)

In [30]:
word_index = tokenizer.word_index

In [32]:
max_seq_len = 400
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

In [57]:
nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, 300))
for word, i in word_index.items():
    if word in w2v:
        embedding_vector = w2v[word]
        embedding_matrix[i] = embedding_vector

In [60]:
#CNN architecture
model = Sequential()
model.add(Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(5, activation='softmax'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 400, 300)          14556600  
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 400, 64)           134464    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 200, 64)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 200, 64)           28736     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
__________

In [61]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=2, verbose=1)
callbacks_list = [early_stopping]

In [62]:
hist = model.fit(word_seq_train, Y_train, batch_size=batch_size, epochs=num_epochs,
                 callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=True)

Train on 32057 samples, validate on 3562 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [72]:
print(classification_report(Y_test.argmax(axis=1), model.predict_classes(word_seq_test)))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91       399
           1       0.00      0.00      0.00        52
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00         9
           4       0.75      0.69      0.72        26

    accuracy                           0.83       500
   macro avg       0.32      0.34      0.33       500
weighted avg       0.71      0.83      0.77       500



  _warn_prf(average, modifier, msg_start, len(result))
