In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from nltk.probability import FreqDist
import gensim
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, BatchNormalization
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spvag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from my_tool import txt_preprocessing 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spvag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df = pd.read_excel("Отзывы.xlsx")

In [9]:
df.Rating.value_counts()

5    14585
1     2276
4     2138
3      911
2      748
Name: Rating, dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20658 entries, 0 to 20657
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20658 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20658 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [11]:
df = df[df.Content.notna()]

## Модель без предобученных весов

In [12]:
df['preprocess_Content'] = df['Content'].apply(lambda x: txt_preprocessing.ru_preprocessing(x, False))

In [13]:
df.head(3)

Unnamed: 0,Rating,Content,Date,preprocess_Content
0,5,It just works!,2017-08-14,
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложение минус хотеть слишко...
2,5,Отлично все,2017-08-14,отлично весь


In [14]:
max_words = 10000

In [15]:
train_corpus = " ".join(df["preprocess_Content"])
tokens = word_tokenize(train_corpus)
tokens_filtered = [word for word in tokens if word.isalnum()]
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]


In [16]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
vocabulary

{'приложение': 1,
 'весь': 2,
 'удобный': 3,
 'очень': 4,
 'работать': 5,
 'хороший': 6,
 'спасибо': 7,
 'отличный': 8,
 'это': 9,
 'нравиться': 10,
 'телефон': 11,
 'отлично': 12,
 'супер': 13,
 'мочь': 14,
 'обновление': 15,
 'быстро': 16,
 'удобно': 17,
 'просто': 18,
 'пароль': 19,
 'банк': 20,
 'антивирус': 21,
 'пользоваться': 22,
 'сбербанк': 23,
 'устраивать': 24,
 'вход': 25,
 'пока': 26,
 'карта': 27,
 'прошивка': 28,
 'проблема': 29,
 'нужный': 30,
 'свой': 31,
 'писать': 32,
 'счет': 33,
 'программа': 34,
 'перевод': 35,
 'разработчик': 36,
 'сделать': 37,
 'время': 38,
 'ошибка': 39,
 'деньга': 40,
 'приходиться': 41,
 'код': 42,
 'норма': 43,
 'вводить': 44,
 'платеж': 45,
 'стать': 46,
 'почему': 47,
 'постоянно': 48,
 'долго': 49,
 'понятный': 50,
 'довольный': 51,
 'смс': 52,
 'рут': 53,
 'право': 54,
 'ваш': 55,
 'ок': 56,
 'который': 57,
 'функция': 58,
 'нормально': 59,
 'шаблон': 60,
 'зайти': 61,
 'вылетать': 62,
 'последний': 63,
 'возможность': 64,
 'установить'

In [17]:
def text_to_sequence(vocabulary, text, maxlen = 40):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [18]:
df['train'] = df["preprocess_Content"].apply(lambda x:text_to_sequence(vocabulary, x))

In [19]:
print(df.train[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 3649, 1, 129, 167, 359, 79, 78, 1353, 233, 11, 41, 22, 227, 453]


In [20]:
df['Rating'] = df['Rating'] - 1

In [21]:
df.Rating.value_counts()

4    14584
0     2276
3     2137
2      911
1      748
Name: Rating, dtype: int64

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df.train, df.Rating, test_size=0.25, random_state=42)
X_train = np.array([ txt for txt in X_train])

In [23]:
num_classes = 5
y_train = keras.utils.to_categorical(y_train, num_classes=5)

In [24]:
max_len = 40
epochs = 100
batch_size = 512
print_batch_n = 100

In [36]:
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len))
model_cnn.add(Conv1D(300, 3))
model_cnn.add(Activation("relu"))
model_cnn.add(GlobalMaxPool1D())
model_cnn.add(Dense(10))
model_cnn.add(Activation("relu"))
model_cnn.add(Dense(num_classes))
model_cnn.add(Activation('softmax'))

In [37]:
opt = keras.optimizers.Adam(learning_rate=0.001)
model_cnn.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [38]:
callback = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001, verbose=1)

In [39]:
%%time
history = model_cnn.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping
Wall time: 1min 32s


In [49]:
X_test = np.array([ txt for txt in X_test])
y_test = keras.utils.to_categorical(y_test, num_classes=5)

In [50]:
model_cnn.evaluate(X_test, y_test)



[0.6810378432273865, 0.7703330516815186]

## Модель с предобученными весами

In [51]:
df['preprocess_pos_Content'] = df['Content'].apply(lambda x: txt_preprocessing.ru_preprocessing(x, True))

In [52]:
df['preprocess_pos_Content'][1]

'целое_NOUN удобноной_ADJ приложение_NOUN минус_NOUN хотеть_VERB слишком_ADV большой_ADJ доступ_NOUN персональный_ADJ данные_NOUN телефон_NOUN приходиться_VERB пользоваться_VERB ограничить_VERB режим_NOUN'

In [53]:
model = gensim.models.KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [54]:
matrix_weights = model.vectors
matrix_weights.shape

(248978, 300)

In [55]:
vocab = {word: i for i, word in enumerate(model.index_to_key)}

In [56]:
def text_to_sequence_pos(vocabulary, text, maxlen = 40):
    result = [] 
    for word in text.split():      
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [57]:
df['train_pos'] = df["preprocess_pos_Content"].apply(lambda x:text_to_sequence_pos(vocab, x))

In [58]:
print(df.train_pos[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1540, 2379, 14612, 168, 922, 34, 2302, 3161, 152, 1724, 355, 864, 21017, 1000]


In [59]:
model.index_to_key[1540]

'целое_NOUN'

In [60]:
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(df.train_pos, df.Rating, test_size=0.25, random_state=42)
X_train_pos = np.array([ txt for txt in X_train_pos])

In [61]:
y_train_pos = keras.utils.to_categorical(y_train_pos, num_classes=5)

In [62]:
max_words_pos = matrix_weights.shape[0]

In [67]:
model_cnn_pos = Sequential()
model_cnn_pos.add(Embedding(input_dim=max_words_pos, output_dim=300, input_length=max_len, trainable=False))
model_cnn_pos.add(Conv1D(300, 3))
model_cnn_pos.add(Activation("relu"))
model_cnn_pos.add(GlobalMaxPool1D())
model_cnn_pos.add(Dense(10))
model_cnn_pos.add(Activation("relu"))
model_cnn_pos.add(Dense(num_classes))
model_cnn_pos.add(Activation('softmax'))

In [68]:
opt = keras.optimizers.Adam(learning_rate=0.001)
model_cnn_pos.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

In [69]:
matrix = []
matrix.append(matrix_weights)
model_cnn_pos.layers[0].set_weights(matrix)

In [70]:
%%time
history_pos = model_cnn_pos.fit(X_train_pos, y_train_pos,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping
Wall time: 32.1 s


In [71]:
X_test_pos = np.array([ txt for txt in X_test_pos])
y_test_pos = keras.utils.to_categorical(y_test_pos, num_classes=5)

In [72]:
model_cnn_pos.evaluate(X_test_pos, y_test_pos)



[0.6470345854759216, 0.777110755443573]

In [84]:
res = pd.DataFrame({'embeddings': ['не предобученные', 'предобученные'], 
                    'Time': ['1 min 32 sec', '32 sec'], 'accuracy':[0.770333, 0.777110], 
                    'eposhs':[8,7]})
res

Unnamed: 0,embeddings,Time,accuracy,eposhs
0,не предобученные,1 min 32 sec,0.770333,8
1,предобученные,32 sec,0.77711,7


### Выводы:

В ходе работы рассмотрены две модели с предобученными весами и без них.<br> 
Предобученный веса были получены https://rusvectores.org/ru/<br> 
Модель «ruwikiruscorpora_upos_skipgram_300_2_2019»<br> 
Ембеддинги предобученной модели не обучались.<br> 

При всех одинаковых прочих параметрах модель с предобученными весами показала лучший результат. Что показывает нам пользу использования предобученных моделей. 
