## Урок 7. Сверточные нейронные сети для анализа текста


### Задание
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации

2. Рассмотреть 2-а варианта сеточек

2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/

2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)

Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше



In [2]:
!pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: stop_words
  Building wheel for stop_words (setup.py) ... [?25ldone
[?25h  Created wheel for stop_words: filename=stop_words-2018.7.23-py3-none-any.whl size=32911 sha256=fb954ab3a844fd544862fb959a995a4412d691b92db126f00b7b62fb28e23976
  Stored in directory: /Users/Olga/Library/Caches/pip/wheels/da/d8/66/395317506a23a9d1d7de433ad6a7d9e6e16aab48cf028a0f60
Successfully built stop_words
Installing collected packages: stop_words
Successfully installed stop_words-2018.7.23


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.probability import FreqDist

import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping  

from sklearn.preprocessing import LabelEncoder
import gensim

[nltk_data] Downloading package punkt to /Users/Olga/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
import keras.backend as K
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [5]:
df = pd.read_excel('отзывы за лето.xls')
df.head(10)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14
5,5,Всё удобно норм 👍👍👍,2017-08-14
6,5,Очень удобное приложение.,2017-08-14
7,5,Все устраивает,2017-08-14
8,5,У меня работает все четко. В отличии от банком...,2017-08-14
9,5,Очень все хорошо👍,2017-08-14


In [6]:
df['Rating'].value_counts()

5    14586
1     2276
4     2138
3      911
2      748
Name: Rating, dtype: int64

In [7]:
df_train, df_test = train_test_split(df, test_size=0.33, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train.shape, df_test.shape

((13841, 3), (6818, 3))

In [8]:
df_train

Unnamed: 0,Rating,Content,Date
0,5,Наконец-то исправили эту чушь с неоргинальной ...,2017-08-09
1,5,Удобно в использовании,2017-07-27
2,5,Отлично,2017-08-08
3,5,Класс,2017-07-25
4,5,Удобно,2017-07-08
...,...,...,...
13836,4,Все нравится,2017-07-29
13837,5,Очень смешно программа пугается рута :),2017-07-28
13838,1,Не могу скачать ошибка номер 24,2017-08-06
13839,5,Сбербанк всегда рядом,2017-08-12


In [9]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_test['Content'] = df_test['Content'].apply(preprocess_text)

In [10]:
df_train.head()

Unnamed: 0,Rating,Content,Date
0,5,наконецтый исправить чушь снеоргинальный проши...,2017-08-09
1,5,удобно использование,2017-07-27
2,5,отлично,2017-08-08
3,5,класс,2017-07-25
4,5,удобно,2017-07-08


In [None]:
train_corpus = " ".join(df_train["Content"])
train_corpus = train_corpus.lower()
train_corpus

In [None]:
tokens = word_tokenize(train_corpus)
tokens

In [13]:
tokens_filtered = [word for word in tokens if word.isalnum()]

dist = FreqDist(tokens_filtered)
dist

FreqDist({'приложение': 4123, 'удобно': 2201, 'работать': 1288, 'удобный': 1182, 'отлично': 860, 'нравиться': 763, 'хороший': 681, 'отличный': 677, 'телефон': 627, 'супер': 540, ...})

In [14]:
max_words = 200

tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'телефон',
 'супер']

In [None]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
vocabulary

In [16]:
max_len = 40

def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [17]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["Content"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["Content"]], dtype=np.int32)

In [18]:
x_train

array([[  0,   0,   0, ...,   1,   2,  15],
       [  0,   0,   0, ...,   0,   2, 181],
       [  0,   0,   0, ...,   0,   0,   5],
       ...,
       [  0,   0,   0, ..., 164,  27,  84],
       [  0,   0,   0, ...,   0,   0,  20],
       [  0,   0,   0, ...,   0, 113,   5]], dtype=int32)

In [19]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(df_train['Rating']) 
test_enc_labels = le.transform(df_test['Rating'])
le.classes_

array([1, 2, 3, 4, 5])

In [20]:
train_enc_labels

array([4, 4, 4, ..., 0, 4, 4])

In [21]:
num_classes = 5
y_train = tf.keras.utils.to_categorical(train_enc_labels, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(test_enc_labels, num_classes=num_classes)
y_train

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [22]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len)) 

model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))


In [23]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[get_f1])

In [24]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  

epochs = 20
batch_size = 512

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

In [25]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test f1_score:', score[1])



Test loss: 1.1623296737670898
Test f1_score: 0.7151089310646057


In [None]:
#http://vectors.nlpl.eu/repository/
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('./180/model.bin', binary=True)  
len(word_vectors), len(word_vectors[1]), word_vectors[1]

In [None]:
word_vectors_matrix = [word_vectors[i][:128] for i in range(200)]
word_vectors_matrix

In [38]:
initializer = tf.keras.initializers.Constant(word_vectors_matrix)

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, embeddings_initializer =initializer, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [39]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[get_f1])

In [40]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  

epochs = 20
batch_size = 512

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

2022-07-23 17:43:58.427545: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-07-23 17:43:58.428237: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-07-23 17:43:58.435250: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


Epoch 1/20
 2/25 [=>............................] - ETA: 5s - loss: 1.5561 - get_f1: 0.5274 

2022-07-23 17:44:00.721624: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-07-23 17:44:00.721828: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-07-23 17:44:00.996098: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-07-23 17:44:01.009287: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-07-23 17:44:01.032499: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./logs/train/plugins/profile/2022_07_23_17_44_01

2022-07-23 17:44:01.036190: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./logs/train/plugins/profile/2022_07_23_17_44_01/MacBook-Air-Olga.local.trace.json.gz
2022-07-23 17:44:01.069938: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./logs/train/plugins/profile/2022_07_23_17_44_01

2022-07-23 17



In [41]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test f1_score:', score[1])



Test loss: 0.8247653841972351
Test f1_score: 0.7317059636116028
