# Тема “Сверточные нейронные сети для анализа текста”

Задания:
<ol>
<li><a href="#task_1">Учим conv сеть для классификации</a>  
<li><a href = "#task_2">Рассмотреть 2-а варианта сеточек</a>
<ol><li type="1"><a href = "#task_2.1">Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/</a>
    <li type="1"><a href="#task_2.2">Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)</a></ol>
<li ><a href="#task_3">Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше</a>

</ol>

## Импорт библиотек

In [1]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import tensorflow

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  

## Загрузка и предобработка данных
> ### Берем отызывы за лето (из архива с материалами или предыдущего занятия)

In [2]:
# !pip install xlrd

In [3]:
data_df = pd.read_excel('../data/отзывы за лето.xls')
print(f'Shape:\n{data_df.shape}\n'
      f'Describe:\n{data_df.describe().T}\n'
      f'Column name:\n{data_df.columns}\n')
data_df.head()

Shape:
(20659, 3)
Describe:
          count      mean       std  min  25%  50%  75%  max
Rating  20659.0  4.259015  1.348884  1.0  4.0  5.0  5.0  5.0
Column name:
Index(['Rating', 'Content', 'Date'], dtype='object')



Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [4]:
data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,20659.0,4.259015,1.348884,1.0,4.0,5.0,5.0,5.0


### Предобработка данных

In [5]:
#!pip install stop-words

In [6]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [7]:
sw_ru = set(get_stop_words("ru"))
sw_en = set(get_stop_words("en"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw_ru or sw_en]
    return " ".join(txt)


In [8]:
data_df['Content'] = data_df['Content'].apply(preprocess_text)
data_df.head(3)

Unnamed: 0,Rating,Content,Date
0,5,it just works,2017-08-14
1,4,в целое удобноной приложениеиз минус хотеть сл...,2017-08-14
2,5,отлично всё,2017-08-14


#### Разделение выборки

In [9]:
df_train, df_test = train_test_split(data_df, test_size=0.2, random_state=21,)
print(f'data_df shape:\t{data_df.shape}\n'
      f'train_dfshape:\t{df_train.shape}\n'
      f'test_df shape:\t{df_test.shape}\n') 

data_df shape:	(20659, 3)
train_dfshape:	(16527, 3)
test_df shape:	(4132, 3)



In [10]:
train_corpus = " ".join(df_train['Content'])
train_corpus = train_corpus.lower()

In [11]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /home/oleg_rev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
max_words = 200
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [13]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [14]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [15]:
tokens_filtered_top[:10]

['приложение', 'всё', 'и', 'очень', 'удобно', 'в', 'я', 'на', 'работать', 'с']

In [16]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [17]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [19]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train['Content']], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test['Content']], dtype=np.int32)


In [20]:
x_train.shape


(16527, 40)

In [21]:
x_test.shape

(4132, 40)

# Выполнение заданий

<p><a name="task_1"></a></p>

## 1. Учим conv сеть для классификации.

In [35]:
df_train['Rating'].max()
type(df_train["class"].unique()[0])

numpy.int64

In [37]:
df_train['Rating'].shape

(16527,)

In [42]:
df_train["Rating"].unique()

array([5, 4, 1, 2, 3])

In [44]:
num_classes=6
y_train = keras.utils.to_categorical(df_train['Rating'], num_classes)
y_test = keras.utils.to_categorical(df_test['Rating'], num_classes)

In [45]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

2021-09-26 00:48:39.804273: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-26 00:48:39.804943: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-26 00:48:39.808297: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [46]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [47]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

2021-09-26 00:48:44.592118: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-26 00:48:44.592191: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-09-26 00:48:44.592771: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-26 00:48:44.664291: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-26 00:48:44.682768: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2595000000 Hz


Epoch 1/20
 2/30 [=>............................] - ETA: 5s - loss: 1.7903 - accuracy: 0.1343     

2021-09-26 00:48:45.593447: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-26 00:48:45.593490: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


2021-09-26 00:48:45.766377: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-09-26 00:48:45.770635: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-26 00:48:45.775752: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_09_26_00_48_45
2021-09-26 00:48:45.778283: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./logs/train/plugins/profile/2021_09_26_00_48_45/localhost.localdomain.trace.json.gz
2021-09-26 00:48:45.792402: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_09_26_00_48_45
2021-09-26 00:48:45.793175: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for memory_profile.json.gz to ./logs/train/plugins/profile/2021_09_26_00_48_45/localhost.localdomain.memory_profile.json.gz
2021-

In [48]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6905776262283325
Test accuracy: 0.765488862991333


In [49]:
results = model.predict(x_test, batch_size=batch_size, verbose=1)



<p><a name="task_2"></a></p>

## 2. Рассмотреть 2-а варианта сеточек


<p><a name="task_2.1"></a></p>

### 2.1. Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/


<p><a name="task_2.2"></a></p>

### 2.2. Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)


<p><a name="task_3"></a></p>

## 3. Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше