# Тема “Сверточные нейронные сети для анализа текста”

Задания:
<ol>
<li><a href="#task_1">Учим conv сеть для классификации</a>  
<li><a href = "#task_2">Рассмотреть 2-а варианта сеточек</a>
<ol><li type="1"><a href = "#task_2.1">Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/</a>
    <li type="1"><a href="#task_2.2">Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)</a></ol>
<li ><a href="#task_3">Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше</a>

</ol>

## Импорт библиотек

In [1]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import tensorflow

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  

## Загрузка и предобработка данных
> ### Берем отызывы за лето (из архива с материалами или предыдущего занятия)

In [2]:
# !pip install xlrd

In [3]:
data_df = pd.read_excel('../data/отзывы за лето.xls')
print(f'Shape:\n{data_df.shape}\n'
      f'Describe:\n{data_df.describe().T}\n'
      f'Column name:\n{data_df.columns}\n')
data_df.head()

Shape:
(20659, 3)
Describe:
          count      mean       std  min  25%  50%  75%  max
Rating  20659.0  4.259015  1.348884  1.0  4.0  5.0  5.0  5.0
Column name:
Index(['Rating', 'Content', 'Date'], dtype='object')



Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [4]:
data_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,20659.0,4.259015,1.348884,1.0,4.0,5.0,5.0,5.0


### Меняем рейтинг [1:5] -> [0:4] (для перевода в .to_categorical)

In [5]:
data_df['Rating'].unique()

array([5, 4, 2, 3, 1])

In [6]:
data_df['Rating'] = data_df['Rating'].replace({idx+1: idx for idx in range(5)})
data_df['Rating'].unique()

array([4, 3, 1, 2, 0])

In [7]:
data_df

Unnamed: 0,Rating,Content,Date
0,4,It just works!,2017-08-14
1,3,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,4,Отлично все,2017-08-14
3,4,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,4,"Очень удобно, работает быстро.",2017-08-14
...,...,...,...
20654,0,"Ну и шляпа,с роот правами бесполезная прога,ра...",2017-06-01
20655,4,Ок,2017-06-01
20656,3,Доволен,2017-06-01
20657,0,"Песопаснасть, рут ни нужын",2017-06-01


In [8]:
print(f'Shape:\n{data_df.shape}\n'
      f'Describe:\n{data_df.describe().T}\n'
      f'Column name:\n{data_df.columns}\n')
data_df.head()

Shape:
(20659, 3)
Describe:
          count      mean       std  min  25%  50%  75%  max
Rating  20659.0  3.259015  1.348884  0.0  3.0  4.0  4.0  4.0
Column name:
Index(['Rating', 'Content', 'Date'], dtype='object')



Unnamed: 0,Rating,Content,Date
0,4,It just works!,2017-08-14
1,3,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,4,Отлично все,2017-08-14
3,4,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,4,"Очень удобно, работает быстро.",2017-08-14


### Предобработка данных

In [9]:
#!pip install stop-words

In [10]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [11]:
sw_ru = set(get_stop_words("ru"))
sw_en = set(get_stop_words("en"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw_ru or sw_en]
    return " ".join(txt)


In [12]:
data_df['Content'] = data_df['Content'].apply(preprocess_text)
data_df.head(3)

Unnamed: 0,Rating,Content,Date
0,4,it just works,2017-08-14
1,3,в целое удобноной приложениеиз минус хотеть сл...,2017-08-14
2,4,отлично всё,2017-08-14


#### Разделение выборки

In [13]:
df_train, df_test = train_test_split(data_df, test_size=0.2, random_state=21,)
print(f'data_df shape:\t{data_df.shape}\n'
      f'train_dfshape:\t{df_train.shape}\n'
      f'test_df shape:\t{df_test.shape}\n') 

data_df shape:	(20659, 3)
train_dfshape:	(16527, 3)
test_df shape:	(4132, 3)



In [14]:
train_corpus = " ".join(df_train['Content'])
train_corpus = train_corpus.lower()

In [15]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /home/oleg_rev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
max_words = 200
max_len = 40
num_classes = data_df["Rating"].nunique()

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [17]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [18]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [19]:
tokens_filtered_top[:21]

['приложение',
 'всё',
 'и',
 'очень',
 'удобно',
 'в',
 'я',
 'на',
 'работать',
 'с',
 'удобный',
 'что',
 'отлично',
 'спасибо',
 'хороший',
 'это',
 'нравиться',
 'отличный',
 'раз',
 'по',
 'хорошо']

In [20]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [21]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [22]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train['Content']], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test['Content']], dtype=np.int32)


In [23]:
x_train.shape


(16527, 40)

In [24]:
x_test.shape

(4132, 40)

# Выполнение заданий

<p><a name="task_1"></a></p>

## 1. Учим conv сеть для классификации.

In [25]:
y_train = keras.utils.to_categorical(df_train['Rating'], num_classes)
y_test = keras.utils.to_categorical(df_test['Rating'], num_classes)

<p><a name="task_2"></a></p>

## 2. Рассмотреть 2-а варианта сеточек


<p><a name="task_2.1"></a></p>

### 2.1. Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/


In [32]:
import gensim

In [33]:
model_path = '../data/RusVectores/ruscorpora_upos_cbow_300_20_2019/model.bin'

model_ru = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
#model = Sequential()
#model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
#model.add(Conv1D(128, 3), padding='same')
#model.add(Activation("relu"))
#model.add(GlobalMaxPool1D())
#model.add(Dense(10))
#model.add(Activation("relu"))
#model.add(Dense(num_classes))
#model.add(Activation('softmax'))

In [None]:
#model.compile(loss='categorical_crossentropy',
#              optimizer='adam',
#              metrics=['accuracy'])

In [None]:
#tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
#early_stopping=EarlyStopping(monitor='val_loss')  


#history = model.fit(x_train, y_train,
#                    batch_size=batch_size,
#                    epochs=epochs,
#                    verbose=1,
#                    validation_split=0.1,
#                   callbacks=[tensorboard, early_stopping])

<p><a name="task_2.2"></a></p>

### 2.2. Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)


In [27]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3, padding='same'))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [28]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [29]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

2021-09-26 23:26:02.017034: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-26 23:26:02.017121: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-09-26 23:26:02.017216: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-26 23:26:02.104937: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-26 23:26:02.123475: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2595140000 Hz


Epoch 1/20
 2/30 [=>............................] - ETA: 11s - loss: 1.5772 - accuracy: 0.3281

2021-09-26 23:26:03.281304: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-26 23:26:03.281342: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


 3/30 [==>...........................] - ETA: 8s - loss: 1.5697 - accuracy: 0.3974 

2021-09-26 23:26:03.462782: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-09-26 23:26:03.464861: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-26 23:26:03.467002: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_09_26_23_26_03
2021-09-26 23:26:03.468075: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./logs/train/plugins/profile/2021_09_26_23_26_03/localhost.localdomain.trace.json.gz
2021-09-26 23:26:03.485728: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/train/plugins/profile/2021_09_26_23_26_03
2021-09-26 23:26:03.486481: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for memory_profile.json.gz to ./logs/train/plugins/profile/2021_09_26_23_26_03/localhost.localdomain.memory_profile.json.gz
2021-

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [30]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6670687198638916
Test accuracy: 0.764762818813324


In [31]:
results = model.predict(x_test, batch_size=batch_size, verbose=1)



<p><a name="task_3"></a></p>

## 3. Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше