## Задача (Sentiment Analysis)

Необходимо произвести классификацию.

In [1]:
import pandas as pd

df = pd.read_excel("/content/drive/Othercomputers/Мое устройство Компьютер/Google.Disk/Colab Notebooks/data/отзывы за лето.xlsx")

In [2]:
df.iloc[:5]

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


### Предобработка

In [3]:
! pip install pymorphy2 stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.0 MB/s 
[?25hCollecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Collecting docopt>=0.6
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 3.1 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: docopt, stop-words
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13723 sha256=e7370e8d5c22397f55d7f3d13d19816faf6da93462a9852875b3ecf05a0e9d0c
  Stored in directory: /root/.cache/pip/wheels/72/b0/3f/1d95f96ff986c7d

In [4]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from multiprocessing import Pool
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re

sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

regex = re.compile("[А-Яа-я0-1:=!\)\()A-z\_\%/|]+")

def words_only(text, regex=regex):
    try:
        return regex.findall(text)
    except:
        return []

def lemmatize(text, pymorphy=morpher):
    try:
        return " ".join([pymorphy.parse(word)[0].normal_form for word in text if word not in sw])
    except:
        return " "  

def clean_text(text):
    return lemmatize(words_only(text))


In [5]:
with Pool(8) as p:
    lemmas = list(tqdm(p.imap(clean_text, df['Content']), total=len(df)))

    
df['lemmas'] = lemmas
df.head()

  0%|          | 0/20659 [00:00<?, ?it/s]

Unnamed: 0,Rating,Content,Date,lemmas
0,5,It just works!,2017-08-14,it just works!
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14,в целое удобноной приложение минус хотеть боль...
2,5,Отлично все,2017-08-14,отлично
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,стать зависать 1% работа антивирус далёкий ран...
4,5,"Очень удобно, работает быстро.",2017-08-14,очень удобно работать быстро


Запишем полученные данные в формате для обучения классификатора:

In [6]:
X = df.lemmas.tolist()
y = df.Rating.tolist()

X, y = np.array(X), np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
print ("total train examples %s" % len(y_train))
print ("total test examples %s" % len(y_test))

total train examples 14461
total test examples 6198


In [7]:
train_corpus = " ".join(X_train)

In [8]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Отфильтруем данные и соберём в корпус N наиболее частых токенов

In [9]:
max_words = 100000
max_len = 300
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [10]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [11]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [12]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'очень',
 'работать',
 'удобный',
 'всё',
 'вс',
 'отлично',
 'я',
 'спасибо']

In [13]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [14]:
import numpy as np

def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [15]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [16]:
X_train.shape, X_test.shape

((14461, 300), (6198, 300))

In [17]:
X_train[100]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

## Keras model

In [18]:
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, Flatten
from keras.callbacks import TensorBoard 
from keras.losses import CategoricalCrossentropy
from keras.callbacks import EarlyStopping  

In [19]:
y_train.shape

(14461,)

In [20]:
num_classes = 6
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [21]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=max_len, input_length=max_len))
model.add(Conv1D(max_len, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [22]:
model.compile(loss='CategoricalCrossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [24]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7931757569313049
Test accuracy: 0.7073249220848083


### Используем предобученную модель эмбеддингов

In [25]:
!pip install wget
!pip install gensim --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=54642f9d6a7607ea02af4df5a0a74033a54ca048749bc637c75161997bb8622d
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.6 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling 

In [26]:
import zipfile
import wget
import sys
import gensim, logging

model_url = 'http://vectors.nlpl.eu/repository/11/180.zip'
m = wget.download(model_url)
model_file = model_url.split('/')[-1]
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [27]:
for n in model.most_similar(positive=[u'тушение_NOUN']):
    print (n[0], n[1])

тушений_NOUN 0.7801563143730164
противопожарный_ADJ 0.635464608669281
лесопожарный_ADJ 0.6165294647216797
пожаротушение_NOUN 0.6065576672554016
возгорание_NOUN 0.6054503917694092
пожар_NOUN 0.5795326232910156
задымление_NOUN 0.568307101726532
пожарный_NOUN 0.5492812395095825
загорание_NOUN 0.5449301600456238
пожароопасный_ADJ 0.5265752077102661


In [28]:
n = model.vectors

In [29]:
n.shape

(189193, 300)

In [30]:
model = Sequential()
model.add(Embedding(input_dim=189193, output_dim=300, input_length=max_len, weights=[n]))
model.add(Conv1D(300, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [31]:
model.compile(loss='CategoricalCrossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [32]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20


In [33]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.7898061275482178
Test accuracy: 0.7504033446311951


### Вывод: Модель с предобеденными векторами показала немного лучший результат чем модель ил коробки. Но я подозреваю, что я что-то сделан не так.