In [None]:
max_words = 1000
max_len = 50
num_classes = 6

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [None]:
import pandas as pd

data = pd.read_excel('text.xls')
X = data['Content']
y = data['Rating']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

NameError: ignored

In [None]:
df_train_dict = {'Content':X_train.values, 'Rating':y_train.values}
df_train = pd.DataFrame(df_train_dict)
df_test_dict = {'Content':X_test.values, 'Rating':y_test.values}
df_test = pd.DataFrame(df_test_dict)
df_val_dict = {'Content':X_val.values, 'Rating':y_val.values}
df_val = pd.DataFrame(df_val_dict)

In [None]:
!pip install stop_words
!pip install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 3.1MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 6.3MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [None]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [None]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train["text"] = df_train['Content'].apply(preprocess_text)
df_val["text"] = df_val['Content'].apply(preprocess_text)
df_test["text"] = df_test['Content'].apply(preprocess_text)

In [None]:
train_corpus = " ".join(df_train["text"])
train_corpus = train_corpus.lower()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [None]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [None]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [None]:
import numpy as np

def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [None]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["text"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["text"]], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val["text"]], dtype=np.int32)

In [None]:
x_train

array([[  0,   0,   0, ...,   0,   0,  40],
       [  0,   0,   0, ...,   0,   6,   2],
       [  0,   0,   0, ...,   0,   0,   6],
       ...,
       [  0,   0,   0, ..., 207, 134,  42],
       [  0,   0,   0, ...,   0,   0,   6],
       [  0,   0,   0, ...,   0,   0,   1]], dtype=int32)

In [None]:
x_train.shape

(12456, 50)

In [None]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping  

In [None]:
num_classes = 6
y_train = keras.utils.to_categorical(df_train['Rating'], num_classes)
y_val = keras.utils.to_categorical(df_val['Rating'], num_classes)

In [None]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=256, input_length=max_len))
model.add(Conv1D(256, 3))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  


history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [None]:
score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.8425679206848145
Test accuracy: 0.7292418479919434


In [None]:
results = model.predict(x_test, batch_size=batch_size, verbose=1)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score 

In [None]:
res = []

for min_ran in range(1, 3):
    for max_ran in range(2, 8):
        for an in ['word', 'char']:
            vect = TfidfVectorizer(ngram_range=(min_ran, max_rain), analyzer=an, lowercase=False)
            train_ft = vect.fit_transform(df_train['text'])
            valid_ft = vect.transform(df_val['text'])
            lgr = LogisticRegression()
            lgr.fit(train_ft, df_train['Rating'].to_numpy())
            y_pred = lgr.predict(valid_ft)
            dt = {'min_r':min_ran, 'max_r':max_ran, 'an': an, 'res':accuracy_score(df_val['Rating'].to_numpy(), y_pred)}
            res.append(dt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
from pprint import pprint

pprint(res)

[{'an': 'word', 'max_r': 2, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 2, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 3, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 3, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 4, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 4, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 5, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 5, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 6, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 6, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 7, 'min_r': 1, 'res': 0.7501805054151625},
 {'an': 'char', 'max_r': 7, 'min_r': 1, 'res': 0.7407942238267148},
 {'an': 'word', 'max_r': 2, 'min_r': 2, 'res': 0.6981949458483755},
 {'an': 'char', 'max_r': 2, 'min_r': 2, 'res': 0.7415162454873646},
 {'an': 'word', 'max_r': 3, 'min_r': 2, 'res': 0

In [None]:
accuracy_score(df_val['Rating'].to_numpy(), y_pred)

0.7415162454873646

In [None]:
from gensim.models import Word2Vec

In [None]:
modelW2V = Word2Vec(sentences=df_train['text'].apply(str.split), size=100, window=5, min_count=5, workers=8)

In [None]:
vect_idf = TfidfVectorizer()
vect_idf.fit_transform(df_train['text'])
tfidf = dict(zip(vect_idf.get_feature_names(), vect_idf.idf_))

In [None]:
from collections import defaultdict

In [None]:
max_idf = max(vect_idf.idf_)

word2weight = defaultdict(
    lambda: max_idf,
    [(w, vect_idf.idf_[i]) for w, i in vect_idf.vocabulary_.items()])

In [None]:
def get_vect_mean(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in modelW2V:
            vector_w2v += modelW2V[wrd]*1
            n_w2v += 1
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

def get_vect_idf(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in modelW2V:
            iddf_ = tfidf.get(wrd, 1.)
            vector_w2v += modelW2V[wrd]*iddf_
            n_w2v += iddf_
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

In [None]:
from tqdm import tqdm_notebook

In [None]:
arr_vect = []
for txt in tqdm_notebook(df_train['text']):
    arr_vect.append(get_vect_mean(txt))
    
arr_vect_valid = []
for txt in tqdm_notebook(df_val['text']):
    arr_vect_valid.append(get_vect_mean(txt))
    
train_w2v = np.asarray(arr_vect)    
valid_w2v = np.asarray(arr_vect_valid)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=12456.0), HTML(value='')))

  """
  





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1385.0), HTML(value='')))




In [None]:
lgr_w2v = LogisticRegression()

In [None]:
lgr_w2v.fit(train_w2v, df_train['Rating'].to_numpy())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = lgr_w2v.predict(valid_w2v)

In [None]:
accuracy_score(df_val['Rating'].to_numpy(), y_pred)

0.6851985559566787

In [None]:
arr_vect = []
for txt in tqdm_notebook(df_train['text']):
    arr_vect.append(get_vect_idf(txt))
    
arr_vect_valid = []
for txt in tqdm_notebook(df_val['text']):
    arr_vect_valid.append(get_vect_idf(txt))
    
train_w2v = np.asarray(arr_vect)    
valid_w2v = np.asarray(arr_vect_valid)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=12456.0), HTML(value='')))

  app.launch_new_instance()





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1385.0), HTML(value='')))




In [None]:
lgr_w2v = LogisticRegression()
lgr_w2v.fit(train_w2v, df_train['Rating'].to_numpy())
y_pred = lgr_w2v.predict(valid_w2v)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
accuracy_score(df_val['Rating'].to_numpy(), y_pred)

0.6787003610108303