In [1]:
import re
import numpy as np

from nltk import tokenize, TweetTokenizer

from keras import backend as K
from keras.models import load_model

from gensim.models import KeyedVectors

from pathlib import Path

from sklearn.externals import joblib

Using TensorFlow backend.


In [2]:
# Root folder for every artifact this notebook reads.
DATA_DIR = Path('../data/')

# Word vectors (word2vec text format) and the scaler applied to averaged vectors.
EMBEDDINGS_DIR = DATA_DIR / 'embeddings/'
PATH_TO_EMBEDDINGS = EMBEDDINGS_DIR / 'fasttext.min_count_100.vk_posts_all_443550246.300d.vec'
PATH_TO_SCALER = DATA_DIR / 'scaler.pkl'

# Saved Keras model and the pickled object whose `classes_` names its outputs.
PATH_TO_MLB = DATA_DIR / 'mlb.pkl'
PATH_TO_MODEL = DATA_DIR / 'mlp.h5'

In [3]:
class MLModel():
    """Wrap the saved Keras model together with the pickled ``mlb`` object.

    The metric methods ``precision``, ``recall`` and ``f1score`` are passed to
    ``load_model`` as ``custom_objects`` under those same names, so Keras can
    resolve them while restoring the model.
    """

    def __init__(self):
        """Load ``PATH_TO_MLB`` with joblib and ``PATH_TO_MODEL`` with Keras."""
        # NOTE(review): joblib.load unpickles arbitrary objects; only point
        # PATH_TO_MLB at files you trust.
        self.mlb = joblib.load(str(PATH_TO_MLB))
        self.model = load_model(
            str(PATH_TO_MODEL),
            custom_objects={
                'precision': self.precision,
                'recall': self.recall,
                'f1score': self.f1score,
            },
        )

    def precision(self, y_true, y_pred):
        """Return true positives divided by predicted positives.

        Both counts are taken after clipping to [0, 1] and rounding;
        ``K.epsilon()`` in the denominator avoids a division by zero.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    def recall(self, y_true, y_pred):
        """Return true positives divided by actual positives.

        Both counts are taken after clipping to [0, 1] and rounding;
        ``K.epsilon()`` in the denominator avoids a division by zero.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def fbeta(self, y_true, y_pred, beta=1):
        """Return the F-beta score computed from ``precision`` and ``recall``.

        Args:
            y_true: ground-truth values.
            y_pred: predicted values.
            beta: weight of recall relative to precision; must be >= 0.

        Raises:
            ValueError: if ``beta`` is negative.
        """
        if beta < 0:
            raise ValueError('The lowest choosable beta is zero (only precision).')

        # There is deliberately no `if K.sum(...) == 0: return 0` shortcut here:
        # a Python-level `if` is not evaluated per batch on symbolic backend
        # tensors, so that branch could not be relied on. It is also
        # unnecessary: with no positives in y_true both p and r are exactly 0
        # (numerator 0, epsilon-guarded denominators), so the expression below
        # already evaluates to 0.
        p = self.precision(y_true, y_pred)
        r = self.recall(y_true, y_pred)
        bb = beta ** 2
        return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())

    def f1score(self, y_true, y_pred):
        """Return the F1 score, i.e. ``fbeta`` with ``beta=1``."""
        return self.fbeta(y_true, y_pred, beta=1)

In [4]:
class TextPreprocess():
    """Tokenization helpers applied to raw text before embedding lookup."""

    def tokenize_only(self, text):
        """Tokenize ``text`` with ``TweetTokenizer``, leaving case untouched.

        A space is inserted after every ``#`` before the text is tokenized.
        """
        spaced_text = re.sub(r'#', '# ', text)
        return TweetTokenizer().tokenize(spaced_text)

    def tokenize_with_lower(self, text):
        """Tokenize ``text`` with ``WordPunctTokenizer`` and lower-case each token."""
        word_tokenizer = tokenize.WordPunctTokenizer()
        return list(map(str.lower, word_tokenizer.tokenize(text)))

In [5]:
class Embeddings():
    """Word-vector lookup plus the scaler loaded from ``PATH_TO_SCALER``.

    Args:
        text_preprocessor: object exposing ``tokenize_with_lower(text)``, used
            to re-tokenize tokens missing from the vocabulary. Defaults to a
            fresh ``TextPreprocess()`` so the class no longer depends on a
            notebook-level ``textpreprocess`` variable created in a later cell.
    """

    def __init__(self, text_preprocessor=None):
        self.embeddings = KeyedVectors.load_word2vec_format(str(PATH_TO_EMBEDDINGS))
        # NOTE(review): joblib.load unpickles arbitrary objects; only point
        # PATH_TO_SCALER at files you trust.
        self.scaler = joblib.load(str(PATH_TO_SCALER))
        if text_preprocessor is None:
            text_preprocessor = TextPreprocess()
        self.text_preprocessor = text_preprocessor

    def get_vector_embeddings(self, tokens, verbose=True):
        """Return the mean embedding of ``tokens``.

        Tokens missing from the vocabulary are re-tokenized (lower-cased) and
        looked up again piece by piece; pieces that are still missing are
        skipped.

        Args:
            tokens: iterable of string tokens.
            verbose: when true (the default), print the pieces that could not
                be resolved. Pass ``False`` to keep batch runs from flooding
                the cell output.

        Returns:
            The element-wise mean of all vectors found, or a float32 zero
            vector of length ``vector_size`` when none was found.
        """
        embeddings_dim = self.embeddings.vector_size
        empty_tokens = []
        tokens_embeddings = []
        for t in tokens:
            try:
                tokens_embeddings.append(self.embeddings.get_vector(t))
            except KeyError:
                # The helper extends both lists in place; its return value is
                # the same `tokens_embeddings` list, so it is not needed here.
                self.get_embeddings_of_error_token(t, tokens_embeddings, empty_tokens)
        if verbose and empty_tokens:
            print(f'Empty tokens: {empty_tokens}')
        if not tokens_embeddings:
            return np.zeros((embeddings_dim), dtype=np.float32)
        return np.mean(tokens_embeddings, axis=0)

    def get_embeddings_of_error_token(self, token, tokens_embeddings, empty_tokens):
        """Retry an out-of-vocabulary ``token`` after re-tokenizing it.

        Vectors that are found are appended to ``tokens_embeddings``; pieces
        that are still unknown are appended to ``empty_tokens``. Both lists are
        modified in place.

        Returns:
            The same ``tokens_embeddings`` list that was passed in.
        """
        for piece in self.text_preprocessor.tokenize_with_lower(token):
            try:
                tokens_embeddings.append(self.embeddings.get_vector(piece))
            except KeyError:
                empty_tokens.append(piece)
        return tokens_embeddings

In [6]:
%%time

# Build the three long-lived helper objects once; later cells reuse them.
# This cell is slow (see the %%time report), so avoid re-running it.
textpreprocess = TextPreprocess()
embeddings = Embeddings()
mlmodel = MLModel()

CPU times: user 2min 15s, sys: 1.76 s, total: 2min 17s
Wall time: 2min 17s


In [7]:
# Sample input for the single-text demo (Russian for "everything is so good!").
text = 'всё так хорошо!'

In [13]:
# Tokenize -> average the word vectors -> scale.
text_tok = textpreprocess.tokenize_only(text)
text_vec = embeddings.get_vector_embeddings(text_tok)

# The scaler expects a 2-D array, so the vector becomes a single-row batch
# whose width is the embedding size.
text_vec_scale = embeddings.scaler.transform(
    text_vec.reshape(1, int(embeddings.embeddings.vector_size))
)

In [14]:
# `text_vec_scale` holds a single row, so [0] picks the scores for that one text.
pred = mlmodel.model.predict(text_vec_scale)[0]

In [15]:
# Index of the highest score (the first one on ties) and the score itself.
# np.argmax finds it in one pass and does not rely on a float equality search.
max_pred_position = np.argmax(pred)
max_pred = pred[max_pred_position]
mlmodel.mlb.classes_[max_pred_position]

'positive'

# Testing

In [119]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
# Folder with the labelled CSV files used for evaluation.
DATASET_DIR = DATA_DIR / 'dataset/'

In [120]:
# RuSentiment test CSV; the `text` and `label` columns are read below.
df_test = pd.read_csv(DATASET_DIR / 'rusentiment_test.csv')

In [121]:
# Tokenize every message the same way as in the single-text demo.
samples_test = [textpreprocess.tokenize_only(message) for message in df_test.text]

# Labels as an array, in the same row order as `samples_test`.
labels_test = df_test.label.values

In [122]:
# One averaged embedding per tokenized message.
X_test = [embeddings.get_vector_embeddings(sample) for sample in samples_test]

Empty tokens: [':)']
Empty tokens: ['id12210212', 'елисеем']
Empty tokens: ['самоунижение']
Empty tokens: ['мангаром']
Empty tokens: ['оскобление']
Empty tokens: ['ксюху']
Empty tokens: ['жевто', 'блакитные']
Empty tokens: ['отнятых']
Empty tokens: ['каверн', 'каверна']
Empty tokens: ['сейчааас']
Empty tokens: ['hardy']
Empty tokens: ['превередлевый']
Empty tokens: ['куринов']
Empty tokens: ['"', '"']
Empty tokens: ['шерхана', '=)']
Empty tokens: [':-)']
Empty tokens: [';)', ':)']
Empty tokens: ['zufällig', 'supermarkt', 'zufällig']
Empty tokens: ['люблюкогда']
Empty tokens: ['заболiв']
Empty tokens: ['извращёная']
Empty tokens: ['🏻']
Empty tokens: ['субтропиков']
Empty tokens: ['"', 'морозною', '"']
Empty tokens: ['id27975702']
Empty tokens: ['трансхондральный', 'медиального', 'крестообразной', 'подколенной']
Empty tokens: ['"', '"', 'соленоетесто']
Empty tokens: ['протоптанные']
Empty tokens: ['южке']
Empty tokens: [':)']
Empty tokens: ['разрешающая']
Empty tokens: ['анахронизмом', '

In [123]:
# Apply the same scaler used in the single-text demo to the whole test matrix.
X_test_scale = embeddings.scaler.transform(X_test)

In [124]:
# Score every test sample; each row of `preds` is reduced to one class in the next cell.
preds = mlmodel.model.predict(X_test_scale)

In [125]:
# Pick the top-scoring class of every row at once (the first index wins ties).
# Unlike a per-row loop, this does not overwrite the `pred`, `max_pred` and
# `max_pred_position` values left by the single-text demo cells.
top_class_positions = np.argmax(preds, axis=1)
y_preds = [mlmodel.mlb.classes_[position] for position in top_class_positions]

In [126]:
# Encoder used below to turn label names into the values the metrics compare.
# NOTE(review): presumably a fitted sklearn LabelEncoder -- confirm. joblib.load
# unpickles arbitrary objects, so only load files you trust.
label_encoder = joblib.load(str(DATA_DIR.joinpath('label_encoder.pkl')))

In [127]:
# Encode the dataset labels and the predicted labels with the same encoder so
# the metric functions compare values from one label space.
y_test = label_encoder.transform(labels_test)
y_preds_encode = label_encoder.transform(y_preds)

In [128]:
y_true, y_pred = y_test, y_preds_encode

# Class-support-weighted averages. For single-label predictions the weighted
# recall is the same quantity as accuracy, which is why those two printed
# numbers coincide.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label=1, average='weighted')
recall = recall_score(y_true, y_pred, pos_label=1, average='weighted')
f1 = f1_score(y_true, y_pred, pos_label=1, average='weighted')

print(
    'MLP model: \n accuracy - %s, \n f1 - %s, \n precision - %s, \n recall - %s'
    % (accuracy, f1, precision, recall)
)

MLP model: 
 accuracy - 0.745871250421301, 
 f1 - 0.7385383169719454, 
 precision - 0.7402676596273022, 
 recall - 0.745871250421301
