# Import needed libraries

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from sklearn.metrics import roc_auc_score, classification_report
from collections import Counter

from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.layers import Dense, Activation, Embedding, Bidirectional, LSTM, GlobalAveragePooling1D, Conv1D, Dropout
from keras.regularizers import l1_l2
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras
import gensim
import tensorflow as tf

# Data loading

In [3]:
data = pd.read_csv('feedback.csv')
columns = data.columns[:5]
data = data[columns]
print('data shape:', data.shape)
data.head()

data shape: (18341, 5)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,rating,product_id,name,date,feedback
0,1.0,8342,Александр,2017-04-12,"6 входов, предохранитель"
1,5.0,8342,Елена,2015-08-04,Я являюсь пользователем Пилотов уже больше 10 ...
2,5.0,5311,Леонид,2017-07-16,хорошо мелет
3,4.0,5311,Сергей,2017-06-28,Компактная
4,5.0,5311,Ольга,2017-01-21,Цена и качество


# Preprocessing

In [4]:
# target extracting
y = data.pop('rating')
print("target size:", y.shape[0])

# reducing the number of labels to 5 by rounding fractional values
y_round = round(y)
num_labels = len(set(y_round))
print("\nnumber of target labels:", num_labels)

# providing target into one-hot form
y_arr = ((np.arange(num_labels) + 1) == np.array(y_round)[:, None]).astype(
    np.int8)

# deleting non informative columns
data.drop(['product_id', 'name', 'date'], 1, inplace=True)

max_features = 10000
maxlen = 100

encoded_data = [
    one_hot(feedback, max_features) for feedback in list(data['feedback'].str.lower())
]
padded_data = pad_sequences(encoded_data, maxlen=maxlen)

target size: 18341

number of target labels: 5


## Train-val-test split

We shuffle the data because we are not going to capture any time-related dependencies. But, of course, it would be better to sort the data and split it without shuffle

In [6]:
X_train_dev, X_test, y_train_dev, y_test = train_test_split(padded_data,
                                                            y_arr,
                                                            stratify=y_arr,
                                                            test_size=0.1,
                                                            random_state=0)

X_train, X_dev, y_train, y_dev = train_test_split(X_train_dev,
                                                  y_train_dev,
                                                  stratify=y_train_dev,
                                                  test_size=0.25,
                                                  random_state=0)
print('\nX_train:', X_train.shape, ' y_train:', y_train.shape,
      '\nX_dev:', X_dev.shape, ' y_dev:', y_dev.shape)
print('X_test:', X_test.shape, ' ny_test:', y_test.shape)


X_train: (12379, 100)  y_train: (12379, 5) 
X_dev: (4127, 100)  y_dev: (4127, 5)
X_test: (1835, 100)  ny_test: (1835, 5)


# Metrics

In [7]:
def metrics_evaluate(X_test, y_test, model):
    
    y_pred = model.predict(X_test)
    # the weirdest way of calculating accuracy
    correct = 0
    for i in range(len(y_pred)):
        if (np.argmax(y_pred[i]) == np.argmax(y_test[i])):
            correct += 1
    accuracy = correct / len(y_test)
    
    roc_auc = roc_auc_score(y_test, y_pred, average='micro')
    
    df = pd.DataFrame([[accuracy, roc_auc]],
                      columns=['accuracy', 'roc_auc'])

    return df

# Models

## Recurrent neural network for sentiment prediction

In [8]:
input_size = X_train.shape[1]

#initializing random initializer
random_initializer = keras.initializers.RandomUniform(minval=-0.5,
                                                      maxval=0.5,
                                                      seed=42)

# Initialising the NN
model = Sequential()

# layers
model.add(Embedding(input_dim=max_features, 
                    output_dim=128, 
                    input_length=maxlen, 
                    name='embedding_layer'))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))
model.add(
    Dense(num_labels,
          kernel_initializer=random_initializer,
          activation='softmax'))

# summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 100, 128)          1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               98816     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
Total params: 1,379,461
Trainable params: 1,379,461
Non-trainable params: 0
_________________________________________________________________


In [20]:
N_EPOCHS = 10
LEARNING_RATE = 0.001

#initializing optimizer
adam_opt = Adam(lr=LEARNING_RATE,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=None,
                decay=0.0,
                amsgrad=False)

# Compiling the NN
model.compile(optimizer=adam_opt,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Defining early stopping callback
earlystop = EarlyStopping(monitor='val_accuracy',
                          min_delta=0.01,
                          patience=2,
                          restore_best_weights=True,
                          verbose=1,
                          mode='max')

# Defining checkpoint callback
filepath = './RNN.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             mode='max')

# Train the NN
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_dev, y_dev),
                    batch_size=32,
                    epochs=N_EPOCHS,
                    callbacks=[earlystop, checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


In [21]:
RNN_result = metrics_evaluate(X_test, y_test, model)
RNN_result.rename({0: "RNN"}, axis='index', inplace=True)
RNN_result

Unnamed: 0,accuracy,roc_auc
RNN,0.585286,0.823126


In [22]:
model.predict(X_test)

array([[5.50192672e-05, 6.42105646e-04, 1.08773194e-04, 1.80777541e-04,
        9.99013305e-01],
       [2.45709361e-06, 9.81260906e-04, 7.48079270e-04, 2.39994060e-05,
        9.98244166e-01],
       [3.93233222e-07, 2.99892463e-07, 2.06866980e-06, 9.99993563e-01,
        3.64092693e-06],
       ...,
       [6.04394241e-04, 4.47876460e-04, 8.25484216e-01, 4.17288952e-02,
        1.31734625e-01],
       [2.20873975e-03, 2.29806552e-04, 9.12868069e-04, 4.87306342e-03,
        9.91775513e-01],
       [5.40591171e-03, 8.06183994e-01, 1.47168770e-01, 5.43230213e-03,
        3.58090885e-02]], dtype=float32)

just for check that we may reload the best model

In [23]:
loaded_model = load_model('./RNN.hdf5')

RNN_result = metrics_evaluate(X_test, y_test, loaded_model)
RNN_result.rename({0: "RNN"}, axis='index', inplace=True)
RNN_result

Unnamed: 0,accuracy,roc_auc
RNN,0.585286,0.823126


In [24]:
loaded_model.predict(X_test)

array([[5.50192672e-05, 6.42105646e-04, 1.08773194e-04, 1.80777541e-04,
        9.99013305e-01],
       [2.45709361e-06, 9.81260906e-04, 7.48079270e-04, 2.39994060e-05,
        9.98244166e-01],
       [3.93233222e-07, 2.99892463e-07, 2.06866980e-06, 9.99993563e-01,
        3.64092693e-06],
       ...,
       [6.04394241e-04, 4.47876460e-04, 8.25484216e-01, 4.17288952e-02,
        1.31734625e-01],
       [2.20873975e-03, 2.29806552e-04, 9.12868069e-04, 4.87306342e-03,
        9.91775513e-01],
       [5.40591171e-03, 8.06183994e-01, 1.47168770e-01, 5.43230213e-03,
        3.58090885e-02]], dtype=float32)

## Fasttext with n-grams for text classification

https://github.com/ShreyaKhare/imdb_fasttext/blob/master/imdb_fasttext.py

In [25]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
#                     new_list.append(token_indice[ngram])
                    new_list = np.append(new_list, (token_indice[ngram]))
        new_sequences.append(new_list)

    return new_sequences


def build_model(maxlen, max_features, embedding=None):
    model = Sequential()

    #initializing random initializer
    random_initializer = keras.initializers.RandomUniform(minval=-0.5,
                                                          maxval=0.5,
                                                          seed=42)

    if (embedding == None):
        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        model.add(
            Embedding(max_features,
                      embedding_dims,
                      input_length=maxlen,
                      embeddings_initializer=random_initializer))
    else:
        model.add(embedding)

    # we add a GlobalAveragePooling1D, which will average the embeddings
    # of all words in the document
    model.add(GlobalAveragePooling1D())

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(
        Dense(num_labels,
              kernel_initializer=random_initializer,
              activation='softmax'))

    #initializing optimizer
    adam_opt = Adam(lr=LEARNING_RATE,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=None,
                    decay=0.0,
                    amsgrad=False)

    # Compiling the NN
    model.compile(optimizer=adam_opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [26]:
batch_size = 32
embedding_dims = 50
LEARNING_RATE = 0.001
N_EPOCHS = 100

### 1-gram

In [27]:
ngram_range = 1

# Defining early stopping callback
earlystop = EarlyStopping(monitor='val_accuracy',
                          min_delta=0.001,
                          patience=4,
                          verbose=1,
                          restore_best_weights=True,
                          mode='max')

# Defining checkpoint callback
filepath = './fasttext_1_gram.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             mode='max')

In [28]:
print('Build model...')
model = build_model(maxlen, max_features)

# summary
model.summary()

Build model...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           500000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 255       
Total params: 500,255
Trainable params: 500,255
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=N_EPOCHS,
          validation_data=(X_dev, y_dev),
          callbacks=[checkpoint, earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping


<tensorflow.python.keras.callbacks.History at 0x1f6eb114cc0>

In [30]:
gram1_result = metrics_evaluate(X_test, y_test, model)
gram1_result.rename({0: "1 gram"}, axis='index', inplace=True)
gram1_result

Unnamed: 0,accuracy,roc_auc
1 gram,0.640872,0.888153


In [31]:
loaded_model = load_model('./fasttext_1_gram.hdf5')

gram1_result = metrics_evaluate(X_test, y_test, loaded_model)
gram1_result.rename({0: "1 gram"}, axis='index', inplace=True)
gram1_result

Unnamed: 0,accuracy,roc_auc
1 gram,0.640872,0.888153


### 2-grams

In [32]:
ngram_range = 2

if ngram_range > 1:
    print('Adding {}-gram features\n'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in X_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features_2_gram = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    X_train_2_gram = add_ngram(X_train, token_indice, ngram_range)
    X_dev_2_gram = add_ngram(X_dev, token_indice, ngram_range)
    X_test_2_gram = add_ngram(X_test, token_indice, ngram_range)
    
    train_avg_len = np.mean(list(map(len, X_train_2_gram)), dtype=int)
    print('Average train sequence length: {}'.format(train_avg_len))
    dev_avg_len = np.mean(list(map(len, X_dev_2_gram)), dtype=int)
    print('Average dev sequence length: {}'.format(dev_avg_len))
    test_avg_len = np.mean(list(map(len, X_test_2_gram)), dtype=int)
    print('Average test sequence length: {}\n'.format(test_avg_len))

Adding 2-gram features

Average train sequence length: 199
Average dev sequence length: 181
Average test sequence length: 180



In [33]:
# Making sequences the same length

maxlen_2gram = max(train_avg_len, dev_avg_len, test_avg_len)
X_train_2_gram = pad_sequences(X_train_2_gram, maxlen=maxlen_2gram)
X_dev_2_gram = pad_sequences(X_dev_2_gram, maxlen=maxlen_2gram)
X_test_2_gram = pad_sequences(X_test_2_gram, maxlen=maxlen_2gram)
print('X_train shape:', X_train_2_gram.shape)
print('X_dev shape:', X_dev_2_gram.shape)
print('X_test shape:', X_test_2_gram.shape, '\n')

X_train shape: (12379, 199)
X_dev shape: (4127, 199)
X_test shape: (1835, 199) 



In [56]:
model = build_model(maxlen=maxlen_2gram, 
                    max_features=max_features_2_gram)

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 199, 50)           14389200  
_________________________________________________________________
global_average_pooling1d_3 ( (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 255       
Total params: 14,389,455
Trainable params: 14,389,455
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Defining early stopping callback
earlystop = EarlyStopping(monitor='val_loss',
                          min_delta=0.001,
                          patience=4,
                          verbose=1,
                          restore_best_weights=True,
                          mode='min')

# Defining checkpoint callback
filepath = './fasttext_2_gram.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_loss',
                             verbose=0,
                             save_best_only=True,
                             mode='min')

model.fit(X_train_2_gram,
          y_train,
          batch_size=batch_size,
          epochs=N_EPOCHS,
          validation_data=(X_dev_2_gram, y_dev),
          callbacks=[earlystop, checkpoint])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping


<tensorflow.python.keras.callbacks.History at 0x1f68f5e2c88>

In [58]:
gram2_result = metrics_evaluate(X_test_2_gram, y_test, model)
gram2_result.rename({0: "2 gram"}, axis='index', inplace=True)
gram2_result

Unnamed: 0,accuracy,roc_auc
2 gram,0.621253,0.888147


In [61]:
loaded_model = load_model('./fasttext_2_gram.hdf5')

gram2_result = metrics_evaluate(X_test_2_gram, y_test, loaded_model)
gram2_result.rename({0: "2 gram"}, axis='index', inplace=True)
gram2_result

Unnamed: 0,accuracy,roc_auc
2 gram,0.621253,0.888147


3-gram model gave almost the same results as the 2-gram

# Pre-trained word embedding

Loading word2vec model

In [44]:
%%time

# https://rusvectores.org/ru/models/
model_path = 'ruwikiruscorpora_superbigrams_2_1_2.vec'
keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False, limit=10000)

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
weights = keyed_vectors.vectors      

# set `trainable` as `False` to use the pretrained word embedding
# No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights
embedding = Embedding(
    input_dim=weights.shape[0], output_dim=weights.shape[1],
    weights=[weights], trainable=False
)

Wall time: 3.33 s


Adding loaded model as embedding layer

In [45]:
pre_trained_model = Sequential()

#initializing random initializer
random_initializer = keras.initializers.RandomUniform(minval=-0.5,
                                                      maxval=0.5,
                                                      seed=42)

pre_trained_model.add(embedding)

pre_trained_model.add(
    Conv1D(128, 3, kernel_regularizer=l1_l2(1e-7, 1e-7), padding='same'))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
pre_trained_model.add(GlobalAveragePooling1D())

pre_trained_model.add(
    Dense(num_labels,
          kernel_initializer=random_initializer,
          activation='softmax'))

#initializing optimizer
adam_opt = Adam(lr=LEARNING_RATE,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=None,
                decay=0.0,
                amsgrad=False)

# Compiling the NN
pre_trained_model.compile(optimizer=adam_opt,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])

pre_trained_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         3000000   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         115328    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 645       
Total params: 3,115,973
Trainable params: 115,973
Non-trainable params: 3,000,000
_________________________________________________________________


Just for interest let's check pre-trained model accuracy

In [46]:
pre_trained_result = metrics_evaluate(X_test, y_test, pre_trained_model)
pre_trained_result.rename({0: "pre-trained word2vec"}, axis='index', inplace=True)
pre_trained_result

Unnamed: 0,accuracy,roc_auc
pre-trained word2vec,0.59782,0.745075


In [49]:
# Defining early stopping callback
earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=5, 
                          verbose=1, mode='max')

# Defining checkpoint callback
filepath='./pre_trained_w2v.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', 
                             verbose=0, save_best_only=True, mode='max')

In [50]:
pre_trained_model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=N_EPOCHS,
          validation_data=(X_dev, y_dev),
          callbacks=[earlystop, checkpoint])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping


<tensorflow.python.keras.callbacks.History at 0x1f68b40d5c0>

In [51]:
pre_trained_result = metrics_evaluate(X_test, y_test, pre_trained_model)
pre_trained_result.rename({0: "pre-trained word2vec"}, axis='index', inplace=True)
pre_trained_result

Unnamed: 0,accuracy,roc_auc
pre-trained word2vec,0.589101,0.821746


In [52]:
loaded_model = load_model('./pre_trained_w2v.hdf5')

pre_trained_result = metrics_evaluate(X_test, y_test, loaded_model)
pre_trained_result.rename({0: "pre-trained word2vec"}, axis='index', inplace=True)
pre_trained_result

Unnamed: 0,accuracy,roc_auc
pre-trained word2vec,0.597275,0.820582


# Metrics comparison and conclusion

In [62]:
result = pd.concat([RNN_result, gram1_result,gram2_result, pre_trained_result])
result

Unnamed: 0,accuracy,roc_auc
RNN,0.585286,0.823126
1 gram,0.640872,0.888153
2 gram,0.621253,0.888147
pre-trained word2vec,0.597275,0.820582


**Results are more or less similar for all models. Best scores are achieved by n-gram models. 2-gram fasttext model is noticeably slower than others because of the increased vocabulary. Finally, our winner is 1-gram fasttext model trained from scratch** 