In [1]:
import warnings

# Ignore every warning category for the rest of the session.
# NOTE(review): presumably meant to keep cell outputs clean; a blanket filter also
# hides warnings that could matter — consider filtering a specific category instead.
warnings.filterwarnings('ignore')

# Подготовка данных

### Функция для чтения датасета

In [2]:
def read_data(fpath):
    """Read a dataset file holding one flat ``{"key": value, ...}`` record per line.

    Each line is parsed by hand: the surrounding braces are dropped, all double
    quotes are removed, fields are separated on ``', '`` and each field is split
    into key and value on the first ``': '``. Values that parse as integers are
    stored as ``int``; everything else is kept as a string.

    Args:
        fpath: Path to the text file to read.

    Returns:
        A list of dicts, one per non-blank line, in file order.

    Raises:
        ValueError: If a field does not contain a ``': '`` separator (for
            example when a text value itself contains ``', '``).
    """
    data = []
    with open(fpath, 'r') as f:
        for line in f:
            # Strip the line terminator explicitly instead of assuming it is
            # present: a final line without '\n' used to lose its last character.
            record = line.strip()
            if not record:
                # Blank lines carry no record; skip them instead of crashing.
                continue
            ex = {}
            for subline in record[1:-1].replace('"', '').split(', '):
                # maxsplit=1 keeps values that themselves contain ': ' intact.
                key, value = subline.split(': ', 1)
                try:
                    ex[key] = int(value)
                except ValueError:
                    # Not an integer literal: keep the raw string.
                    ex[key] = value
            data.append(ex)
    return data

В коллекции уже представлен обработанный датасет. Произведена лемматизация, удалены стоп-слова и знаки препинания. Поэтому попробуем использовать уже подготовленные данные.

In [3]:
# Load the pre-processed training split; read_data returns a list of dicts, one per review.
train_data = read_data('imdb/lemmatized_wo_stopwords/train')

#### Сразу посмотрим на то, как связаны label и score

In [4]:
import numpy as np

In [5]:
# Contingency table of the two targets: row index = score - 1, column index = label - 1.
ones_twos = np.zeros((10, 2))
for sample in train_data:
    row, col = sample['score'] - 1, sample['label'] - 1
    ones_twos[row, col] += 1
ones_twos

array([[   0., 5100.],
       [   0., 2284.],
       [   0., 2420.],
       [   0., 2696.],
       [   0.,    0.],
       [   0.,    0.],
       [2496.,    0.],
       [3009.,    0.],
       [2263.,    0.],
       [4732.,    0.]])

Видим, что оценки (score) 5 и 6 отсутствуют; высоким оценкам (7–10) соответствует label 1, а низким (1–4) — label 2.
Поэтому задача определения score автоматически решает задачу определения label.

### Посмотрим на данные: количество текстов, среднюю длину, разброс и квантили длины текстов.

In [6]:
# Review lengths measured in whitespace-separated tokens; the list is sorted so that
# order statistics can be read off directly by position.
count_data = len(train_data)
data_length = sorted(len(sample['text'].split()) for sample in train_data)

# Positions of the 10%, 25%, 50%, 75% and 90% order statistics in the sorted list.
quantile_positions = [count_data // 10, count_data // 4, count_data // 2,
                      count_data * 3 // 4, count_data * 9 // 10]
quantiles = [data_length[pos] for pos in quantile_positions]

report_template = ('Count of samples: {}\nMean length: {}\nStd: {}\nMinimum: {}\n10%: {}\n25%: {}\n50%: {}\n'
                   '75%: {}\n90%: {}\nMaximum: {}')
report_values = ([count_data, np.mean(data_length), np.std(data_length), data_length[0]]
                 + quantiles
                 + [data_length[-1]])
print(report_template.format(*report_values))

Count of samples: 25000
Mean length: 117.6394
Std: 88.88958098472509
Minimum: 3
10%: 45
25%: 62
50%: 87
75%: 143
90%: 233
Maximum: 1416


Видно, что разброс в данных большой. Вспоминая специфику данных -- отзывы к фильмам -- понимаем, что большинство пользователей пишут коротко, и лишь некоторые растекаются мысью по древу.
Во всяком случае, это не твиттер с 30-40 словами, поэтому можно ожидать приемлемых результатов.

### Функция для перевода данных в формат BigARTM

In [7]:
def prepare_train_data_for_bigartm(data, bigartm_data_path):
    """Dump labelled samples to a text file, one document per line.

    Every line has the form ``review_<index> <text> |@score <score>``, where
    ``<index>`` is the position of the sample in ``data``.

    Args:
        data: Iterable of dicts providing the ``'text'`` and ``'score'`` keys.
        bigartm_data_path: Destination path; the file is opened in write mode,
            so existing content is replaced.
    """
    line_template = 'review_{} {} |@score {}\n'
    with open(bigartm_data_path, 'w') as out:
        out.writelines(
            line_template.format(idx, item['text'], item['score'])
            for idx, item in enumerate(data)
        )

In [8]:
def prepare_test_data_for_bigartm(data, bigartm_data_path):
    """Dump unlabelled samples to a text file, one document per line.

    Every line has the form ``review_<index> <text>``; unlike the training
    writer, no ``@score`` part is emitted.

    Args:
        data: Iterable of dicts providing the ``'text'`` key.
        bigartm_data_path: Destination path; the file is opened in write mode,
            so existing content is replaced.
    """
    line_template = 'review_{} {}\n'
    with open(bigartm_data_path, 'w') as out:
        out.writelines(
            line_template.format(idx, item['text'])
            for idx, item in enumerate(data)
        )

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the labelled reviews for validation. The fixed random_state makes
# the split, and therefore every metric computed from it, reproducible across
# kernel restarts.
train_set, val_set = train_test_split(train_data, test_size=0.2, random_state=42)
prepare_train_data_for_bigartm(train_set, 'train.txt')
# The validation file is written without the @score part of each line.
prepare_test_data_for_bigartm(val_set, 'val.txt')
# Ground-truth scores, listed in the same order as the documents written above.
train_scores = [x['score'] for x in train_set]
val_scores = [x['score'] for x in val_set]

## Построим модель BigARTM

### Сначала определим vectorizer и сохраним словарь

In [11]:
import artm
import pyLDAvis

# NOTE(review): this one-topic model is created and immediately discarded —
# presumably a smoke test that the BigARTM library loads; confirm before removing.
artm.ARTM(num_topics=1)
# Print the BigARTM version in use.
print(artm.version())

0.9.2


In [12]:
# Convert train.txt (Vowpal Wabbit format) into BigARTM batches stored under train_batches/.
bv_train = artm.BatchVectorizer(data_path='train.txt', data_format='vowpal_wabbit', 
                                batch_size=10000, target_folder='train_batches')

  and should_run_async(code)


In [13]:
# Convert val.txt (Vowpal Wabbit format) into BigARTM batches stored under val_batches/.
bv_val = artm.BatchVectorizer(data_path='val.txt', data_format='vowpal_wabbit', 
                              batch_size=10000, target_folder='val_batches')

  and should_run_async(code)


In [14]:
# Collect the dictionary from the training batches only (val_batches is not read here).
dictionary = artm.Dictionary()
dictionary.gather(data_path='train_batches')

  and should_run_async(code)


### Инициализируем модель и добавим метрики качества

In [15]:
# Baseline model without regularizers: 100 topics over two modalities, with the
# '@score' modality given weight 5.0 against 1.0 for the default text modality.
modality_weights = {'@default_class': 1.0, '@score': 5.0}
model = artm.ARTM(num_topics=100, dictionary=dictionary, class_ids=modality_weights)

# Quality scores tracked after every pass over the collection.
for quality_score in (
    artm.TopTokensScore(name='top-tokens', num_tokens=15),
    artm.SparsityPhiScore(name='sparsity', class_id='@score'),
    artm.PerplexityScore(name='perplexity', dictionary=dictionary),
):
    model.scores.add(quality_score)

  and should_run_async(code)


### В качестве первой попытки обучим модель без регуляризаторов

In [16]:
# Fit one collection pass at a time so both scores can be reported after each pass.
progress_template = 'Iteration {}: sparsity = {}, perplexity = {}'
for i in range(50):
    model.fit_offline(bv_train, num_collection_passes=1)
    tracker = model.score_tracker
    print(progress_template.format(i,
                                   tracker['sparsity'].value[-1],
                                   tracker['perplexity'].value[-1]))

  and should_run_async(code)


Iteration 0: sparsity = 0.0, perplexity = 469034.9375
Iteration 1: sparsity = 0.0, perplexity = 16994.94140625
Iteration 2: sparsity = 0.0, perplexity = 10372.28125
Iteration 3: sparsity = 0.0, perplexity = 6829.06591796875
Iteration 4: sparsity = 0.0, perplexity = 5521.0244140625
Iteration 5: sparsity = 0.0, perplexity = 5013.30810546875
Iteration 6: sparsity = 0.0, perplexity = 4791.04833984375
Iteration 7: sparsity = 0.0, perplexity = 4684.14111328125
Iteration 8: sparsity = 0.0, perplexity = 4634.48046875
Iteration 9: sparsity = 0.0, perplexity = 4618.26806640625
Iteration 10: sparsity = 0.0012499999720603228, perplexity = 4622.0087890625
Iteration 11: sparsity = 0.0062500000931322575, perplexity = 4637.11865234375
Iteration 12: sparsity = 0.013749999925494194, perplexity = 4657.953125
Iteration 13: sparsity = 0.028750000521540642, perplexity = 4680.97509765625
Iteration 14: sparsity = 0.04625000059604645, perplexity = 4704.58056640625
Iteration 15: sparsity = 0.057500001043081284,

Видим, что с 10-й итерации перплексия начинает расти — эффект переобучения. Это плохо, надо использовать регуляризаторы. Чтобы не копировать постоянно код, напишем функцию, создающую и обучающую модель.

# Функция для создания и обучения модели с регуляризаторами

In [16]:
def create_and_fit_model_with_regularizers(bv_train, num_topics, epochs,
                                           tau_def, tau_score, score_idx, verbose=1,
                                           patience=10):
    """Build an ARTM model with two Phi decorrelators and fit it with early stopping.

    The model is created from the module-level ``dictionary`` and uses two
    modalities: ``'@default_class'`` with weight 1.0 and ``'@score'`` with
    weight ``score_idx``. A separate ``DecorrelatorPhiRegularizer`` is attached
    to each modality. Training runs one collection pass at a time and stops
    early once the tracked scores have failed to improve ``patience`` times.

    Args:
        bv_train: Batch vectorizer with the training collection.
        num_topics: Number of topics of the model.
        epochs: Maximum number of passes over the collection.
        tau_def: ``tau`` of the decorrelator on ``'@default_class'``.
        tau_score: ``tau`` of the decorrelator on ``'@score'``.
        score_idx: Weight of the ``'@score'`` modality (despite the name, it is
            passed as a ``class_ids`` weight, not used as an index).
        verbose: ``2`` prints the scores after every pass, ``1`` prints them
            once after training, any other value prints nothing.
        patience: Number of non-improving passes tolerated before stopping.
            The counter is cumulative: it is not reset when a later pass
            improves. Defaults to 10, the previously hard-coded value.

    Returns:
        The fitted ``artm.ARTM`` model.
    """
    model = artm.ARTM(num_topics=num_topics, dictionary=dictionary,
                      class_ids={'@default_class': 1.0, '@score': score_idx})
    model.scores.add(artm.TopTokensScore(name='top-tokens', num_tokens=15))
    model.scores.add(artm.SparsityPhiScore(name='sparsity', class_id='@score'))
    model.scores.add(artm.PerplexityScore(name='perplexity', dictionary=dictionary))

    model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_default',
                                                           tau=tau_def,
                                                           class_ids=['@default_class']))
    model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_score',
                                                           tau=tau_score,
                                                           class_ids=['@score']))
    best_sparsity = -1
    best_perplexity = -1
    # Keep both names defined even if no pass runs (epochs <= 0); the summary
    # print below used to raise NameError in that case.
    sparsity = None
    perplexity = None
    strikes_left = patience
    for i in range(epochs):
        model.fit_offline(bv_train, num_collection_passes=1)
        sparsity = model.score_tracker['sparsity'].value[-1]
        perplexity = model.score_tracker['perplexity'].value[-1]
        if verbose == 2:
            print('Iteration {}: sparsity = {}, perplexity = {}'.format(i, sparsity, perplexity))
        # A pass counts as "worse" when sparsity dropped, or when sparsity is
        # unchanged and perplexity went up (parentheses make the precedence explicit).
        got_worse = sparsity < best_sparsity or (
            sparsity == best_sparsity and perplexity > best_perplexity)
        if got_worse:
            strikes_left -= 1
            if strikes_left <= 0:
                break
        else:
            best_sparsity = sparsity
            best_perplexity = perplexity
    if verbose == 1 and sparsity is not None:
        print('Iteration last: sparsity = {}, perplexity = {}'.format(sparsity, perplexity))
    return model

  and should_run_async(code)


## Функция, которая для заданных параметров обучает модель и оценивает её на валидационных данных

In [17]:
from sklearn.metrics import f1_score, accuracy_score

  and should_run_async(code)


In [19]:
def _predict_scores(model, batch_vectorizer):
    """Return, for each document in the batches, the most probable '@score' class as int."""
    p_cd = model.transform(batch_vectorizer=batch_vectorizer, predict_class_id='@score').T
    return [int(x) for x in p_cd.idxmax(axis=1).values]


def evaluate_hyperparameters(bv_train, bv_val, scores_train, scores_val,
                             num_topics, epochs, tau_def, tau_score, score_idx, verbose=1, save_model=False):
    """Train a regularized model and score its '@score' predictions on val and train data.

    Args:
        bv_train: Batch vectorizer with the training collection.
        bv_val: Batch vectorizer with the validation collection.
        scores_train: True scores of the training documents.
        scores_val: True scores of the validation documents.
        num_topics: Number of topics of the model.
        epochs: Maximum number of passes over the training collection.
        tau_def: ``tau`` of the decorrelator on ``'@default_class'``.
        tau_score: ``tau`` of the decorrelator on ``'@score'``.
        score_idx: Weight of the ``'@score'`` modality.
        verbose: Forwarded to ``create_and_fit_model_with_regularizers``.
        save_model: When true, the fitted model is returned under ``'model'``.

    Returns:
        A dict with ``'f1_val'``, ``'acc_val'``, ``'f1_train'`` and
        ``'acc_train'`` (macro F1 and accuracy), plus ``'model'`` as the first
        key when ``save_model`` is true.
    """
    model = create_and_fit_model_with_regularizers(bv_train, num_topics, epochs,
                                                   tau_def, tau_score, score_idx, verbose)
    # NOTE(review): the metrics below assume that transform() yields documents in
    # the same order as scores_val / scores_train. Nothing here checks that —
    # verify it, especially when a collection spans more than one batch.
    y_val_pred = _predict_scores(model, bv_val)
    f1_val = f1_score(scores_val, y_val_pred, average='macro')
    acc_val = accuracy_score(scores_val, y_val_pred)

    y_train_pred = _predict_scores(model, bv_train)
    f1_train = f1_score(scores_train, y_train_pred, average='macro')
    acc_train = accuracy_score(scores_train, y_train_pred)

    # Build the result once; the model is only kept when the caller asked for it.
    result = {'model': model} if save_model else {}
    result.update({
        'f1_val': f1_val,
        'acc_val': acc_val,
        'f1_train': f1_train,
        'acc_train': acc_train
    })
    return result

  and should_run_async(code)


In [24]:
# Single trial run; hyperparameters are spelled out by name for readability.
evaluate_hyperparameters(
    bv_train, bv_val, train_scores, val_scores,
    num_topics=100, epochs=30, tau_def=1e+6, tau_score=100, score_idx=5,
)

  and should_run_async(code)


Iteration last: sparsity = 0.9762499928474426, perplexity = 6163.5693359375


{'model': artm.ARTM(num_topics=100, num_tokens=71992, class_ids=['@default_class', '@score']),
 'f1_val': 0.1559919903532782,
 'acc_val': 0.1706,
 'f1_train': 0.42828186109613126,
 'acc_train': 0.44625}

In [20]:
# Drop the raw sample lists; the grid search below works from the batch
# vectorizers and the score lists only.
del train_data, train_set, val_set

  and should_run_async(code)


In [None]:
# Exhaustive grid search over topic count, the two decorrelator taus and the
# '@score' modality weight; results are keyed by the space-joined parameter values.
results = {}
search_space = [
    (n_topics, t_def, t_score, weight)
    for n_topics in [50, 100, 200]
    for t_def in [1e+5, 1e+6, 1e+7]
    for t_score in [1, 5, 10, 20]
    for weight in [1, 2, 5, 10]
]
for num_topics, tau_def, tau_score, score_idx in search_space:
    key = '{} {} {} {}'.format(num_topics, tau_def, tau_score, score_idx)
    print(key)
    results[key] = evaluate_hyperparameters(
        bv_train, bv_val, train_scores, val_scores,
        num_topics=num_topics, epochs=50,
        tau_def=tau_def, tau_score=tau_score, score_idx=score_idx,
    )
    print(results[key])

  and should_run_async(code)


50 100000.0 1 1
Iteration last: sparsity = 0.05249999836087227, perplexity = 10421.568359375
{'f1_val': 0.21434314258570103, 'acc_val': 0.3568, 'f1_train': 0.10434035542143785, 'acc_train': 0.17085}
50 100000.0 1 2
Iteration last: sparsity = 0.10750000178813934, perplexity = 9152.359375
{'f1_val': 0.22220412176353266, 'acc_val': 0.3332, 'f1_train': 0.11867192260945503, 'acc_train': 0.16255}
50 100000.0 1 5
Iteration last: sparsity = 0.5575000047683716, perplexity = 6220.5546875
{'f1_val': 0.24988371153878702, 'acc_val': 0.343, 'f1_train': 0.11994723293992603, 'acc_train': 0.14755}
50 100000.0 1 10
Iteration last: sparsity = 0.7799999713897705, perplexity = 4130.38232421875
{'f1_val': 0.2300062325589405, 'acc_val': 0.3526, 'f1_train': 0.12146620870690021, 'acc_train': 0.14335}
50 100000.0 5 1
Iteration last: sparsity = 0.06750000268220901, perplexity = 10419.9501953125
{'f1_val': 0.2160978169815948, 'acc_val': 0.3578, 'f1_train': 0.10465286911732574, 'acc_train': 0.17035}
50 100000.0 5 