In [28]:
#рассмотрен датасет из 30 корпусов(из рубрик Политика и Происшествия), в каждом минимум 8 документов
#подробнее об анализе корпусов можно узнать в ноутбуке toloka_tsv_analysis.ipynb
#подготовка данных в формате vowpal wabbit в ноутбуке Data_prepare.ipynb
#основной датасет и вспомогательные файлы, полученные Е.Милюта, взяты по этой ссылке: https://drive.google.com/drive/u/0/folders/1nu08Cb7TlFUmWlDFvfiY0r96NTilqXZS

## Imports

In [29]:
!pip install pymorphy2
!pip install stop-words
!pip install dill
!pip install conllu
!pip install reports
!pip install bigartm10

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [31]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import importlib, itertools
import pickle, dill
import collections
from collections import Counter
from collections import defaultdict
from itertools import combinations

import codecs, re, os, sys, json, string, csv, datetime
from typing import Dict, List

import pymorphy2
import nltk
from stop_words import get_stop_words
from nltk.corpus import stopwords
from conllu import parse
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, cosine_similarity

import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

sys.path.append('/home/')

import reports
import artm
from experiment import topic_model, so_balance
from metrics import get_report
from nltk.corpus import stopwords 
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` below can read it.
nltk.download('stopwords') 
# Union of the Russian stop-word lists provided by NLTK and by the `stop_words` package.
stop_words = set(stopwords.words("russian")).union(set(get_stop_words('russian')))

# Alternative data root on a mounted Google Drive (currently disabled).
#p = '/content/gdrive/MyDrive/data/module/module/data/'
# Path prefix that later cells prepend to the `vw/` and `batches/` directories.
p = '/home/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
import zipfile

# Unpack the archive with the pre-built Vowpal Wabbit files into /home/vw/.
# A missing archive is tolerated (the original best-effort behaviour); any other
# I/O or archive problem is reported with its real cause instead of being
# mislabelled as "already unzipped".
archive = '/home/folder_by_topics_wv.zip'
try:
  with zipfile.ZipFile(archive, 'r') as zip_file:
    zip_file.extractall('/home/vw/')
except FileNotFoundError:
  print('File already unzipped')
except (OSError, zipfile.BadZipFile) as err:
  print(f'Could not unpack {archive}: {err}')

## Preparing

In [33]:
# Shared Russian morphological analyzer used by `lemmatize`.
morph = pymorphy2.MorphAnalyzer(lang='ru')
def lemmatize(word):
    """Return the normal form of the first parse that `morph` produces for `word`."""
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form
def text_modifier(text:str)->str:
    """Replace non-breaking spaces with plain spaces, then drop every character
    that is not a Cyrillic/Latin letter, a digit or a space."""
    with_plain_spaces = text.replace('\xa0', ' ')
    return re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9 ]', '', with_plain_spaces)

def tf_idf_text_proc(text):
    """Clean `text`, tokenize it as Russian, lemmatize each token and return the
    lemmas joined by single spaces (the input form used for TF-IDF)."""
    cleaned = text_modifier(text)  # strip punctuation and other unwanted characters
    tokens = nltk.word_tokenize(cleaned, language='russian')
    return ' '.join(lemmatize(token) for token in tokens)

def flatten(t):
    """Concatenate the items of every iterable in `t` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [34]:
def experiment(class_ids, dictionary, batch_vectorizer, y_true, n_topics, num_back, tau, tf, ds_type:'[new,old]' = 'old'):
    """Fit one topic model and score its hard topic assignments against `y_true`.

    The model is built with `topic_model(class_ids, dictionary, n_topics, num_back,
    tau, tf)` and fitted for 40 collection passes. Each document is assigned the
    topic with the largest theta value and the result is passed to `get_report`.

    Parameters:
        ds_type: when equal to 'old', the last `num_back` rows of theta are dropped
            before taking the argmax; for any other value all rows are kept.

    Returns:
        Whatever `get_report(y_true, y_pred)` returns.

    NOTE(review): this call passes six positional arguments to `topic_model`; the
    `topic_model` defined later in this notebook accepts five, so this function
    only works with the version imported from `experiment`.
    """
    model = topic_model(class_ids, dictionary, n_topics, num_back, tau, tf)
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=40)

    theta = model.get_theta().to_numpy()
    # The original slice `[:-int(ds_type == 'old') * num_back]` collapses to `[:0]`
    # (an empty array) whenever nothing should be dropped, so slice only on demand.
    n_drop = num_back if ds_type == 'old' else 0
    if n_drop:
        theta = theta[:-n_drop]
    X = theta.T

    y_pred = X.argmax(axis=1)
    return get_report(y_true, y_pred)

In [35]:
def get_class_ids(
    dictionary, 
    batch_vectorizer, 
    y_true, n_topics, 
    num_back, tau, tf,
    balancer = {},
    ds_type = 'old',
    spo=True, roles=True, tonal=True):
    """Grid-search modality weights and keep the combination with the best F1.

    Every candidate weight dict is evaluated with `experiment(...)`; the returned
    metrics dict gets the candidate stored under "class_ids" and is collected.

    Parameters:
        balancer: mapping with the keys 'subjects', 'objects', 'neg_pol' and
            'pos_pol'; its values scale the corresponding weights. The default
            empty dict raises KeyError as soon as a candidate is built.
        spo: enable the subject-predicate-object modalities.
        roles: enable the Fillmore roles modality ('pairs').
        tonal: enable the negative + positive polarity modalities.

    Returns:
        `(best_f1, best_class_ids, scores)` when two or three flags are enabled;
        `best_class_ids` is None if no candidate reached an F1 above 0.
        None when fewer than two flags are enabled.
    """
    scores = []
    best = 0
    # Initialised up front: previously this name stayed unbound (UnboundLocalError
    # on return) when no candidate scored an F1 greater than 0.
    best_p = None

    def _evaluate(class_ids):
        """Score one candidate, record it, and update the running best."""
        nonlocal best, best_p
        metrics = experiment(class_ids, dictionary, batch_vectorizer, y_true, n_topics, num_back, tau, tf, ds_type)
        metrics.update({"class_ids":class_ids})
        scores.append(metrics)
        f1 = metrics['F1']
        if f1>best:
            best = f1
            best_p = class_ids

    n_enabled = spo+roles+tonal
    if n_enabled == 3:
        # x is the weight of 'pairs', y scales subjects/objects, the rest goes to polarity.
        for x in tqdm(np.arange(0.05,1.04,0.05)):
            for y in (np.arange(0,1-x,0.05)):
                _evaluate({
                    'subjects': y*balancer['subjects'],
                    'objects': y*balancer['objects'],
                    'pairs': x,
                    'neg_pol': (1-x-y)*balancer['neg_pol'],
                    'pos_pol': (1-x-y)*balancer['pos_pol']
                })

    elif n_enabled == 2:
        # The boolean flags act as 0/1 multipliers that switch the disabled modality off.
        for x in tqdm(np.arange(0,1.04,0.05)):
            _evaluate({
                'subjects': x*spo*balancer['subjects'],
                'objects': x*spo*balancer['objects'],
                'pairs': ((1-x)*spo + x*tonal)*roles,
                'neg_pol': ((1-x)*roles + (1-x)*spo)*balancer['neg_pol']*tonal,
                'pos_pol': ((1-x)*roles + (1-x)*spo)*balancer['pos_pol']*tonal
            })

    else:
        # Nothing to balance with fewer than two modalities.
        return None

    return best, best_p, scores

# Toloka

## Data

In [36]:
!pip install grpcio
!pip install git+https://github.com/IINemo/isanlp.git
!git clone https://github.com/angular/angular-phonecat.git
!git config --global url."https://".insteadOf git://
!pip install git+https://github.com/IINemo/isanlp_srl_framebank.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/IINemo/isanlp.git
  Cloning https://github.com/IINemo/isanlp.git to /tmp/pip-req-build-0yysd7hp
  Running command git clone -q https://github.com/IINemo/isanlp.git /tmp/pip-req-build-0yysd7hp
fatal: destination path 'angular-phonecat' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/IINemo/isanlp_srl_framebank.git
  Cloning https://github.com/IINemo/isanlp_srl_framebank.git to /tmp/pip-req-build-p92qsq0j
  Running command git clone -q https://github.com/IINemo/isanlp_srl_framebank.git /tmp/pip-req-build-p92qsq0j
  Running command git submodule update --init --recursive -q


In [37]:
# Load the pre-computed dataset. Later cells iterate over `input_ds` and read
# keys such as 'doc_id' and 'text_group_id' from each element.
# NOTE: dill/pickle files can execute arbitrary code on load; only open trusted files.
with open('/home/1_12_dataset_toloka.pickle', 'rb') as ds:
  input_ds = dill.load(ds)
#input_ds[0].keys()

In [38]:
# Load the raw Toloka assignments and keep only the columns used downstream.
toloka = pd.read_csv('/home/toloka.tsv', delimiter='\t').dropna(how='all').reset_index()
toloka = toloka[['INPUT:id', 'INPUT:text', 'OUTPUT:pole_name', 'ASSIGNMENT:worker_id', 'ASSIGNMENT:task_suite_id']]
toloka.columns = ['doc_id', 'text', 'lable', 'worker_id', 'text_group_id']
# .copy() makes the filtered frame independent, so the column assignments below
# write to a real frame rather than to a slice of the previous one.
toloka = toloka[toloka['lable'].notna()].copy()
toloka.loc[:, 'not_pol_flag'] = toloka['lable'] == 'Не поляризовано'
toloka.loc[:, 'not_in_class'] = toloka['lable'] == 'Не относится к теме'
# Dense rank of the label inside each (worker, text group): the worker's class id.
toloka.loc[:, 'mark'] = toloka.groupby(
    ['worker_id', 'text_group_id']
)['lable'].rank(method='dense').astype(int).astype(str)

# Dense rank of the document id inside each (worker, text group).
toloka.loc[:, 'n_in_group'] = toloka.groupby(
    ['worker_id', 'text_group_id']
)['doc_id'].rank(method='dense').astype(int).astype(str)

# `n_in_group` is stored as a string, so sorting on it directly is lexicographic
# ('10' < '2'). Sort on a temporary integer copy instead, so that rows inside a
# group follow ascending doc_id order for groups with ten or more documents too.
toloka = (
    toloka
    .assign(_n_in_group_int=toloka['n_in_group'].astype(int))
    .sort_values(by=['worker_id', 'text_group_id', '_n_in_group_int'])
    .drop(columns='_n_in_group_int')
    .reset_index(drop=True)
    .reset_index()
)

nltk.download('punkt')
toloka['tf_idf_proc_text'] = toloka['text'].apply(tf_idf_text_proc)
toloka['doc_id'].nunique(), toloka.shape

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(452, (1312, 11))

In [39]:
# Read the per-document embeddings file and index its records by document id.
with open('/home/toloka_embs.json', 'rb') as f:
  sentiment = json.load(f)

sentiment_mapper = {doc['doc_id']: doc['embeddings'] for doc in sentiment}

In [40]:
# Write in vw format

In [41]:
def create_line_from_counter(str_start, cnt_dict):
    """Render a key -> count mapping as one ` |<str_start> tok:count ...` section.

    Keys are cleaned word by word with `text_modifier` and their words are joined
    with underscores. Whether keys are treated as tuples of phrases or as single
    phrases is decided by the type of the first key. Entries whose cleaned token
    is '' or ' ' are left out. An empty mapping yields just the section prefix.
    """
    entries = list(cnt_dict.items())
    prefix = f' |{str_start} '
    if not entries:
        return prefix

    tuple_keys = isinstance(entries[0][0], tuple)

    def _to_token(key):
        # A tuple key contributes the words of each of its elements; a plain key
        # contributes its own words.
        phrases = key if tuple_keys else (key,)
        return '_'.join(text_modifier(word) for phrase in phrases for word in phrase.split())

    rendered = []
    for key, count in entries:
        token = _to_token(key)
        if token not in ('', ' '):
            rendered.append(f'{token}:{count}')
    return prefix + ' '.join(rendered)

def write_group(sub_ds, f_path):
    """Write the documents of one group to `f_path`, one line per document.

    Each line starts with the document's position in `sub_ds` (not its 'doc_id'),
    followed by the sections 'neg_pol', 'pos_pol', 'fillmore', 'subjects' and
    'objects' as produced by `create_line_from_counter`.

    Parameters:
        sub_ds: iterable of dicts with the keys 'sentiment' (holding 'neg' and
            'pos'), 'fillmore', 'subjects' and 'objects'.
        f_path: destination file path; an existing file is overwritten.
    """
    # `with` closes the file even if a document is malformed; the explicit
    # encoding keeps Cyrillic tokens independent of the platform's default.
    with open(f_path, 'w', encoding='utf-8') as file:
        for i, doc in enumerate(sub_ds):
            sections = (
                create_line_from_counter('neg_pol', doc['sentiment']['neg']),
                create_line_from_counter('pos_pol', doc['sentiment']['pos']),
                create_line_from_counter('fillmore', doc['fillmore']),
                create_line_from_counter('subjects', doc['subjects']),
                create_line_from_counter('objects', doc['objects']),
            )
            file.write(str(i) + ''.join(sections) + '\n')

In [42]:
# doc_id -> text_group_id lookup, and the set of distinct group ids.
mapping_to_group_name = dict(np.stack([toloka['doc_id'].values, toloka['text_group_id'].values]).T.tolist())
files_ = set(mapping_to_group_name.values())
# Make sure the output directory exists; `exist_ok=True` keeps the cell
# re-runnable (plain `os.makedirs` raises once the directories exist).
os.makedirs(p + 'vw', exist_ok=True)
for f_name in files_:
    os.makedirs(p + f'batches/{f_name}', exist_ok=True)
    f_path = p + f'vw/{f_name}_vw'
    # Documents of this group in ascending doc_id order; line i of the written
    # file therefore corresponds to the i-th smallest doc_id of the group.
    sub_ds = sorted(filter(lambda x: x['text_group_id'] == f_name, input_ds), key=lambda x: int(x['doc_id']))
    write_group(sub_ds,f_path)


In [44]:
# One BatchVectorizer per text group, reading that group's Vowpal Wabbit file
# and storing its batches in the group's own folder.
batch_vectorizers = {
    f_name: artm.BatchVectorizer(
        data_path=p + f'vw/{f_name}_vw',
        collection_name='',
        data_format='vowpal_wabbit',
        batch_size=100,
        target_folder=p + f'batches/{f_name}',
    )
    for f_name in files_
}

# The dictionary that each vectorizer exposes, keyed by the same group id.
dictionaries = {
    f_name: batch_vectorizer.dictionary
    for f_name, batch_vectorizer in batch_vectorizers.items()
}


In [45]:
# For every row: the worker's class id (`mark`) minus the number of "trash"
# flags ('not_pol_flag', 'not_in_class') present anywhere in the same
# (text group, worker) markup.
# `transform` returns the per-group value for every original row, so the result
# is aligned with `toloka` row by row. The previous version subtracted two
# differently indexed Series and assigned `.values` positionally, which depends
# on the order produced by index alignment rather than on the frame's row order.
trash_flags_in_group = (
    toloka.groupby(['text_group_id', 'worker_id'])[['not_pol_flag', 'not_in_class']]
    .transform('max')
    .sum(axis=1)
)
toloka['mark_wo_trash'] = toloka['mark'].astype(int) - trash_flags_in_group

In [46]:
# Largest `mark_wo_trash` per (text group, worker) markup.
classes_in_group = toloka.groupby(['text_group_id', 'worker_id'])['mark_wo_trash'].max().reset_index()
# Template {group id: {distinct class count: None}}; the prediction functions
# copy it and replace each None with their output.
prediction_form = {
    gr_id: {m: None for m in grouped['mark_wo_trash'].unique()}
    for gr_id, grouped in classes_in_group.groupby('text_group_id')
}

In [47]:
# For every (worker, text group) markup: a boolean mask over the group's doc ids
# in ascending order, True where the worker's markup contains that document.
lost_idx_in_sorted = {}
for group_key, doc_grouped in toloka.groupby(['worker_id', 'text_group_id']):
    group_id = group_key[1]
    group_doc_ids = sorted(
        doc_id for doc_id, owner in mapping_to_group_name.items() if owner == group_id
    )
    marked_doc_ids = set(doc_grouped['doc_id'])
    lost_idx_in_sorted[group_key] = [doc_id in marked_doc_ids for doc_id in group_doc_ids]

## Modeling

In [48]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [49]:
def topic_model(class_ids, dictionary, num_of_topics, tau, tf):
    """Build (without fitting) an ARTM model with `num_of_topics + 3` topics.

    Topics are named '0', '1', ... . The model gets three scores (top tokens,
    perplexity, phi sparsity) and four regularizers driven by `tau`:
    a phi regularizer with `-tau` on all topics but the first, a phi regularizer
    with `+tau` on the first topic, a decorrelator with `tau` on the topics from
    the third one on, and a theta regularizer with `-tau` on all but the first.

    Parameters:
        class_ids: modality weights handed to the model and the phi regularizers.
        dictionary: ARTM dictionary; filtered by `min_tf=tf` for the 'subjects',
            'objects' and 'pairs' modalities before the model is created.
        num_of_topics: number of topics requested on top of the 3 extra ones.
        tau: regularization coefficient.
        tf: minimal term frequency used for the dictionary filtering.

    NOTE(review): this definition replaces the `topic_model` name imported from
    `experiment` at the top of the notebook and takes a different argument list.
    """
    total_topics = num_of_topics + 3
    names_of_topics = list(map(str, range(total_topics)))

    # The return value of filter() is ignored, so this relies on the dictionary
    # being updated by the call itself.
    for modality in ('subjects', 'objects', 'pairs'):
        dictionary.filter(min_tf=tf, class_id=modality)

    model = artm.ARTM(
        num_topics=total_topics,
        cache_theta=True,
        topic_names=names_of_topics,
        class_ids=class_ids,
        reuse_theta=True,
        dictionary=dictionary,
        seed=42,
    )

    first_topic = model.topic_names[0]
    all_but_first = model.topic_names[1:]
    from_third_on = model.topic_names[2:]

    model.scores.add(artm.TopTokensScore(name='top-tokens', num_tokens=10))
    model.scores.add(artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
    model.scores.add(artm.SparsityPhiScore(name='SparcityPhiScore', topic_names=all_but_first))

    model.regularizers.add(artm.SmoothSparsePhiRegularizer(
        name='SparsePhiRegularizer',
        class_ids=class_ids,
        topic_names=all_but_first,
        tau=-tau,
    ))
    model.regularizers.add(artm.SmoothSparsePhiRegularizer(
        name='SmoothPhiRegularizer',
        class_ids=class_ids,
        topic_names=first_topic,
        tau=tau,
    ))
    model.regularizers.add(artm.DecorrelatorPhiRegularizer(
        name='DecorrelatorRegularizer',
        class_ids=class_ids,
        topic_names=from_third_on,
        tau=tau,
    ))
    model.regularizers.add(artm.SmoothSparseThetaRegularizer(
        name='SparseThetaRegularizer',
        topic_names=all_but_first,
        tau=-tau,
    ))
    return model

In [50]:
import copy

def topic_model_predict(
    class_ids,
    prediction_form,
    tau,
    num_collection_passes=40
):
    """Fit a topic model for every (text group, class count) slot of the form.

    Works on a deep copy of `prediction_form`, so the argument is left untouched.
    For each group in `files_` and each key of that group's sub-dict a model is
    built with `topic_model(..., num_of_topics=int(key), tau=tau, tf=2)` and
    fitted on the group's batches. The first row of theta is dropped, the rest is
    transposed, and the slot is filled with
    `{'theta': <matrix>, 'y_pred': <argmax over its columns per row>}`.

    Returns:
        The filled copy of `prediction_form`.
    """
    filled_form = copy.deepcopy(prediction_form)
    for f_name in files_:
        group_dictionary = dictionaries[f_name]
        group_batches = batch_vectorizers[f_name]
        group_slots = filled_form[f_name]

        for n_ in group_slots.keys():
            model = topic_model(class_ids, group_dictionary, num_of_topics=int(n_), tau=tau, tf=2)
            model.fit_offline(
                batch_vectorizer=group_batches,
                num_collection_passes=num_collection_passes,
            )
            doc_topic = model.get_theta().to_numpy()[1:].T
            group_slots[n_] = {
                'theta': doc_topic,
                'y_pred': doc_topic.argmax(axis=1),
            }

    return filled_form

In [51]:
def print_arr(arr):
  """Print a header line, then `arr[0]`, `arr[1]`, ... each on its own line."""
  print('Printing the array')
  for position in range(len(arr)):
    element = arr[position]
    print(element)

In [52]:
def topic_model_evaluate(
    class_ids,
    prediction_form,
    tau,
    predictor=topic_model_predict,
    verbose=False,
    results_return=False
):
    """Score topic-model predictions against every worker's markup and average.

    `predictor(class_ids, prediction_form, tau)` is called once. For each
    (worker, text group) markup in `toloka`, the prediction stored under the
    group's largest `mark_wo_trash` is taken, restricted to the documents the
    worker labelled (via `lost_idx_in_sorted`), and compared with the worker's
    `mark` values through `get_report`. Precision, recall and F1 are added for
    two binary checks: predicted topic 1 against 'not_pol_flag' and predicted
    topic 2 against 'not_in_class'.

    Parameters:
        verbose: wrap the markup iteration in a tqdm progress bar.
        results_return: also return the raw predictor output under 'y_preds'.

    Returns:
        `{'metrics': <mean of each metric over all markups>}`, plus 'y_preds'
        when `results_return` is true. 'metrics' is empty if there are no markups.
    """
    y_preds = predictor(class_ids,prediction_form,tau)

    iterator_ = toloka.groupby(['worker_id', 'text_group_id'])
    if verbose:
        iterator_ = tqdm(iterator_)

    metrics = []
    for (w_id, t_g_id), doc_grouped in iterator_:
        # Take the predicted vector and keep only the documents this worker marked.
        n_topics = doc_grouped['mark_wo_trash'].max()
        y_pred = np.array(y_preds[t_g_id][n_topics]['y_pred'])[lost_idx_in_sorted[(w_id, t_g_id)]]
        y_true = doc_grouped['mark'].values
        group_metrics = get_report(y_true, y_pred)

        # Check how well the "not polarized" and "not in class" documents are found.
        not_pol_pred, not_pol_true = y_pred == 1, doc_grouped['not_pol_flag'].values
        not_in_class_pred, not_in_class_true = y_pred == 2, doc_grouped['not_in_class'].values
        # scikit-learn metrics take (y_true, y_pred); with the arguments reversed
        # the reported precision and recall were swapped.
        group_metrics.update({
            'not_pol_precision': precision_score(not_pol_true, not_pol_pred),
            'not_pol_recall': recall_score(not_pol_true, not_pol_pred),
            'not_pol_F1': f1_score(not_pol_true, not_pol_pred),
            'not_in_class_precision': precision_score(not_in_class_true, not_in_class_pred),
            'not_in_class_recall': recall_score(not_in_class_true, not_in_class_pred),
            'not_in_class_F1': f1_score(not_in_class_true, not_in_class_pred)
        })
        metrics.append(group_metrics)

    n_groups = len(metrics)
    mean_metrics = {}
    if n_groups:
        for k in metrics[0].keys():
            mean_metrics[k] = sum(value[k] for value in metrics) / n_groups

    return_ = {'metrics': mean_metrics}
    if results_return:
        return_.update({'y_preds':y_preds})

    return return_
        

### Adjust tau when all modalities on:

In [53]:
# Accumulates one result dict per experiment; later cells append to it and dump it to disk.
RESULTS = []

In [54]:
# Sweep tau over 0.1, 0.2, ... with every modality weighted equally, and keep the
# evaluation (including raw predictions) of each step under 'all_modalities'.
class_ids = dict(neg_pol=1, pos_pol=1, fillmore=1, subjects=1, objects=1)
results = dict(name='all_modalities', metrics=[])
for tau_value in tqdm(np.arange(0.1, 3, 0.1), position=0, leave=True):
    evaluation = topic_model_evaluate(
        class_ids,
        prediction_form,
        tau_value,
        predictor=topic_model_predict,
        results_return=True,
    )
    results['metrics'].append(evaluation)

RESULTS.append(results)

100%|██████████| 29/29 [10:56<00:00, 22.65s/it]


### Defining tau


In [55]:
# tau passed to topic_model_predict for the final model; also the divisor of
# every weight in CLASS_IDS.
grand_tau = 1.5

In [56]:
# Factor applied to the polarity weights ('neg_pol', 'pos_pol') in CLASS_IDS.
pol_tau = 0.75

In [57]:
# Factor applied to the 'fillmore' weight in CLASS_IDS.
roles_tau = 0.5

In [58]:
# Factor applied to the 'subjects' and 'objects' weights in CLASS_IDS.
spo_tau = 1

### Find balance

#### Neg/pos

In [59]:
# Share of the polarity weight given to 'neg_pol'; 'pos_pol' receives 1 - neg_part.
neg_part = 0.85

#### Subj/obj

In [60]:
# Re-assigns spo_tau to the same value it was given in an earlier cell.
spo_tau = 1

In [61]:
# Share of the subject/object weight given to 'subjects'; 'objects' receives 1 - subj_part.
subj_part = 0.85

#### SPO/Roles/Sent

In [62]:
# Relative shares of the three modality families in CLASS_IDS (they sum to 1):
# subjects/objects, 'fillmore', and polarity.
spo_part = 0.6
roles_part = 0.2
pol_part = 0.2

#### Old/new features

In [63]:
# Common factor multiplied into every weight of CLASS_IDS.
# NOTE(review): presumably the share reserved for the "old" features — confirm.
old_part = 0.6

In [64]:
# Final modality weights for the topic model. Every weight is
# old_part * <family tau> * <family share> / grand_tau; the families that are split
# between two modalities additionally carry a factor 2 and the split share.
CLASS_IDS = {
    # polarity family, split by neg_part
    'neg_pol': old_part*2*pol_tau*neg_part*pol_part/grand_tau,
    'pos_pol': old_part*2*pol_tau*(1-neg_part)*pol_part/grand_tau,
    # roles family (single modality, no factor 2)
    'fillmore': old_part*roles_tau*roles_part/grand_tau,
    # subject/object family, split by subj_part
    'subjects': old_part*2*spo_tau*subj_part*spo_part/grand_tau,
    'objects': old_part*2*spo_tau*(1-subj_part)*spo_part/grand_tau,
}

# Display the resulting weights as the cell output.
CLASS_IDS

{'fillmore': 0.04,
 'neg_pol': 0.102,
 'objects': 0.07200000000000001,
 'pos_pol': 0.018000000000000002,
 'subjects': 0.408}

## Modeling with semantic dist

### Prepare different types of similarities

### Find optimal balance between different types of vectors

In [65]:
# Per-group document-by-document similarity matrices, keyed by group id.
mean_sims = {}
min_sims = {}
# 'add_emb'
text_sims = {}
title_sims = {}
tf_idf_sims = {}

# TfidfVectorizer documents `stop_words` as 'english', a list or None; recent
# scikit-learn releases validate this and reject a set, so hand over a list.
stop_word_list = sorted(stop_words)

for f_name in tqdm(files_):
    # Documents of the group in ascending doc_id order.
    sub_ds = sorted(filter(lambda x: x['text_group_id'] == f_name, input_ds), key=lambda x: int(x['doc_id']))

    text_emb = np.stack([doc['add_emb']['text_embedding'] for doc in sub_ds])
    title_emb = np.stack([doc['add_emb']['title_embedding'] for doc in sub_ds])

    text_sims[f_name] = cosine_similarity(text_emb)
    title_sims[f_name] = cosine_similarity(title_emb)

    # Sentence-level similarity: for every pair of documents compare all of their
    # sentence embeddings and keep the mean and the minimum cosine similarity.
    sent_emb = [doc['sent_embeddings'] for doc in sub_ds]
    n_docs = len(sub_ds)
    mean_sim = np.zeros((n_docs, n_docs))
    min_sim = np.zeros((n_docs, n_docs))

    for (i, emb_1), (j, emb_2) in combinations(enumerate(sent_emb), 2):
        pair_sim = cosine_similarity(emb_1, emb_2)
        mean_sim[i][j] = pair_sim.mean()
        min_sim[i][j] = pair_sim.min()

    # Only the upper triangle was filled; mirror it to get a symmetric matrix.
    # NOTE(review): the diagonal stays 0 here, whereas the cosine_similarity
    # matrices above compare each document with itself — confirm this is intended.
    mean_sims[f_name] = mean_sim.T + mean_sim
    min_sims[f_name] = min_sim.T + min_sim

    vectorizer = TfidfVectorizer(stop_words=stop_word_list)
    tf_idf_data = vectorizer.fit_transform([doc['lemmatized_text'] for doc in sub_ds])
    tf_idf_sims[f_name] = cosine_similarity(tf_idf_data.toarray())

100%|██████████| 30/30 [00:09<00:00,  3.04it/s]


In [66]:
import copy

# Largest `mark` per (text group, worker) markup; this rebinds `classes_in_group`.
classes_in_group = toloka.groupby(['text_group_id', 'worker_id'])['mark'].max().reset_index()
# Template {group id: {distinct largest mark: None}} for the clustering predictors.
prediction_form_2 = {
    gr_id: {m: None for m in grouped['mark'].unique()}
    for gr_id, grouped in classes_in_group.groupby('text_group_id')
}

def cluster_model_predict(
    X,
    prediction_form,
    topic_m_addition = None,
    topic_part = None #Only when topic_m_addition passed works
):
    """Cluster every group's matrix with KMeans for each requested cluster count.

    Works on a deep copy of `prediction_form`. For each group in `files_` and each
    key of the group's sub-dict, `KMeans(n_clusters=int(key))` is fitted on
    `X[group]` and the slot is replaced with `{'y_pred': <cluster labels>}`.
    `topic_m_addition` and `topic_part` are accepted but not used.

    Returns:
        The filled copy of `prediction_form`.
    """
    filled_form = copy.deepcopy(prediction_form)
    for f_name in files_:
        group_slots = filled_form[f_name]
        for n_ in group_slots.keys():
            clusterer = KMeans(n_clusters=int(n_))
            group_slots[n_] = {'y_pred': clusterer.fit_predict(X[f_name])}

    return filled_form

def cluster_model_evaluate(
    X,
    prediction_form,
    predictor=cluster_model_predict,
    verbose=False,
    results_return=False
):
    """Score clustering predictions against every worker's markup and average.

    `predictor(X, prediction_form)` is called once. For each (worker, text group)
    markup in `toloka`, the prediction stored under the markup's largest `mark`
    is restricted to the documents the worker labelled (via `lost_idx_in_sorted`)
    and compared with the worker's `mark` values through `get_report`.

    Returns:
        `{'metrics': <mean of each metric over all markups>}`, plus 'y_preds'
        (the raw predictor output) when `results_return` is true.
    """
    y_preds = predictor(X, prediction_form)

    markups = toloka.groupby(['worker_id', 'text_group_id'])
    if verbose:
        markups = tqdm(markups)

    metrics = []
    for (w_id, t_g_id), doc_grouped in markups:
        # Pick the prediction for this markup and drop documents the worker skipped.
        n_topics = doc_grouped['mark'].max()
        labelled_mask = lost_idx_in_sorted[(w_id, t_g_id)]
        y_pred = np.array(y_preds[t_g_id][n_topics]['y_pred'])[labelled_mask]
        metrics.append(get_report(doc_grouped['mark'].values, y_pred))

    n_groups = len(metrics)
    mean_metrics = {
        k: sum([value[k] for value in metrics]) / n_groups
        for k in metrics[0].keys()
    }

    return_ = {'metrics': mean_metrics}
    if results_return:
        return_['y_preds'] = y_preds
    return return_
       
    
def get_mean_report(n_exp, X):
    """Run `cluster_model_evaluate` on `X` `n_exp` times and average each metric."""
    runs = [
        cluster_model_evaluate(
            X,
            prediction_form_2,
            predictor=cluster_model_predict,
            verbose=False,
            results_return=False
        )['metrics']
        for _ in range(n_exp)
    ]
    return {
        k: sum([run[k] for run in runs]) / n_exp
        for k in runs[0].keys()
    }

def dist_experiment(dist_1, dist_2, max_n_exps):
    """Evaluate blends `w * dist_1 + (1 - w) * dist_2` for w = 0, 0.05, ..., per group.

    Returns:
        `(results, tiks)`: the mean report of every blend (from `get_mean_report`
        with `max_n_exps` runs) and the array of weights `w` that were tried.
    """
    tiks = np.arange(0, 1.05, 0.05)
    results = []
    for weight in tqdm(tiks):
        blended = {f: weight * dist_1[f] + (1-weight) * dist_2[f] for f in files_}
        results.append(get_mean_report(max_n_exps, blended))
    return results, tiks

#### Mean/min

In [67]:
# Blend of the sentence-level mean and min similarities, per group.
mean_part = 0.8
mean_min_sims = {
    f: mean_part * 2 * mean_sims[f] + (1-mean_part) * 2 * min_sims[f]
    for f in files_
}

#### Title/text

In [68]:
# Blend of the title-embedding and text-embedding similarities, per group.
title_part = 0.2
title_text_sims = {
    f: title_part * 2 * title_sims[f] + (1-title_part) * 2 * text_sims[f]
    for f in files_
}

#### Global balance in dist

In [69]:
# Shares of the three similarity sources in the combined matrix (they sum to 1).
mean_min_part = 0.1
title_text_part = 0.7
tf_idf_part = 0.2

# Weighted sum of the three similarity matrices, per group.
total_dist = {
    f: (
        mean_min_part * 3 * mean_min_sims[f]
        + title_text_part * 3 * title_text_sims[f]
        + tf_idf_part * 3 * tf_idf_sims[f]
    )
    for f in files_
}

#### Total 

In [70]:
# Fit the topic models with the tuned weights and tau.
t_model_prediction = topic_model_predict(CLASS_IDS, prediction_form, grand_tau)

# Per group: cosine similarity between the documents' theta rows, averaged over
# all class counts that were modelled for the group.
topic_sims = {}
for f_name in files_:
    group_predictions = t_model_prediction[f_name]
    similarity_sum = sum(
        cosine_similarity(group_predictions[n_]['theta'])
        for n_ in group_predictions.keys()
    )
    topic_sims[f_name] = similarity_sum / len(group_predictions.keys())

In [71]:
# Sweep the blend between the topic-model similarities and the combined
# embedding/TF-IDF similarities, 50 clustering runs per blend weight.
total_results, tiks = dist_experiment(topic_sims, total_dist, max_n_exps=50)

# The weights tried for `topic_sims` are stored under the key 'title_part'.
total_balance_entry = dict(
    name='total_balance',
    title_part=tiks,
    metrics=total_results,
)
RESULTS.append(total_balance_entry)

100%|██████████| 21/21 [21:57<00:00, 62.75s/it]


In [72]:
# Persist all collected experiment results with dill; the path is relative to
# the current working directory.
with open('GLOBAL_RESULTS.pickle', 'wb') as f:
    dill.dump(RESULTS, f)

In [73]:
# Read the results back from the file written by the previous cell. The same
# relative path and the same serializer (dill) are used, so the cell no longer
# depends on the working directory being /content.
# NOTE: dill/pickle files can execute arbitrary code on load; only open trusted files.
with open('GLOBAL_RESULTS.pickle', "rb") as file:
    data_new = dill.load(file)
print_arr(data_new[0])

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.11593436, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.10099502,
        0.        , 0.        ],
       [0