In [None]:

from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

#grid search to find optimal parameters
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import GridSearchCV

# Dimension reduction and clustering libraries
import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# NOTE(review): neither `tqdm` nor `stopwords` is imported in the import cell
# shown above - confirm they are imported before this cell runs.
# Enable tqdm's pandas integration (presumably for progress_apply-style calls
# elsewhere in the notebook - confirm).
tqdm.pandas()
# Russian stop-word list; elmo_preprocess and bert_preprocess drop tokens found in it.
russian_stopwords = stopwords.words("russian")

In [None]:
def remove_target_word(context, positions):
    """Cut the target-word spans out of ``context``.

    Parameters
    ----------
    context : str
        The text containing the target word.
    positions : str
        Comma-separated ``start-end`` character offsets (for example
        ``"0-5,339-344"``). Each pair is used as the slice
        ``context[start:end]``, i.e. ``end`` is exclusive.

    Returns
    -------
    str
        ``context`` with every listed span removed. Text outside the spans is
        left untouched, even if it happens to equal the target word.
    """
    spans = []
    for chunk in positions.split(','):
        if not chunk.strip():
            # Tolerate an empty positions string or a trailing comma.
            continue
        bounds = chunk.split('-')
        spans.append((int(bounds[0]), int(bounds[1])))

    # Walk the spans left to right and keep only the text between them; this
    # removes by position (not by value) and copes with overlapping spans.
    kept = []
    cursor = 0
    for start, end in sorted(spans):
        if start > cursor:
            kept.append(context[cursor:start])
        cursor = max(cursor, end)
    kept.append(context[cursor:])
    return ''.join(kept)

In [None]:
def elmo_preprocess(text):
    """Split ``text`` into tokenised sentences for the ELMo embedder.

    The text is split on ``.``; each piece is tokenised with ``word_tokenize``.
    Tokens that are Russian stop words, empty/blank, or punctuation are dropped,
    and in the remaining tokens every character that is not a Cyrillic letter,
    ``.``, ``!``, ``?`` or a space is replaced by a space.

    Parameters
    ----------
    text : str
        Raw context.

    Returns
    -------
    list[list[str]]
        One token list per sentence; sentences left with no tokens are omitted.
    """
    result = []
    for sentence in text.split('.'):
        # 'Ё'/'ё' lie outside the А-Я / а-я code-point ranges, so they have to be
        # listed explicitly; otherwise words such as "ещё" are turned into "ещ ".
        # NOTE(review): a token made only of other characters (e.g. digits) still
        # becomes a whitespace-only token here - confirm that is wanted for ELMo.
        tokens = [
            re.sub('[^А-Яа-яЁё.!? ]', ' ', token)
            for token in word_tokenize(sentence)
            if token not in russian_stopwords
            and token != " " and token != ""
            and token.strip() not in punctuation
        ]
        result.append(tokens)
    return [x for x in result if x]

In [None]:
def bert_preprocess(text):
    """Split ``text`` into cleaned sentence strings for the BERT embedder.

    The text is split on ``.``; each piece is tokenised with ``word_tokenize``.
    Tokens that are Russian stop words, empty/blank, or punctuation are dropped,
    every character that is not a Cyrillic letter, ``.``, ``!``, ``?`` or a
    space is replaced by a space, and the surviving tokens are joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw context.

    Returns
    -------
    list[str]
        One cleaned string per sentence; empty strings are omitted.
    """
    result = []
    for sentence in text.split('.'):
        # 'Ё'/'ё' lie outside the А-Я / а-я code-point ranges, so they have to be
        # listed explicitly; otherwise words such as "ещё" are turned into "ещ ".
        tokens = [
            re.sub('[^А-Яа-яЁё.!? ]', ' ', token)
            for token in word_tokenize(sentence)
            if token not in russian_stopwords
            and token != " " and token != ""
            and token.strip() not in punctuation
        ]
        sentence = ' '.join(tokens)
        result.append(sentence)
    return [x for x in result if x]

In [None]:
# Load the ELMo model from the DeepPavlov-hosted archive; only the "elmo"
# output is requested from the embedder.
elmo = ELMoEmbedder("http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz",\
                    elmo_output_names=["elmo"])

In [None]:
# Load the BERT embedder config, then override its first download URL and the
# BERT_PATH variable so both point at the rubert_cased_L-12_H-768_A-12_pt archive.
bert_config = read_json(configs.embedder.bert_embedder)
bert_config['metadata']['download'][0]['url'] = 'http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_pt.tar.gz'
bert_config['metadata']['variables']['BERT_PATH'] = '{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_pt'

In [None]:
# Build the BERT model from the config prepared above.
# download=True presumably makes build_model fetch the files listed in the
# config's download section - confirm against the DeepPavlov docs.
bert = build_model(bert_config, download=True)

In [None]:
def extract_token_vectors(bert_context_vectors, context, positions):
    """Collect the BERT vectors of the target-word tokens in one context.

    Parameters
    ----------
    bert_context_vectors : sequence
        Item ``0`` is a sequence of token lists (one per sentence); item ``1``
        holds, at the same sentence index, the per-token vectors.
    context : str
        The text the offsets refer to.
    positions : str
        Comma-separated ``start-end`` character offsets. The word is taken as
        ``context[start:end + 1]``, i.e. ``end`` is treated as inclusive.
        NOTE(review): remove_target_word slices ``context[start:end]`` for the
        same kind of string - confirm which convention the data uses.

    Returns
    -------
    numpy.ndarray or None
        The matching token vectors stacked row-wise, or ``None`` when
        ``np.vstack`` raises ``ValueError`` (e.g. nothing matched).
    """
    target_words = [
        context[int(bounds[0]):int(bounds[1]) + 1]
        for bounds in (span.split('-') for span in positions.split(','))
    ]

    # A word only matches a token that equals it exactly, and only the first
    # such token of each sentence is used (list.index).
    matched = [
        bert_context_vectors[1][sentence_idx][sentence_tokens.index(target)]
        for target in target_words
        for sentence_idx, sentence_tokens in enumerate(bert_context_vectors[0])
        if target in sentence_tokens
    ]

    try:
        return np.vstack(matched)
    except ValueError:
        return None

In [None]:
def elmo_vectorize(elmo_preprocessed_text):
    """Pass preprocessed text to the notebook-level ``elmo`` embedder.

    Returns whatever the embedder returns, unchanged.
    """
    embeddings = elmo(elmo_preprocessed_text)
    return embeddings

def bert_vectorize(bert_preprocessed_text):
    """Pass preprocessed text to the notebook-level ``bert`` model.

    Returns whatever the model returns, unchanged.
    """
    model_output = bert(bert_preprocessed_text)
    return model_output

In [None]:
def average_pooler_outputs_mean(vectors):
    """Average the BERT pooler outputs of one context into a single vector.

    Parameters
    ----------
    vectors : sequence
        Output of the BERT embedder; item ``6`` is read as the pooler outputs.
        Assumes that item is 2-D, one row per sentence - TODO confirm.

    Returns
    -------
    numpy.ndarray
        Column-wise mean of ``vectors[6]`` (one value per embedding dimension).
    """
    pooler_outputs = np.asarray(vectors[6])
    # Mean over the rows; same result as averaging every column by hand.
    return pooler_outputs.mean(axis=0)

In [None]:
def stack_vectors(bert_embeddings):
    """Stack the first row of every embedding into one matrix.

    Parameters
    ----------
    bert_embeddings : iterable
        Each item is indexable; only its element ``0`` is used.

    Returns
    -------
    numpy.ndarray
        ``np.vstack`` of those first elements, in iteration order.
    """
    first_rows = [embedding[0] for embedding in bert_embeddings]
    return np.vstack(first_rows)

In [None]:
# Project the embedding matrix X with UMAP (default settings otherwise).
# NOTE(review): X is not defined at notebook scope in the cells shown here -
# it must be produced earlier (e.g. via stack_vectors) for this cell to run.
# UMAP is stochastic; a fixed random_state makes the layout identical on every
# Restart & Run All.
reducer = umap.UMAP(random_state=42)
standard_embedding = reducer.fit_transform(X)
standard_embedding.shape

In [None]:
# One RGB colour per sample: each gold sense id indexes seaborn's current palette.
# NOTE(review): X_gold_senses must already exist at notebook scope when this runs.
c = [
    sns.color_palette()[sense_id]
    for sense_id in X_gold_senses
]

In [None]:
# Scatter the 2-D projection, one point per context, coloured by gold sense.
# `c` (built in the previous cell) already holds explicit RGB colours, so no
# `cmap` is passed: matplotlib ignores a colormap when `c` is a list of colours.
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=c);

# plt.title("UMAP projection of BERT embeddings of word 'белок'", fontsize=24);

In [None]:
def _silhouette_scorer(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of its predictions on ``X``.

    Returns ``-1.0`` (the worst silhouette value) when the prediction yields
    fewer than two clusters or one cluster per sample, where the silhouette
    coefficient is undefined.
    """
    from sklearn.metrics import silhouette_score

    labels = estimator.predict(X)
    n_labels = len(set(labels.tolist()))
    if n_labels < 2 or n_labels >= len(labels):
        return -1.0
    return silhouette_score(X, labels, metric='sqeuclidean')


def find_parameters(X):
    """Grid-search the ``preference`` of Affinity Propagation on ``X``.

    Candidate preferences are up to 30 distinct integers spread evenly between
    the minimum and the median of the similarity matrix (negative squared
    Euclidean distances). Candidates are ranked by cross-validated silhouette
    score, because ``AffinityPropagation`` has no ``score`` method of its own.

    Parameters
    ----------
    X : array-like
        Sample matrix, one row per sample.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object (see ``best_params_`` / ``best_estimator_``).
    """
    from sklearn.cluster import AffinityPropagation

    # `squared=True` is only understood by the euclidean metric; negative
    # squared euclidean distance is also AffinityPropagation's default similarity.
    S = -pairwise_distances(X, metric='euclidean', squared=True)
    # astype(int) truncates toward zero exactly like int(); np.unique removes
    # the duplicates that truncation creates.
    preferences = np.unique(np.linspace(np.min(S), np.median(S), 30).astype(int))
    # GridSearchCV expects a mapping from parameter name to candidate values.
    param_grid = {'preference': preferences.tolist()}
    search = GridSearchCV(AffinityPropagation(), param_grid,
                          scoring=_silhouette_scorer, verbose=0)
    return search.fit(X)

In [None]:
def cluster_via_affinity_prop(matrix, gold_senses=list_of_dom_gold_senses, preference=preference):
    """Cluster ``matrix`` with Affinity Propagation and print quality metrics.

    NOTE: both defaults are bound when this cell is executed, so the notebook
    variables ``list_of_dom_gold_senses`` and ``preference`` must already exist
    at that point; later changes to them do not affect the defaults.

    Parameters
    ----------
    matrix : array-like
        Sample matrix, one row per sample.
    gold_senses : sequence
        Reference label of every row of ``matrix``.
    preference : float or array-like or None
        Forwarded to ``AffinityPropagation(preference=...)``.

    Returns
    -------
    numpy.ndarray
        The predicted cluster label of every row (``labels_`` of the model).
    """
    # Imported locally: the notebook's import cell brings in neither name.
    from sklearn import metrics
    from sklearn.cluster import AffinityPropagation

    af = AffinityPropagation(preference=preference).fit(matrix)
    labels = af.labels_
    n_clusters_ = len(af.cluster_centers_indices_)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(gold_senses, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(gold_senses, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(gold_senses, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(gold_senses, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(gold_senses, labels))

    # silhouette_score raises unless 2 <= n_labels <= n_samples - 1, which is
    # violated e.g. when the algorithm does not converge (all labels are -1).
    n_labels = len(set(labels.tolist()))
    if 2 <= n_labels < len(labels):
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(matrix, labels, metric='sqeuclidean'))
    else:
        print("Silhouette Coefficient: undefined for %d cluster(s)" % n_labels)

    return labels

In [1]:
def cluster_word_senses(df, preference):
    """Cluster the contexts of every word with Affinity Propagation.

    For each distinct value of ``df['word']`` the first row of every item in
    ``target_words_bert_embs`` is stacked into a matrix and clustered. The
    resulting labels are written back to exactly the rows they were computed
    from, so the frame does not need to be sorted or grouped by word.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``word``, ``target_words_bert_embs`` and
        ``gold_sense_id``. The frame is modified in place.
    preference : float or array-like or None
        Forwarded to ``AffinityPropagation(preference=...)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with two string-valued columns set:
        ``predict_sense_id`` (cluster label within the word) and ``gold_senses``
        (gold sense id shifted so that the smallest id of each word is 0).
    """
    # Imported locally: the notebook's import cell does not bring this name in.
    from sklearn.cluster import AffinityPropagation

    # Positional buffers, filled word by word through a boolean row mask.
    predicted = np.empty(len(df), dtype=object)
    gold_senses = np.empty(len(df), dtype=object)

    for query in df['word'].unique():
        print('Now analyzing', query, '...')
        word_rows = (df['word'] == query).to_numpy()
        df_word = df[word_rows]

        X = np.vstack([vector[0] for vector in df_word['target_words_bert_embs']])
        print('emb shape: ', X.shape)

        X_gold_senses = [int(i) for i in df_word['gold_sense_id']]
        min_gold = min(X_gold_senses)
        X_gold_senses = [i - min_gold for i in X_gold_senses]
        print('gold', X_gold_senses)
        print('len gold_senses: ', len(X_gold_senses))

        clustering = AffinityPropagation(preference=preference).fit(X)

        gold_senses[word_rows] = [str(i) for i in X_gold_senses]
        predicted[word_rows] = [str(i) for i in clustering.labels_.tolist()]

    # Item assignment always creates/overwrites a column; attribute assignment
    # (df.gold_senses = ...) would only set a Python attribute for a new name.
    df['predict_sense_id'] = predicted
    df['gold_senses'] = gold_senses
    return df

In [None]:
def ari_per_word_weighted(df):
    """Final evaluation: Adjusted Rand Index per word, weighted by context count.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``word``, ``gold_sense_id`` and ``predict_sense_id``.

    Returns
    -------
    tuple
        ``(weighted_ari, words)`` where ``words`` maps every word to
        ``(ari, number_of_rows)`` and ``weighted_ari`` is the mean of the
        per-word ARI values weighted by those row counts.

    Raises
    ------
    AssertionError
        If the per-word row counts do not add up to ``len(df)`` (this happens
        e.g. when ``word`` is missing in some rows).
    """
    words = {}
    for word in df['word'].unique():
        df_word = df.loc[df['word'] == word]
        ari = adjusted_rand_score(df_word['gold_sense_id'], df_word['predict_sense_id'])
        words[word] = (ari, len(df_word))

    print(words)

    cumsum = sum(ari * count for ari, count in words.values())
    total = sum(count for _, count in words.values())

    print(cumsum, total)

    assert total == len(df), (
        'per-word row counts (%d) do not add up to len(df) (%d)' % (total, len(df))
    )

    return cumsum / total, words