In [24]:
import numpy.linalg as LA
import yaml
# Load the notebook configuration once. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): if config.yml only holds plain scalars/lists/mappings,
# yaml.safe_load is the more conservative loader - confirm before switching.
with open("config.yml") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [3]:
def cosine_function(a, b):
    """Return the cosine similarity of `a` and `b`, rounded to 3 decimals.

    The value is ``np.inner(a, b)`` divided by the product of ``LA.norm(a)``
    and ``LA.norm(b)``. Zero-norm inputs are not special-cased.
    """
    dot_product = np.inner(a, b)
    norm_product = LA.norm(a) * LA.norm(b)
    return np.round(dot_product / norm_product, 3)

In [7]:
def embed_with_siamese_bert(model, sentences):
    """Encode `sentences` with `model.encode` and return its result unchanged."""
    return model.encode(sentences)

In [5]:
def compute_PCA(embeddings):
    """Scale `embeddings` with StandardScaler, then reduce them to 3 PCA components.

    Prints the shape of the reduced array and the elapsed time measured with
    `time()`, and returns the reduced array.
    """
    started_at = time()
    standardized = StandardScaler().fit_transform(embeddings)
    projected = PCA(n_components=3).fit_transform(standardized)
    print("Final shape : ", projected.shape)
    print("time : ", (time() - started_at))
    return projected

In [6]:
def PCA_mle(embeddings):
    """Reduce `embeddings` with PCA using ``n_components='mle'``.

    Args:
        embeddings: sequence of equally sized rows; the length of the first
            row is taken as the number of columns.

    Returns:
        The array produced by ``fit_transform`` on the StandardScaler-scaled
        input, or None when `embeddings` is empty or has fewer rows than
        columns.
    """
    # Use len() rather than truthiness so array-like inputs are handled too.
    # The empty check must come first: indexing row 0 of an empty input raises.
    # NOTE(review): the rows-vs-columns guard presumably mirrors scikit-learn's
    # requirement that 'mle' needs n_samples >= n_features - confirm.
    if len(embeddings) == 0 or len(embeddings) < len(embeddings[0]):
        return None

    t1 = time()
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)
    pca = decomposition.PCA(n_components='mle')
    x_red = pca.fit_transform(scaled_embeddings)
    print("PCA mle shape : ", x_red.shape)
    print("time : ", (time() - t1))
    return x_red

In [11]:
def train_tars_classifier(train_utterance_tags, config):
    """Fine-tune the 'tars-base' TARS model on utterance -> tag pairs.

    The mapping is written as three tab-separated, header-less files under
    ./data_files/tars/ (train.csv: first 80% of the rows, test.csv: next 10%,
    dev.csv: remaining rows, all in the mapping's iteration order). The files
    are read back as a CSVClassificationCorpus and used to train a new TARS
    task named "dim"; training artifacts go to the same directory.

    Args:
        train_utterance_tags: dict mapping utterance text to its tag.
        config: not read by this function; kept so existing callers still work.

    Returns:
        None.
    """
    tars_dir = "./data_files/tars/"

    data = pd.DataFrame({'text': list(train_utterance_tags.keys()),
                         'label': list(train_utterance_tags.values())})
    # NOTE(review): '__label__' is prepended to every tag; with a column-mapped
    # CSV corpus it presumably becomes part of the label text - confirm intended.
    data['label'] = '__label__' + data['label'].astype(str)

    # Compute the split boundaries once so the three slices cannot drift apart.
    train_end = int(len(data) * 0.8)
    test_end = int(len(data) * 0.9)
    data.iloc[0:train_end].to_csv(tars_dir + 'train.csv', sep='\t', index=False, header=False)
    data.iloc[train_end:test_end].to_csv(tars_dir + 'test.csv', sep='\t', index=False, header=False)
    data.iloc[test_end:].to_csv(tars_dir + 'dev.csv', sep='\t', index=False, header=False)

    column_name_map = {0: "text", 1: "label"}
    # The files above are written without a header row, so nothing must be
    # skipped here; skipping would silently drop the first sample of each split.
    train_corpus: Corpus = CSVClassificationCorpus(tars_dir,
                                                   column_name_map,
                                                   skip_header=False,
                                                   delimiter='\t')

    label_dictionary = train_corpus.make_label_dictionary()

    tars = TARSClassifier.load('tars-base')
    tars.add_and_switch_to_new_task("dim", label_dictionary=label_dictionary)
    trainer = ModelTrainer(tars, train_corpus)

    # NOTE(review): artifacts are stored under ./data_files/tars/, while
    # load_tars_classifier_miscellaneous reads ./data_files/misc/best-model.pt;
    # confirm how the model gets from one directory to the other.
    trainer.train(base_path=tars_dir,  # path to store the model artifacts
                  learning_rate=0.02,  # use very small learning rate
                  mini_batch_size=1,  # small mini-batch size since corpus is tiny
                  max_epochs=20,  # terminate after 20 epochs
                  train_with_dev=False,
                  )


In [2]:
def embed_utterances(utterances_to_labels, tars_train_dataset, embed_model, config):
    """Train the TARS classifier, embed every utterance and persist the lookups.

    Steps, in order:
      1. call `train_tars_classifier(tars_train_dataset, config)`;
      2. embed the keys of `utterances_to_labels` in batches of
         ``config['siambert_batch_size']`` via `embed_with_siamese_bert`;
      3. pass all embeddings to `reduce_dim` together with
         ``config["dim_reduction_algo"]``;
      4. build 1-based id lookups and pickle them under ./data_files/misc/.

    Returns:
        Tuple ``(ids2prems, ids2embeds, ids2projection_embeds)``.
    """
    utterances = list(utterances_to_labels.keys())
    labels = list(utterances_to_labels.values())
    train_tars_classifier(tars_train_dataset, config)

    # Embed batch by batch; the batch count is a ceiling division.
    batch_size = config['siambert_batch_size']
    batch_count = (len(utterances) + batch_size - 1) // batch_size
    snippet_embeddings_array = []
    for batch_index in range(batch_count):
        start = batch_index * batch_size
        snippet_batch = utterances[start:start + batch_size]
        snippet_embeddings_array.extend(embed_with_siamese_bert(embed_model, snippet_batch))

    projection_embeddings = reduce_dim(snippet_embeddings_array, config["dim_reduction_algo"])

    # Ids are 1-based; `position` is the matching 0-based list index.
    ids2embeds, ids2prems, ids2projection_embeds, prems2ids, tars_ids2labels = {}, {}, {}, {}, {}
    for snippet_id, embed in enumerate(snippet_embeddings_array, start=1):
        position = snippet_id - 1
        utterance = utterances[position]
        ids2embeds[snippet_id] = embed
        ids2projection_embeds[snippet_id] = projection_embeddings[position]
        ids2prems[snippet_id] = utterance
        prems2ids[utterance] = snippet_id
        tars_ids2labels[snippet_id] = labels[position]

    # Persist each lookup table, in this fixed order.
    pickle_targets = (
        ('./data_files/misc/ids2embeds.pickle', ids2embeds),
        ('./data_files/misc/ids2projection_embeds.pickle', ids2projection_embeds),
        ('./data_files/misc/ids2prems.pickle', ids2prems),
        ('./data_files/misc/prems2ids.pickle', prems2ids),
        ('./data_files/misc/ids2labels.pickle', tars_ids2labels),
    )
    for pickle_path, payload in pickle_targets:
        with open(pickle_path, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return ids2prems, ids2embeds, ids2projection_embeds

In [8]:
def cluster_unsupervised(ids2embeds, n_components=66, random_state=None):
    """Cluster embeddings with a Gaussian mixture and rank members per cluster.

    For every mixture component the "center" is the input sample with the
    highest log-density under that component. Each sample assigned to a
    component gets the distance ``1 - cosine_function(sample, center)``
    (0 when the component has fewer than two members), and the members are
    ordered by ascending distance. The result is also pickled to
    ./data_files/misc/clusters.pickle.

    Args:
        ids2embeds: dict mapping a premise id to its embedding vector
            (each vector must expose ``.shape`` and ``.reshape``).
        n_components: number of mixture components; defaults to 66, the value
            that was previously hard-coded.
        random_state: forwarded to GaussianMixture; pass an int to make the
            clustering repeatable across runs.

    Returns:
        dict mapping component index -> {premise_id: distance}, each inner
        dict ordered by ascending distance.
    """
    clusters = {}
    premise_ids = list(ids2embeds.keys())
    x_red = list(ids2embeds.values())

    gmm = GaussianMixture(n_components=n_components, covariance_type='full',
                          random_state=random_state)
    gmm.fit(x_red)
    labels = gmm.predict(x_red)
    # The score used to be computed and thrown away; report it so the
    # (quadratic) computation is actually useful.
    print('silhouette score : {}'.format(
        metrics.silhouette_score(x_red, labels, metric='euclidean')))

    # Pick, for each component, the real sample that is most likely under it.
    centers = np.empty(shape=(gmm.n_components, x_red[0].shape[0]))
    for i in range(gmm.n_components):
        density = scipy.stats.multivariate_normal(mean=gmm.means_[i], cov=gmm.covariances_[i]).logpdf(x_red)
        centers[i, :] = x_red[np.argmax(density)]

    for i in range(gmm.n_components):
        member_indices = [idx for idx, label in enumerate(labels) if label == i]

        print('X_samples shape : {}\n'.format(len(member_indices)))

        # Components with fewer than two members get distance 0.
        is_singleton = len(member_indices) < 2
        sample_distances = {}
        for sample_index in member_indices:
            if is_singleton:
                d = 0
            else:
                d = (1 - cosine_function(x_red[sample_index].reshape(1, -1), centers[i]))[0]
            sample_distances[premise_ids[sample_index]] = d
        clusters[i] = dict(sorted(sample_distances.items(), key=lambda item: item[1]))

    with open('./data_files/misc/clusters.pickle', 'wb') as handle:
        pickle.dump(clusters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return clusters

In [9]:
def load_tars_classifier_miscellaneous(config):
    """Load and return the TARS classifier stored at ./data_files/misc/best-model.pt.

    `config` is accepted but not read.
    """
    return TARSClassifier.load(os.path.join("./data_files/misc/", 'best-model.pt'))

In [10]:
def tag_dimension_to_clusters_miscellaneous(tars, clusters, ids2prems, threshold):
    """Classify the closest snippets of every cluster and summarise the tags.

    For each cluster, snippets are taken in the stored order until the first
    one whose distance exceeds `threshold`; their texts are classified with
    `flair_classify`. Per cluster the output holds the de-duplicated (by
    stripped text) snippet tags and how often each dimension occurs among
    them. One row per kept snippet is appended to
    dimension_tagged_clusters.tsv, and the result is pickled to
    ./data_files/misc/tagged_clusters_output.pickle.

    Args:
        tars: classifier handed through to `flair_classify`.
        clusters: dict cluster label -> {snippet_id: distance}; each inner
            dict is expected to be ordered by ascending distance, since
            iteration stops at the first distance above `threshold`.
        ids2prems: dict snippet id -> snippet text.
        threshold: maximum distance for a snippet to be classified.

    Returns:
        dict str(cluster label) -> {'dimension_frequency': {dimension: count},
        'snippet_dimension_tags': [{snippet_id: (dimension, score)}, ...]},
        with 'dimension_frequency' ordered by descending count.
    """
    dimension_tagged_clusters = {}
    for label, snippet_score_dict in clusters.items():
        snippet_dimension_tags = {}
        snippet_ids = []
        snippet_texts = []
        for snippet_id, dist in snippet_score_dict.items():
            if dist > threshold:
                break
            snippet_ids.append(snippet_id)
            snippet_texts.append(ids2prems[snippet_id])

        # assumes flair_classify returns one (dimension, score) item per input
        # text, in input order - TODO confirm against its definition.
        dimensions = flair_classify(snippet_texts, tars)

        for i, dimension in enumerate(dimensions):
            snippet_dimension_tags[snippet_ids[i]] = (snippet_texts[i], dimension)
        dimension_tagged_clusters[label] = snippet_dimension_tags

    tagged_clusters_output = {}
    tsv_rows = []
    for cluster_label, snippet_dimension_dict in dimension_tagged_clusters.items():
        snippet_dimension_tags = []
        dimension_frequency = {}
        seen_texts = set()  # stripped texts already emitted for this cluster
        # NOTE(review): the sort key is the whole classification item, i.e.
        # (dimension, score) descending - confirm score alone was not intended.
        sorted_snippet_dimensions = sorted(snippet_dimension_dict.items(),
                                           key=lambda x: x[1][1], reverse=True)
        for snippet_id, text_to_dimension_tup in sorted_snippet_dimensions:
            text = ids2prems[snippet_id].strip()

            if text in seen_texts:
                continue
            seen_texts.add(text)

            dimension = text_to_dimension_tup[1][0]
            score = text_to_dimension_tup[1][1]
            snippet_dimension_tags.append({snippet_id: (dimension, score)})

            # Count every occurrence, including the first one (the previous
            # code started at 0, leaving every count one too low).
            dimension_frequency[dimension] = dimension_frequency.get(dimension, 0) + 1

            tsv_rows.append('{}\t{}\t{}\t{}\t{}\n'.format(cluster_label, snippet_id,
                                                          text_to_dimension_tup[0],
                                                          dimension, score))

        tagged_clusters_output[str(cluster_label)] = {
            'dimension_frequency': dict(sorted(dimension_frequency.items(),
                                               key=lambda x: x[1], reverse=True)),
            'snippet_dimension_tags': snippet_dimension_tags,
        }

    # Append all rows with a single open instead of reopening the file per row;
    # nothing is touched when there are no rows, as before.
    if tsv_rows:
        with open('dimension_tagged_clusters.tsv', 'a') as f:
            f.writelines(tsv_rows)

    with open('./data_files/misc/tagged_clusters_output.pickle', 'wb') as handle:
        pickle.dump(tagged_clusters_output, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return tagged_clusters_output