In [None]:
# Constants
# Local working copies of the dataset files (copied from Google Drive in the mount cell).
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './data.json'
# Precomputed SciBERT representations; loaded with scipy.sparse.load_npz in the training section.
SCIBERT_TRAINING_REPRESENTATION = './scibert_training_representations_corrected_with_special_tokens.npz'
SCIBERT_TEST_REPRESENTATION = './scibert_test_representations_with_special_tokens.npz'
# Only referenced by a commented-out line in the visible cells.
VECTORIZER_PATH = './scibertvectorizer.pkl'
# Selects the branch taken by the training loop and by predict_comparisons.
CLUSTERING_METHOD = 'agglomerative' # or 'kmeans'
# The results file name embeds the clustering method.
RESULTS_PATH = './scibert_{}_results_corrected_with_special_tokens.json'.format(CLUSTERING_METHOD)
# Placeholder value; used as the bucket name in the gsutil commands of later cells.
BUCKET = 'TODO'
# The method is filled in here; the remaining '{}' placeholder is later replaced
# with the number of clusters via MODEL_TEMPLATE_PATH.format(n).
MODEL_TEMPLATE_PATH = 'scibert_{}_{}_corrected_with_special_tokens.pkl'.format(CLUSTERING_METHOD, '{}')
# BERT_PATH and MAX_SEQUENCE_LENGTH are not referenced in the visible cells.
BERT_PATH = './scibert_scivocab_uncased'
MAX_SEQUENCE_LENGTH = 512
# Cluster counts to sweep: 200, 250, ..., 2050.
N_CLUSTERS = range(200, 2100, 50)

In [None]:
import json
import pickle

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return the decoded object."""
    with open(input_path, encoding='utf-8') as json_file:
        return json.load(json_file)

def read_pickle(input_path):
    """Unpickle and return the object stored in the binary file at ``input_path``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(input_path, 'rb') as pickle_file:
        return pickle.Unpickler(pickle_file).load()

def write_json(json_data, output_path):
    """Serialize ``json_data`` as JSON (4-space indent) into ``output_path``.

    The file is opened explicitly as UTF-8, mirroring ``read_json``, so the
    written file does not depend on the platform's default text encoding.
    Any existing file at ``output_path`` is overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)

def write_pickle(data, output_path):
    """Pickle ``data`` into the binary file at ``output_path``, overwriting existing content."""
    with open(output_path, 'wb') as pickle_file:
        pickle.Pickler(pickle_file).dump(data)

In [None]:
# Mount Google Drive into the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Copy the dataset files from Drive to the local paths configured in the constants cell.
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/training_set.json' $TRAINING_SET_PATH
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/test_set.json' $TEST_SET_PATH
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/data.json' $DATA_PATH
# Copy the precomputed SciBERT representations. The constant's value (which starts with './')
# is appended to the Drive folder path and is also used as the local destination.
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/scibert/'$SCIBERT_TRAINING_REPRESENTATION $SCIBERT_TRAINING_REPRESENTATION
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/scibert/'$SCIBERT_TEST_REPRESENTATION $SCIBERT_TEST_REPRESENTATION

Mounted at /content/drive


# Training

In [None]:
# Import and process the training data
import pandas as pd
from scipy import sparse

# Flatten the 'instances' records of the training-set JSON into a DataFrame.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])
# Precomputed SciBERT representations stored as a scipy sparse matrix.
# presumably one row per training instance, in the same order as train_df — TODO confirm
train_df_transformed = sparse.load_npz(SCIBERT_TRAINING_REPRESENTATION)

# Agglomerative clustering is fitted on the complete dataset (training rows followed by
# test rows), because "predicting" with a hierarchical clustering would require
# re-building the clusters. predict_comparisons relies on this row order when it looks up
# a test element's label at offset train_df.shape[0] + test_element_index.
if CLUSTERING_METHOD == 'agglomerative':
  test_df_transformed = sparse.load_npz(SCIBERT_TEST_REPRESENTATION)
  train_df_transformed = sparse.vstack((train_df_transformed, test_df_transformed))

# Shown as the cell output.
train_df_transformed.shape

(3138, 768)

In [None]:
#checking for optimal number of clusters
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering
from time import time
from google.colab import auth
auth.authenticate_user()
  
for n in N_CLUSTERS:
    t0 = time()
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print(MODEL_PATH)

    if CLUSTERING_METHOD == 'kmeans':
      clustering_model = KMeans(n_clusters=n, random_state=212)
      clustering_model = clustering_model.fit(train_df_transformed)
    elif CLUSTERING_METHOD == 'agglomerative':
      clustering_model = AgglomerativeClustering(n_clusters=n, linkage='ward')
      clustering_model = clustering_model.fit(train_df_transformed.toarray())
    
    print('{0:2f}'.format(time() - t0))
    write_pickle(clustering_model, MODEL_PATH)
    
    # Upload model to bucket
    !gsutil cp {MODEL_PATH} gs://{BUCKET}

scibert_kmeans_2000_corrected_with_special_tokens.pkl
2523.820448
Copying file://scibert_kmeans_2000_corrected_with_special_tokens.pkl [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/5.9 MiB.                                      
scibert_kmeans_2050_corrected_with_special_tokens.pkl
2596.449377
Copying file://scibert_kmeans_2050_corrected_with_special_tokens.pkl [Content-Type=application/octet-stream]...
\
Operation completed over 1 objects/6.0 MiB.                                      


# Evaluation

## Vectorizer

In [None]:
# Installs sentence-transformers, which provides the SentenceTransformer class used in the Vectorizer section.
# NOTE(review): the version is unpinned — consider pinning it so reruns use the same release.
!pip install sentence-transformers

In [None]:
from torch.utils.data import Dataset

class ClusteringDataset(Dataset):
    """Dataset over a DataFrame that yields its ``text`` column wrapped in special tokens."""

    def __init__(self, df):
        # The DataFrame is stored by reference (not copied); it must have a 'text' column.
        self.data = df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the text at positional index ``idx`` as ``'[CLS] <text> [SEP]'``."""
        text = self.data['text'].iat[idx]
        return '[CLS] ' + text + ' [SEP]'

In [None]:
from sentence_transformers import SentenceTransformer

# Load SciBERT as a sentence encoder. Per the logged output of this cell, no ready-made
# sentence-transformers model exists under this name, so one is created with MEAN pooling.
vectorizer = SentenceTransformer('allenai/scibert_scivocab_uncased')

Downloading:   0%|          | 0.00/437 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/228k [00:00<?, ?B/s]

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/allenai_scibert_scivocab_uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/allenai_scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (ini

## Evaluation Functions

In [None]:
import numpy as np

def predict_comparisons(clustering_model, test_element, test_element_index, train_df):
    """Return the unique comparison ids of training instances in the test element's cluster.

    With ``CLUSTERING_METHOD == 'kmeans'`` the cluster is obtained from
    ``clustering_model.predict(test_element)`` and matched against all of ``labels_``.
    With ``'agglomerative'`` the label is read from ``labels_`` at position
    ``train_df.shape[0] + test_element_index`` and only the first
    ``train_df.shape[0]`` labels are searched for cluster members.
    """
    n_train = train_df.shape[0]

    if CLUSTERING_METHOD == 'kmeans':
        label = clustering_model.predict(test_element)
        member_positions = np.flatnonzero(clustering_model.labels_ == label)
    elif CLUSTERING_METHOD == 'agglomerative':
        label = clustering_model.labels_[n_train + test_element_index]
        member_positions = np.flatnonzero(clustering_model.labels_[:n_train] == label)

    # Positional lookup of the cluster members, then de-duplicate their comparison ids.
    return train_df.iloc[member_positions]['comparison_id'].unique()


def map_to_predicates(data, comparison_ids):
    """Collect the ids of all predicates belonging to the selected comparisons.

    Args:
        data: dict with a ``'comparisons'`` list; every comparison has an ``'id'``
            and a ``'predicates'`` list whose items carry an ``'id'``.
        comparison_ids: iterable of comparison ids to select (e.g. a list or a
            NumPy array). The ids must be hashable.

    Returns:
        list of distinct predicate ids, in order of first appearance.
    """
    # Set lookups replace the repeated linear scans over the id array and the
    # growing result list; the list itself still preserves first-seen order.
    wanted_comparisons = set(comparison_ids)
    seen_predicates = set()
    predicate_ids = []

    for comparison in data['comparisons']:
        if comparison['id'] not in wanted_comparisons:
            continue

        for predicate in comparison['predicates']:
            predicate_id = predicate['id']
            if predicate_id not in seen_predicates:
                seen_predicates.add(predicate_id)
                predicate_ids.append(predicate_id)

    return predicate_ids

def evaluate_macro(expected, predicted):
    """Return the metrics computed by ``compute_metrics`` for one expected/predicted pair."""
    confusion_counts = evaluate_micro(expected, predicted)
    return compute_metrics(confusion_counts)

def evaluate_micro(expected, predicted):
    """
    Count the set-level agreement between expected and predicted properties.

    tp: properties found in both the expected and the predicted set
    fp: properties found only in the predicted set
    fn: properties found only in the expected set

    Duplicates in either input are ignored. Returns ``np.array([tp, fp, fn])``.
    """
    expected_set, predicted_set = set(expected), set(predicted)

    tp = len(expected_set & predicted_set)
    fp = len(predicted_set - expected_set)
    fn = len(expected_set - predicted_set)

    return np.array([tp, fp, fn])


def compute_metrics(confusion_results):
    """Turn ``[tp, fp, fn]`` counts into ``np.array([precision, recall, f_measure])``.

    Args:
        confusion_results: sequence or array of three counts ``(tp, fp, fn)``.

    Returns:
        Float array ``[precision, recall, f_measure]``. A metric whose denominator
        is zero is returned as NaN. This is the value NumPy counts already produced
        (with a RuntimeWarning); plain Python counts used to raise
        ZeroDivisionError instead.
    """
    tp, fp, fn = (float(count) for count in confusion_results)

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN so callers can skip them, e.g. with np.nanmean.
        return numerator / denominator if denominator else float('nan')

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    # A NaN precision or recall propagates into the F-measure.
    f_score = 2 * _ratio(precision * recall, precision + recall)

    return np.array([precision, recall, f_score])

def f_measure(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    When ``precision + recall`` is zero the result is NaN. NumPy scalars already
    produced NaN here (with a RuntimeWarning); plain Python floats used to raise
    ZeroDivisionError. NaN inputs propagate to the result.
    """
    denominator = precision + recall
    if denominator == 0:
        return float('nan')
    return 2 * ((precision * recall) / denominator)


In [None]:
# Import and process the test data
import pandas as pd 

# Full dataset; map_to_predicates reads its 'comparisons' list.
data = read_json(DATA_PATH)

# Repeats the training-set loading done in the Training section.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Flatten the 'instances' records of the test-set JSON into a DataFrame.
test_json = read_json(TEST_SET_PATH)
test_df = pd.json_normalize(test_json['instances'])

## Evaluation Loop

In [None]:
# Evaluate every saved clustering model on the test set and record macro- and
# micro-averaged precision, recall and F-measure per number of clusters k.
import os
from scipy import sparse
from google.colab import auth
auth.authenticate_user()

# Loading a pickled vectorizer is disabled; the `vectorizer` created in the
# Vectorizer section is used instead.
# vectorizer = read_pickle(VECTORIZER_PATH)

results = {}
# Cache of encoded test texts, filled during the first model's pass (i == 0).
# 768 matches the column count shown in the training cell's shape output.
vectorized_texts = np.empty((0, 768), dtype=np.float32)
for i, k in enumerate(N_CLUSTERS):
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(k)
    print('evaluating model: {}'.format(MODEL_PATH))

    # Download the model from the bucket only when no local copy exists.
    if not os.path.exists(MODEL_PATH):
        !gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH
    clustering_model = read_pickle(MODEL_PATH)

    # macro_measures collects one [precision, recall, f_measure] row per test instance;
    # micro_measures accumulates the [tp, fp, fn] counts over all test instances.
    macro_measures = np.empty((0,3), dtype=np.float32)
    micro_measures = np.zeros(3)
    for test_instance_index, test_instance in test_df.iterrows():
        expected_comparison_id, text = test_instance['comparison_id'], test_instance['text']
        expected = map_to_predicates(data, [expected_comparison_id])

        # transform the texts only once. First iteration takes ~15 minutes
        if i == 0:
            vectorized_text = vectorizer.encode([text])
            vectorized_texts = np.vstack((vectorized_texts, vectorized_text))
        else:
          # NOTE(review): indexing the cache by test_instance_index assumes test_df
          # has the default 0..n-1 index — confirm.
          vectorized_text = vectorized_texts[test_instance_index]

        predicted_comparison_ids = predict_comparisons(clustering_model, vectorized_text.reshape(1, -1), test_instance_index, train_df)
        predicted = map_to_predicates(data, predicted_comparison_ids)
        macro_measures = np.vstack((macro_measures, evaluate_macro(expected, predicted)))
        micro_measures += evaluate_micro(expected, predicted)

    # NaN rows/entries are ignored when averaging the per-instance metrics.
    macro_measures = np.nanmean(macro_measures, axis=0)
    # Micro metrics are computed once from the summed counts.
    micro_measures = compute_metrics(micro_measures)
    # The macro f_measure is derived from the averaged precision and recall,
    # not from the averaged per-instance F-measures.
    results[str(k)] = {
        'k': k,
        'macro': {
            'precision': macro_measures[0],
            'recall': macro_measures[1],
            'f_measure': f_measure(macro_measures[0], macro_measures[1])
        },
        'micro': {
            'precision': micro_measures[0],
            'recall': micro_measures[1],
            'f_measure': micro_measures[2]
        }
    }
    # The results file is rewritten and copied to Drive after every k.
    write_json(results, RESULTS_PATH)
    !cp $RESULTS_PATH '/content/drive/MyDrive/master_thesis/predicates_clustering/scibert/'$RESULTS_PATH

In [None]:
# Fetch the results file from Drive into the local working directory.
!cp '/content/drive/MyDrive/master_thesis/predicates_clustering/scibert/'$RESULTS_PATH $RESULTS_PATH

# One row per evaluated k; the nested 'macro' / 'micro' dicts are flattened into
# dotted column names such as 'macro.precision'.
results = read_json(RESULTS_PATH)
results_df = pd.json_normalize(results.values())
# Shown as the cell output.
results_df

Unnamed: 0,k,macro.precision,macro.recall,macro.f_measure,micro.precision,micro.recall,micro.f_measure
0,200,0.309229,0.030849,0.056101,0.198063,0.032203,0.055398
1,250,0.353597,0.030756,0.056591,0.264741,0.032131,0.057307
2,300,0.369355,0.030756,0.056784,0.293272,0.032131,0.057917
3,350,0.389556,0.030424,0.05644,0.312281,0.031845,0.057796
4,400,0.441462,0.02793,0.052537,0.375562,0.029913,0.055412
5,450,0.478539,0.027099,0.051293,0.419816,0.029412,0.054972
6,500,0.494031,0.026268,0.049883,0.424917,0.027336,0.051368
7,550,0.498647,0.026268,0.049906,0.434585,0.027336,0.051437
8,600,0.498647,0.026268,0.049906,0.434585,0.027336,0.051437
9,650,0.498647,0.026268,0.049906,0.434585,0.027336,0.051437


In [None]:
import matplotlib.pyplot as plt

# results_df holds 'k' plus the 'macro.*' / 'micro.*' metric columns. It has no
# inertia column, so the F-measures are plotted against the number of clusters.
fig, ax = plt.subplots()
ax.plot(results_df['k'], results_df['macro.f_measure'], 'bx-', label='macro')
ax.plot(results_df['k'], results_df['micro.f_measure'], 'ro-', label='micro')

ax.set_xlabel('k')
ax.set_ylabel('f_measure')
ax.set_title('F-measure per number of clusters k')
ax.legend()
plt.show()