# Training and Evaluation of the Predicates Clustering Service using SciBERT Embeddings
Using this notebook you can simply build either **K-Means** or **Agglomerative** clusters using pre-computed SciBERT embeddings of your dataset and then evaluate your trained models on your test set.

Please store your data files in a Google Drive directory of yours (``MAIN_DRIVE_DIR``) and then provide their paths, as well as the path (``RESULTS_PATH``) where the results should be stored in that directory. Please also provide the name of your Google Cloud Storage bucket so that the models can be uploaded to and downloaded from it.


|       Variable       | Description |
|:--------------------:|:--------------------------------------------------------:|
|``MAIN_DRIVE_DIR`` | Name of your main directory in your Google Drive |
|  `TRAINING_SET_PATH` | Path to your training set inside the `MAIN_DRIVE_DIR`|
|  `TEST_SET_PATH` | Path to your test set inside the `MAIN_DRIVE_DIR`|
|  `DATA_PATH` | Path to your dataset inside the ``MAIN_DRIVE_DIR`` |
|  `SCIBERT_TRAINING_REPRESENTATION` | Path to your training set representations inside the ``MAIN_DRIVE_DIR``. This should be the output file of the notebook ``SciBERT_embeddings``|
|  `SCIBERT_TEST_REPRESENTATION` | Path to your test set representations inside the ``MAIN_DRIVE_DIR``. This should be the output file of the notebook ``SciBERT_embeddings``|
|  `RESULTS_PATH` | Path to the generated results file inside the `MAIN_DRIVE_DIR`|
|  `CLUSTERING_METHOD` | `kmeans` or `agglomerative`|
| `VECTORIZER_PATH` | Name of the pickle file of your vectorizer |
| `MODEL_TEMPLATE_PATH` | Template name of the model files which will be stored in your Google Cloud Storage. The default name has the schema `scibert_{clustering method}_{k}.pkl` |
| `BUCKET` | The name of your Google Cloud Storage bucket |





In [None]:
# Constants
# Local (Colab working directory) paths of the input files; the same relative
# paths are appended to the Drive directory when the files are copied over.
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './dataset.json'
# Placeholder: replace with the name of your directory inside Google Drive.
MAIN_DRIVE_DIR = 'TODO'
# Pre-computed SciBERT representations (loaded with scipy.sparse.load_npz).
SCIBERT_TRAINING_REPRESENTATION = './scibert_training_representations.npz'
SCIBERT_TEST_REPRESENTATION = './scibert_test_representations.npz'
VECTORIZER_PATH = './scibertvectorizer.pkl'
CLUSTERING_METHOD = 'kmeans' # or 'agglomerative'
# The results file name embeds the chosen clustering method.
RESULTS_PATH = './scibert_{}_results.json'.format(CLUSTERING_METHOD)
# Placeholder: replace with the name of your Google Cloud Storage bucket.
BUCKET = 'TODO'
# The second '{}' is left unformatted on purpose: it is filled with the number
# of clusters later via MODEL_TEMPLATE_PATH.format(k).
MODEL_TEMPLATE_PATH = 'scibert_{}_{}.pkl'.format(CLUSTERING_METHOD, '{}')

In [None]:
import json
import pickle
import math

def round_to_next_hundred(x):
    """Round ``x`` up to the nearest multiple of 100 and return it as an int.

    Values that are already a multiple of 100 are returned unchanged.
    """
    hundreds = math.ceil(x / 100.0)
    return 100 * int(hundreds)


def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return its content."""
    with open(input_path, encoding='utf-8') as handle:
        return json.load(handle)

def read_pickle(input_path):
    """Unpickle and return the object stored in the file at ``input_path``.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(input_path, 'rb') as handle:
        return pickle.load(handle)

def write_json(json_data, output_path):
    """Serialize ``json_data`` as JSON (4-space indent) into ``output_path``.

    An existing file at ``output_path`` is overwritten.
    """
    with open(output_path, 'w') as sink:
        json.dump(json_data, sink, indent=4)

def write_pickle(data, output_path):
    """Pickle ``data`` into the file at ``output_path``, overwriting it if present."""
    with open(output_path, 'wb') as sink:
        pickle.dump(data, sink)

In [None]:
## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TRAINING_SET_PATH $TRAINING_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TEST_SET_PATH $TEST_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$DATA_PATH $DATA_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$DATA_PATH $DATA_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/scibert/'$SCIBERT_TRAINING_REPRESENTATION $SCIBERT_TRAINING_REPRESENTATION
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/scibert/'$SCIBERT_TEST_REPRESENTATION $SCIBERT_TEST_REPRESENTATION

# Training

In [None]:
# Import and process the training data
import pandas as pd
from scipy import sparse

# Flatten the 'instances' list of the training JSON into a DataFrame.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])
# Pre-computed representations of the training set, stored as a scipy sparse matrix.
train_df_transformed = sparse.load_npz(SCIBERT_TRAINING_REPRESENTATION)

# Candidate cluster counts: from the number of distinct comparisons up to (but
# excluding) the number of distinct papers, both rounded up to the next
# hundred, in steps of 50.
n_comparisons = round_to_next_hundred(train_df.comparison_id.unique().shape[0])
n_papers = round_to_next_hundred(train_df.paper_id.unique().shape[0])
N_CLUSTERS = range(n_comparisons, n_papers, 50)
print(list(N_CLUSTERS))

# we need to build the clusters on the complete dataset, since the "prediction" in hierarchical clusterings requires re-building the clusters.
if CLUSTERING_METHOD == 'agglomerative':
  test_df_transformed = sparse.load_npz(SCIBERT_TEST_REPRESENTATION)
  # Test rows are appended after the training rows, so a test row's position
  # in the stacked matrix is (number of training rows + its test position).
  train_df_transformed = sparse.vstack((train_df_transformed, test_df_transformed))

# Displayed as the cell output.
train_df_transformed.shape

In [None]:
#checking for optimal number of clusters
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering
from time import time
from google.colab import auth
auth.authenticate_user()
  
for n in N_CLUSTERS:
    t0 = time()
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print(MODEL_PATH)

    if CLUSTERING_METHOD == 'kmeans':
      clustering_model = KMeans(n_clusters=n, random_state=212)
      clustering_model = clustering_model.fit(train_df_transformed)
    elif CLUSTERING_METHOD == 'agglomerative':
      clustering_model = AgglomerativeClustering(n_clusters=n, linkage='ward')
      clustering_model = clustering_model.fit(train_df_transformed.toarray())
    
    print('{0:2f}'.format(time() - t0))
    write_pickle(clustering_model, MODEL_PATH)
    
    # Upload model to bucket
    !gsutil cp {MODEL_PATH} gs://{BUCKET}

# Evaluation

## Vectorizer

In [None]:
# Install the package that provides the SentenceTransformer class used to load SciBERT.
!pip install sentence-transformers

In [None]:
from torch.utils.data import Dataset

class ClusteringDataset(Dataset):
    """Torch dataset exposing the ``text`` column of a DataFrame.

    Each item is the text at the requested position, wrapped in the
    ``[CLS]`` / ``[SEP]`` marker strings.
    """

    def __init__(self, df):
        # DataFrame holding a 'text' column.
        self.data = df

    def __len__(self):
        # One item per DataFrame row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Positional lookup (not label-based) of the requested text.
        text = self.data['text'].iloc[[idx]].values[0]
        return ''.join(('[CLS] ', text, ' [SEP]'))

In [None]:
from sentence_transformers import SentenceTransformer

# Load the 'allenai/scibert_scivocab_uncased' checkpoint as a SentenceTransformer;
# the evaluation loop calls vectorizer.encode(...) to embed the test texts.
vectorizer = SentenceTransformer('allenai/scibert_scivocab_uncased')

## Evaluation Functions

In [None]:
import numpy as np

def predict_comparisons(clustering_model, test_element, test_element_index, train_df):
    """Return the unique comparison ids of the training rows clustered with a test element.

    For ``'kmeans'`` the cluster is obtained by calling ``predict`` on
    ``test_element``. For ``'agglomerative'`` the label is read from
    ``labels_`` at position ``train_df.shape[0] + test_element_index``, and
    only the first ``train_df.shape[0]`` labels are searched for members.

    :param clustering_model: fitted model exposing ``labels_`` (and ``predict`` for k-means)
    :param test_element: vectorized test element passed to ``predict`` (k-means only)
    :param test_element_index: offset of the test element after the training rows (agglomerative only)
    :param train_df: training DataFrame with a ``comparison_id`` column
    :return: array of unique comparison ids found in the selected cluster
    """
    if CLUSTERING_METHOD == 'kmeans':
        predicted_label = clustering_model.predict(test_element)
        same_cluster = clustering_model.labels_ == predicted_label
    elif CLUSTERING_METHOD == 'agglomerative':
        n_train = train_df.shape[0]
        predicted_label = clustering_model.labels_[n_train + test_element_index]
        same_cluster = clustering_model.labels_[:n_train] == predicted_label

    # Positions of the training rows that share the cluster.
    member_positions = np.argwhere(same_cluster).squeeze(1)
    members = train_df.iloc[member_positions]
    return members['comparison_id'].unique()


def map_to_predicates(data, comparison_ids):
    """Collect the distinct predicate ids of the given comparisons.

    :param data: dict with a ``'comparisons'`` list; each comparison has an
        ``'id'`` and a ``'predicates'`` list of dicts with an ``'id'``
    :param comparison_ids: iterable (list, numpy array, ...) of comparison ids to keep
    :return: list of predicate ids without duplicates, in order of first appearance
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(comparison_ids)

    # A dict keeps insertion order, giving ordered de-duplication in O(1) per id
    # instead of scanning a growing list.
    seen = {}
    for comparison in data['comparisons']:
        if comparison['id'] not in wanted:
            continue

        for predicate in comparison['predicates']:
            seen.setdefault(predicate['id'], None)

    return list(seen)

def evaluate_macro(expected, predicted):
    """Return ``[precision, recall, f_measure]`` for one expected/predicted pair."""
    confusion_counts = evaluate_micro(expected, predicted)
    return compute_metrics(confusion_counts)

def evaluate_micro(expected, predicted):
    """
    Count the confusion values for one expected/predicted pair.

    tp: correctly predicted properties --> found in both the expected and predicted sets
    fp: incorrectly predicted properties --> found only in the predicted set
    fn: properties that were not predicted --> found only in the expected set

    Returns a numpy array ``[tp, fp, fn]``.
    """
    expected_set = set(expected)
    predicted_set = set(predicted)

    true_positives = len(expected_set & predicted_set)
    false_positives = len(predicted_set - expected_set)
    false_negatives = len(expected_set - predicted_set)

    return np.array([true_positives, false_positives, false_negatives])


def compute_metrics(confusion_results):
    """Turn ``[tp, fp, fn]`` counts into a numpy array ``[precision, recall, f_measure]``.

    No guard is applied for zero denominators; the division result is passed
    through as-is.
    """
    true_pos, false_pos, false_neg = confusion_results

    p = true_pos / (true_pos + false_pos)
    r = true_pos / (true_pos + false_neg)
    # Harmonic mean of precision and recall.
    f_score = 2 * ((p * r) / (p + r))

    return np.array([p, r, f_score])

def f_measure(precision, recall):
    """Return the F-measure (harmonic mean) of ``precision`` and ``recall``."""
    product = precision * recall
    total = precision + recall
    return 2 * (product / total)


In [None]:
# Import and process the test data
import pandas as pd 

# Full dataset; map_to_predicates reads its 'comparisons' list.
data = read_json(DATA_PATH)

# Training instances are re-loaded here so this cell does not rely on the
# training cells having been run in the same session.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# Test instances, flattened the same way as the training instances.
test_json = read_json(TEST_SET_PATH)
test_df = pd.json_normalize(test_json['instances'])

## Evaluation Loop

In [None]:
import os
from scipy import sparse
from google.colab import auth
auth.authenticate_user()

# vectorizer = read_pickle(VECTORIZER_PATH)

# Evaluation results per model, keyed by the number of clusters (as a string).
results = {}
# Cache of the encoded test texts: filled while evaluating the first model and
# reused for all following ones. 768 columns — presumably the SciBERT embedding
# size; TODO confirm.
vectorized_texts = np.empty((0, 768), dtype=np.float32)
# NOTE(review): only k=3150 is evaluated here; iterating over N_CLUSTERS is
# commented out on the loop line below.
for i, k in enumerate([3150]):#N_CLUSTERS):
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(k)
    print('evaluating model: {}'.format(MODEL_PATH))

    # Download the model from the bucket only if it is not available locally.
    if not os.path.exists(MODEL_PATH):
        !gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH
    clustering_model = read_pickle(MODEL_PATH)

    # macro: one [precision, recall, f_measure] row per test instance.
    # micro: running sum of [tp, fp, fn] over all test instances.
    macro_measures = np.empty((0,3), dtype=np.float32)
    micro_measures = np.zeros(3)
    for test_instance_index, test_instance in test_df.iterrows():
        expected_comparison_id, text = test_instance['comparison_id'], test_instance['text']
        expected = map_to_predicates(data, [expected_comparison_id])

        # transform the texts only once. First iteration takes ~15 minutes
        if i == 0:
            vectorized_text = vectorizer.encode([text])
            vectorized_texts = np.vstack((vectorized_texts, vectorized_text))
        else:
          # NOTE(review): the cache is indexed with the DataFrame index label;
          # this assumes test_df has a default 0..n-1 index — confirm.
          vectorized_text = vectorized_texts[test_instance_index]

        # reshape(1, -1) turns the vector into a single-row 2-D array.
        predicted_comparison_ids = predict_comparisons(clustering_model, vectorized_text.reshape(1, -1), test_instance_index, train_df)
        predicted = map_to_predicates(data, predicted_comparison_ids)

        macro_measures = np.vstack((macro_measures, evaluate_macro(expected, predicted)))
        micro_measures += evaluate_micro(expected, predicted)

    # Average the per-instance metrics, ignoring NaN entries (presumably from
    # zero denominators in compute_metrics).
    macro_measures = np.nanmean(macro_measures, axis=0)
    micro_measures = compute_metrics(micro_measures)
    results[str(k)] = {
        'k': k,
        'macro': {
            'precision': macro_measures[0],
            'recall': macro_measures[1],
            # Macro F is recomputed from the averaged precision and recall,
            # not taken from the averaged per-instance F (macro_measures[2]).
            'f_measure': f_measure(macro_measures[0], macro_measures[1])
        },
        'micro': {
            'precision': micro_measures[0],
            'recall': micro_measures[1],
            'f_measure': micro_measures[2]
        }
    }
    # Persist after every model: write the results file and copy it to Drive.
    write_json(results, RESULTS_PATH)
    !cp $RESULTS_PATH '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/scibert/'$RESULTS_PATH

In [None]:
# Fetch the stored results file from Drive into the working directory.
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/scibert/'$RESULTS_PATH $RESULTS_PATH

# Flatten the per-model result dicts into one DataFrame row per model;
# nested keys become dotted column names (e.g. 'macro.precision').
results = read_json(RESULTS_PATH)
results_df = pd.json_normalize(results.values())
# Displayed as the cell output.
results_df

Unnamed: 0,k,macro.precision,macro.recall,macro.f_measure,micro.precision,micro.recall,micro.f_measure
0,400,0.306195,0.839694,0.448752,0.150197,0.816065,0.253701
1,450,0.305653,0.843269,0.448677,0.152694,0.809686,0.256934
2,500,0.320495,0.840201,0.463997,0.171163,0.809835,0.282597
3,550,0.331861,0.840069,0.475772,0.176141,0.809167,0.289306
4,600,0.33533,0.837539,0.478914,0.187117,0.809241,0.303953
5,650,0.352459,0.832684,0.495277,0.202559,0.780687,0.32166
6,700,0.371371,0.8263,0.512434,0.195432,0.781132,0.312643
7,750,0.370641,0.819199,0.510369,0.19521,0.758733,0.310527
8,800,0.369202,0.810297,0.507272,0.215505,0.75458,0.335261
9,850,0.402385,0.811629,0.538029,0.232339,0.759846,0.355865
