# Cluster Analysis

Use this notebook to analyze one specific cluster model built for the **Predicates Clustering Service** of the **Open Research Knowledge Graph**.

Before running the notebook, please search for the "TODO" comments in the code cells and fill in the values they ask for :)  

# Constants and Utils

In [None]:
# Which model to analyse: the three parts below make up the model's file name.
VECTORIZATION_METHOD = 'scibert'
CLUSTERING_METHOD = 'kmeans'
k = '3150'
# <vectorization>_<clustering>_<k>.pkl
MODEL_PATH = '_'.join([VECTORIZATION_METHOD, CLUSTERING_METHOD, k]) + '.pkl'

# Data files, relative to the working directory.
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './dataset.json'

# TODO: fill in the directory name in your Google Drive where you have your data files
MAIN_DRIVE_DIR = ''

# TODO: fill in your Google Cloud Storage bucket's name
BUCKET = ''

In [None]:
# Mount Google Drive under /content/drive so the data files stored there can be copied.
from google.colab import drive
drive.mount('/content/drive')

# Copy the training set, test set and full dataset from MyDrive/<MAIN_DRIVE_DIR>
# to the same relative paths in the working directory.
# The $NAME tokens are filled in by IPython from the notebook's Python variables.
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TRAINING_SET_PATH $TRAINING_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TEST_SET_PATH $TEST_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$DATA_PATH $DATA_PATH

In [None]:
import pickle
import json

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return the decoded object."""
    with open(input_path, encoding='utf-8') as json_file:
        return json.load(json_file)

def read_pickle(input_path):
    """Unpickle and return the object stored in the file at ``input_path``.

    Note: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(input_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [None]:
# Authenticate the Colab user; presumably required so that gsutil below may read the bucket.
from google.colab import auth
auth.authenticate_user()

# Download the model file from the Cloud Storage bucket into the working directory.
# $BUCKET and $MODEL_PATH are filled in by IPython from the notebook's Python variables.
!gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH

In [None]:
# Load the pickled clustering model; the cells below read its `labels_` attribute.
model = read_pickle(MODEL_PATH)

In [None]:
# min, max, avg papers per cluster
import numpy as np 
import pandas as pd

# `counts[i]` is how many entries of model.labels_ carry the label `unique[i]`.
unique, counts = np.unique(model.labels_, return_counts=True)
# Print the smallest, the largest and the mean cluster size, one per line.
for summarize in (np.min, np.max, np.average):
  print(summarize(counts))

In [None]:
# min, max, avg comparisons per cluster
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

if CLUSTERING_METHOD == 'agglomerative':
  # For the agglomerative model the test instances are appended to the training ones
  # (presumably because that model was fit on both sets; verify against the training code).
  test_json = read_json(TEST_SET_PATH)
  test_df = pd.json_normalize(test_json['instances'])
  # ignore_index=True gives the combined frame a fresh 0..n-1 index, so index labels
  # keep matching row positions (and therefore positions in model.labels_).
  train_df = pd.concat([train_df, test_df], ignore_index=True)

# Attach every row's cluster label (matched by position) as the second column.
# Only the "column already exists" case is tolerated; any other failure, e.g. a length
# mismatch between model.labels_ and the frame, is raised instead of being hidden.
if 'cluster_id' in train_df.columns:
  print('already inserted!')
else:
  train_df.insert(1, 'cluster_id', model.labels_)

# One row per distinct (cluster, comparison) pair, then count the pairs per cluster.
clusters_comparisons = train_df[['cluster_id', 'comparison_id']].drop_duplicates()
unique, counts = np.unique(clusters_comparisons['cluster_id'], return_counts=True)
print(np.min(counts))
print(np.max(counts))
print(np.average(counts))

In [None]:
# find out how the comparisons are distributed over clusters and how pure the distribution is
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

if CLUSTERING_METHOD == 'agglomerative':
  test_json = read_json(TEST_SET_PATH)
  test_df = pd.json_normalize(test_json['instances'])
  # ignore_index=True: without it the test rows keep their own 0..m-1 index labels,
  # which collide with the training rows' labels in the combined frame.
  train_df = pd.concat([train_df, test_df], ignore_index=True)

# Labels aligned with the rows of train_df by position.
row_labels = model.labels_[:train_df.shape[0]]
# cluster label -> distinct comparison ids found in that cluster. This does not depend on
# the comparison being inspected, so it is computed once per cluster and reused.
comparisons_by_cluster = {}

puriteis = []
number_of_clusters = []
weights = []
for comparison_id, number_of_papers in train_df['comparison_id'].value_counts().items():
  # Row *positions* (not index labels) of this comparison's instances, because
  # model.labels_ is a plain array addressed by position.
  paper_indices = np.flatnonzero((train_df['comparison_id'] == comparison_id).to_numpy())
  clusters_labels = model.labels_[paper_indices]
  unique_clusters_labels = np.unique(clusters_labels)
  clusters_comparisons = []
  pure_clusters = 0

  for cluster_label in unique_clusters_labels:
    if cluster_label not in comparisons_by_cluster:
      cluster_instances = train_df.iloc[np.flatnonzero(row_labels == cluster_label)]
      # Keep one row per paper before collecting the comparison ids.
      cluster_instances = cluster_instances.drop_duplicates(subset='paper_id')
      comparisons_by_cluster[cluster_label] = cluster_instances['comparison_id'].unique()
    cluster_comparisons = comparisons_by_cluster[cluster_label]
    clusters_comparisons.extend(cluster_comparisons)
    # A cluster is "pure" when all of its papers belong to a single comparison.
    if len(cluster_comparisons) == 1:
      pure_clusters += 1

  # Share of this comparison's clusters that are pure.
  purity = pure_clusters / len(unique_clusters_labels)
  puriteis.append(purity)
  weights.append(number_of_papers)
  number_of_clusters.append(len(unique_clusters_labels))
  print('comparison {} with {} papers is distributed over {} clusters containing {} comparisons, where {} clusters are pure. - Purity={}'.format(comparison_id, number_of_papers, len(unique_clusters_labels), len(set(clusters_comparisons)), pure_clusters, purity))
  print('comparisons: {}'.format(set(clusters_comparisons)))

# Purity averaged over comparisons, weighted by each comparison's number of papers.
print('Weighted average purity: {:.3f}'.format(np.average(puriteis, weights=weights)))
print('min clusters/comparison', np.min(number_of_clusters))
print('max clusters/comparison', np.max(number_of_clusters))
print('avg clusters/comparison', np.average(number_of_clusters))

In [None]:
# Count the clusters whose papers all fall under the single comparison id 'EMPTY',
# i.e. clusters made up exclusively of uncompared papers.

# Labels restricted to the rows present in train_df.
row_labels = model.labels_[:train_df.shape[0]]

n_empty_clusters = 0
for label in np.unique(model.labels_):
  member_positions = np.flatnonzero(row_labels == label)
  members = train_df.iloc[member_positions].drop_duplicates(subset='paper_id')
  member_comparisons = members['comparison_id'].unique()

  if len(member_comparisons) != 1:
    continue
  if 'EMPTY' in member_comparisons:
    n_empty_clusters += 1

print('#empty clusters: {}'.format(n_empty_clusters))