# Training and Evaluation of the Predicates Clustering Service using TF-IDF Embeddings
Using this notebook you can simply build either **K-Means** or **Agglomerative** clusters using TF-IDF embeddings of your dataset and then evaluate your trained models on your test set.

Please store your data files in a directory of your Google Drive (``MAIN_DRIVE_DIR``), then provide their paths as well as the path (``RESULTS_PATH``) where the results should be stored in that directory. Also provide the name of your Google Cloud Storage bucket so that the models can be uploaded to and downloaded from it.


|       Variable       | Description |
|:--------------------:|:--------------------------------------------------------:|
|``MAIN_DRIVE_DIR`` | Name of your main directory in your Google Drive |
|  `TRAINING_SET_PATH` | Path to your training set inside the `MAIN_DRIVE_DIR `|
|  `TEST_SET_PATH` | Path to your test set inside the `MAIN_DRIVE_DIR `|
|  `DATA_PATH` | Path to your dataset inside the ``MAIN_DRIVE_DIR`` |
|  `RESULTS_PATH` | Path to the generated results file inside the `tfidf/` subdirectory of the `MAIN_DRIVE_DIR` |
|  `CLUSTERING_METHOD` | `kmeans` or `agglomerative`|
| `VECTORIZER_PATH` | Name of the pickle file of your vectorizer |
| `MODEL_TEMPLATE_PATH` | Template name of the model files which will be stored in your Google Cloud Storage. The default name has the schema `tfidf_{clustering method}_{k}.pkl` |
| `BUCKET` | The name of your Google Cloud Storage bucket |





In [None]:
# Constants
# Input files. Each path is used both as the local working copy and, appended to
# MAIN_DRIVE_DIR, as the source location on the mounted Google Drive.
TRAINING_SET_PATH = './training_set.json'
TEST_SET_PATH = './test_set.json'
DATA_PATH = './dataset.json'
# Local pickle of the fitted TF-IDF vectorizer (written during training, read during evaluation).
VECTORIZER_PATH = './tfidfvectorizer.pkl'
# Placeholder: replace with the name of your Google Drive directory.
MAIN_DRIVE_DIR = 'TODO'
CLUSTERING_METHOD = 'kmeans' # or 'agglomerative'
# The result file names embed the chosen clustering method.
RESULTS_PATH = './tfidf_{}_results.json'.format(CLUSTERING_METHOD)
# NOTE(review): EXPECTED_PREDICTED_PATH is not referenced by any cell visible in this notebook.
EXPECTED_PREDICTED_PATH = './tfidf_{}_expected_predicted.json'.format(CLUSTERING_METHOD)
# Placeholder: replace with the name of your Google Cloud Storage bucket.
BUCKET = 'TODO'
# The second '{}' is kept as a literal placeholder; it is filled with the number of
# clusters via MODEL_TEMPLATE_PATH.format(n) when models are saved and loaded.
MODEL_TEMPLATE_PATH = 'tfidf_{}_{}.pkl'.format(CLUSTERING_METHOD, '{}')

In [None]:
"""removes punctuation, stopwords, and returns a list of the remaining words, or tokens"""
# NOTE(review): the string above appears to describe text_process (defined in a
# later cell) rather than this cell, which only imports NLTK and downloads resources.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# 'stopwords' backs stopwords.words('english'); 'wordnet' is the corpus used by
# WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# presumably needed by the WordNet reader in recent NLTK releases — TODO confirm
nltk.download('omw-1.4')

In [None]:
import string
import json
import pickle
import math

def round_to_next_hundred(x):
  """Round ``x`` up to the nearest multiple of 100 and return it as an int.

  Values that already are a multiple of 100 are returned unchanged.
  """
  hundreds = math.ceil(x / 100.0)
  return 100 * int(hundreds)

# Lazily populated cache so the stopword corpus is read once per kernel,
# not once per word.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(text):
    '''
    Normalise a piece of text for TF-IDF vectorisation.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation characters (``string.punctuation``) and all digits
    2. Lowercase the text and split it on whitespace
    3. Remove all English stopwords
    4. Lemmatize the remaining words with ``WordNetLemmatizer``

    Args:
        text: the raw input string.

    Returns:
        A single string with the cleaned, lemmatized words joined by one space
        (not a list of words).

    Note:
        Stopwords are matched after lowercasing. NLTK's stopword list is
        lowercase, so filtering before lowercasing let capitalised stopwords
        such as "The" slip through. A vectorizer fitted with the previous
        behaviour should be re-fitted.
    '''
    lemmatizer = WordNetLemmatizer()
    # Punctuation is dropped without inserting a space, so "don't" becomes "dont".
    kept_chars = [
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    ]
    words = ''.join(kept_chars).lower().split()
    stop_words = _english_stopwords()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

def read_json(input_path):
    """Parse the UTF-8 encoded JSON file at ``input_path`` and return its content."""
    with open(input_path, encoding='utf-8') as source:
        return json.load(source)

def read_pickle(input_path):
    """Unpickle and return the object stored in the binary file at ``input_path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files you trust.
    with open(input_path, 'rb') as source:
        return pickle.load(source)

def write_json(json_data, output_path):
    """Serialise ``json_data`` as JSON with 4-space indentation into ``output_path``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(output_path, 'w') as sink:
        json.dump(json_data, sink, indent=4)

def write_pickle(data, output_path):
    """Pickle ``data`` into the binary file at ``output_path``, replacing existing content."""
    with open(output_path, 'wb') as sink:
        pickle.dump(data, sink)

In [None]:
## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

# Copy the training set, test set and dataset from MAIN_DRIVE_DIR on the mounted
# Drive to the same relative paths in the local working directory.
# NOTE(review): the variables are interpolated unquoted, so a directory or file
# name containing spaces would be split into several arguments by the shell.
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TRAINING_SET_PATH $TRAINING_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$TEST_SET_PATH $TEST_SET_PATH
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/'$DATA_PATH $DATA_PATH

# Training

In [None]:
# Import and process the training data
import pandas as pd

# Flatten the records under 'instances' into a DataFrame and normalise their text.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])
train_df['text'] = train_df['text'].apply(text_process)

# Candidate cluster counts: start at the number of distinct comparisons and stop
# before the number of distinct papers (both rounded up to a multiple of 100),
# in steps of 50. The range is empty if n_papers <= n_comparisons.
n_comparisons = round_to_next_hundred(train_df.comparison_id.unique().shape[0])
n_papers = round_to_next_hundred(train_df.paper_id.unique().shape[0])
N_CLUSTERS = range(n_comparisons, n_papers, 50)
print(list(N_CLUSTERS))


# The agglomerative path also needs the preprocessed test set here, because the
# vectorisation cell stacks it under the training matrix before clustering.
if CLUSTERING_METHOD == 'agglomerative':
  test_json = read_json(TEST_SET_PATH)
  test_df = pd.json_normalize(test_json['instances'])
  test_df['text'] = test_df['text'].apply(text_process)

In [None]:
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary (unigrams and bigrams) on the training text only and
# persist the fitted vectorizer so the evaluation cells can reload it.
vectorizer = TfidfVectorizer(ngram_range=(1,2)).fit(train_df['text'])
write_pickle(vectorizer, VECTORIZER_PATH)

train_df_transformed = vectorizer.transform(train_df['text'])

# we need to build the clusters on the complete dataset, since the "prediction" in hierarchical clusterings requires re-building the clusters.
# After this branch train_df_transformed holds the training rows followed by the
# test rows; the agglomerative evaluation relies on exactly this row order.
if CLUSTERING_METHOD == 'agglomerative':
  test_df_transformed = vectorizer.transform(test_df['text'])
  train_df_transformed = sparse.vstack((train_df_transformed, test_df_transformed))

# Displayed as the cell output: (number of rows, vocabulary size).
train_df_transformed.shape

In [None]:
#checking for optimal number of clusters
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering
from time import time
from google.colab import auth
auth.authenticate_user()

for n in N_CLUSTERS:
    t0 = time()
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print(MODEL_PATH)

    if CLUSTERING_METHOD == 'kmeans':
      clustering_model = KMeans(n_clusters=n, random_state=212)
      clustering_model = clustering_model.fit(train_df_transformed)
    elif CLUSTERING_METHOD == 'agglomerative':
      clustering_model = AgglomerativeClustering(n_clusters=n, linkage='ward')
      clustering_model = clustering_model.fit(train_df_transformed.toarray())

    print('{0:2f}'.format(time() - t0))
    write_pickle(clustering_model, MODEL_PATH)

    # Upload model to bucket
    !gsutil cp {MODEL_PATH} gs://{BUCKET}

# Evaluation

In [None]:
import numpy as np

def predict_comparisons(clustering_model, test_element, test_element_index, train_df):
    """Return the comparison ids of the training instances sharing the test element's cluster.

    The behaviour depends on the module-level ``CLUSTERING_METHOD``:

    * ``'kmeans'``: the cluster of ``test_element`` is obtained with
      ``clustering_model.predict`` and matched against all fitted labels.
    * ``'agglomerative'``: the label is read from ``clustering_model.labels_`` at
      position ``len(train_df) + test_element_index`` and matched against the first
      ``len(train_df)`` labels. This relies on the model having been fitted on the
      training rows followed by the test rows.

    Args:
        clustering_model: fitted clustering model exposing ``labels_``.
        test_element: vectorised test instance (used for K-Means only).
        test_element_index: position of the test instance (used for agglomerative only).
        train_df: training DataFrame with a ``comparison_id`` column.

    Returns:
        The unique ``comparison_id`` values of the matching training rows.
    """
    fitted_labels = clustering_model.labels_

    if CLUSTERING_METHOD == 'kmeans':
        label = clustering_model.predict(test_element)
        member_mask = fitted_labels == label
    elif CLUSTERING_METHOD == 'agglomerative':
        n_train = train_df.shape[0]
        label = fitted_labels[n_train + test_element_index]
        member_mask = fitted_labels[:n_train] == label

    # argwhere yields an (m, 1) array of positions; drop the trailing axis.
    member_rows = np.squeeze(np.argwhere(member_mask), axis=1)
    return train_df.iloc[member_rows]['comparison_id'].unique()


def map_to_predicates(data, comparison_ids):
    """Collect the predicate ids of the given comparisons, without duplicates.

    Args:
        data: dataset dict with a ``'comparisons'`` list; each comparison has an
            ``'id'`` and a ``'predicates'`` list whose items carry an ``'id'``.
        comparison_ids: container of comparison ids to look up (membership is
            tested with ``in``).

    Returns:
        A list of predicate ids in first-seen order.
    """
    # A dict keeps insertion order and gives constant-time duplicate checks,
    # instead of scanning a growing list for every predicate.
    seen_predicates = {}

    for comparison in data['comparisons']:
        if comparison['id'] not in comparison_ids:
            continue

        for predicate in comparison['predicates']:
            seen_predicates.setdefault(predicate['id'], None)

    return list(seen_predicates)

def evaluate_macro(expected, predicted):
    """Return ``[precision, recall, f_measure]`` for a single expected/predicted pair.

    The confusion counts come from ``evaluate_micro`` and are converted to
    metrics by ``compute_metrics``.
    """
    confusion_counts = evaluate_micro(expected, predicted)
    return compute_metrics(confusion_counts)

def evaluate_micro(expected, predicted):
    """
    Count the confusion entries between the expected and predicted properties.

    tp: properties found in both the expected and the predicted set
    fp: properties found only in the predicted set
    fn: properties found only in the expected set

    Duplicates are ignored because both inputs are compared as sets.
    Returns a numpy array ``[tp, fp, fn]``.
    """
    expected_set, predicted_set = set(expected), set(predicted)

    true_positives = expected_set & predicted_set
    false_positives = predicted_set - expected_set
    false_negatives = expected_set - predicted_set

    return np.array([len(true_positives), len(false_positives), len(false_negatives)])


def compute_metrics(confusion_results):
    """Turn ``[tp, fp, fn]`` counts into ``[precision, recall, f_measure]``.

    Args:
        confusion_results: sequence of three counts ``(tp, fp, fn)``.

    Returns:
        A numpy array ``[precision, recall, f_measure]``. With numpy inputs a
        zero denominator yields ``nan`` (plus a RuntimeWarning) rather than an
        exception; the evaluation loop averages these values with ``np.nanmean``.
    """
    tp, fp, fn = confusion_results

    predicted_total = tp + fp
    expected_total = tp + fn

    precision = tp / predicted_total
    recall = tp / expected_total
    harmonic_ratio = (precision * recall) / (precision + recall)

    return np.array([precision, recall, 2 * harmonic_ratio])

In [None]:
# Import and process the test data
import pandas as pd

# The training frame is reloaded without text preprocessing: the evaluation only
# reads its 'comparison_id' column and its number of rows.
train_json = read_json(TRAINING_SET_PATH)
train_df = pd.json_normalize(train_json['instances'])

# The test text gets the same preprocessing that was applied to the training text
# before the vectorizer was fitted.
test_json = read_json(TEST_SET_PATH)
test_df = pd.json_normalize(test_json['instances'])
test_df['text'] = test_df['text'].apply(text_process)

In [None]:
import os
from google.colab import auth
auth.authenticate_user()

data = read_json(DATA_PATH)
vectorizer = read_pickle(VECTORIZER_PATH)

results = {}
for n in N_CLUSTERS:
    MODEL_PATH = MODEL_TEMPLATE_PATH.format(n)
    print('evaluating model: {}'.format(MODEL_PATH))

    if not os.path.exists(MODEL_PATH):
        !gsutil cp gs://$BUCKET/$MODEL_PATH $MODEL_PATH

    try:    
      clustering_model = read_pickle(MODEL_PATH)
    except:
      continue

    macro_measures = np.empty((0,3), float)
    micro_measures = np.zeros(3)
    for test_instance_index, test_instance in test_df.iterrows():
        expected_comparison_id, text = test_instance['comparison_id'], test_instance['text']
        expected = map_to_predicates(data, [expected_comparison_id])
        vectorized_text = vectorizer.transform([text])

        predicted_comparison_ids = predict_comparisons(clustering_model, vectorized_text, test_instance_index, train_df)
        predicted = map_to_predicates(data, predicted_comparison_ids)
        macro_measures = np.vstack((macro_measures, evaluate_macro(expected, predicted)))
        micro_measures += evaluate_micro(expected, predicted)
    
    macro_measures = np.nanmean(macro_measures, axis=0)
    micro_measures = compute_metrics(micro_measures)
    results[str(n)] = {
        'k': n,
        'macro': {
            'precision': macro_measures[0],
            'recall': macro_measures[1],
            'f_measure': macro_measures[2]
        },
        'micro': {
            'precision': micro_measures[0],
            'recall': micro_measures[1],
            'f_measure': micro_measures[2]
        }
    }
    write_json(results, RESULTS_PATH)
    !cp $RESULTS_PATH '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/tfidf/'$RESULTS_PATH

In [None]:
# Restore the results file from the 'tfidf' folder of MAIN_DRIVE_DIR on Google Drive,
# overwriting the local copy.
!cp '/content/drive/MyDrive/'$MAIN_DRIVE_DIR'/tfidf/'$RESULTS_PATH $RESULTS_PATH


# One row per evaluated k; the nested 'macro'/'micro' dicts are flattened into
# dotted column names such as 'macro.precision'.
results = read_json(RESULTS_PATH)
results_df = pd.json_normalize(results.values())
results_df

Unnamed: 0,k,macro.precision,macro.recall,macro.f_measure,micro.precision,micro.recall,micro.f_measure
0,400,0.269173,0.845144,0.481503,0.062049,0.823111,0.115399
1,450,0.26224,0.807123,0.467562,0.046184,0.773715,0.087165
2,500,0.274906,0.804626,0.48694,0.084478,0.776533,0.152378
3,550,0.287113,0.796953,0.500554,0.069442,0.75903,0.127243
4,600,0.278936,0.773071,0.480268,0.05612,0.734332,0.104272
5,650,0.288118,0.765863,0.494907,0.051145,0.751613,0.095773
6,700,0.334216,0.788092,0.55027,0.089523,0.742416,0.159779
7,750,0.298055,0.766716,0.504257,0.050133,0.744419,0.09394
8,800,0.319702,0.755123,0.5367,0.077119,0.726322,0.139433
9,850,0.319544,0.741956,0.53118,0.072618,0.705184,0.131677
