In [1]:
"""
Multilabel active learning algorithms (query strategies): 
  1) Binary Minimization (BinMin)
     Calculates the uncertainty of each label independently
  2) Maximal Loss Reduction with Maximal Confidence (MMC)
     Uncertainty based upon the difference between predictions from two
     different multilabel classifiers. MMC uses two multilabel classifiers:
     i) binary relevance
     ii) stacked logistic regression
  3) Multilabel Active Learning With Auxiliary Learner (MLALAL)
     As with MMC, uncertainty based upon the difference between predictions 
     from two different multilabel classifiers
  4) Random Sampling (used as a baseline for comparison)
  5) TODO: Adaptive Active Learning (multilabel/adaptive_active_learning.py)

Query Criteria (i.e. options for evaluating the prediction differences)
used for the MMC and MLALAL query strategies:
i) Hamming Loss Reduction (HLR)
ii) Soft Hamming Loss Reduction (SHLR)
iii) Maximum Margin Reduction (MMR)

The Maluuba dataset contains 1,367 conversations, which gives us 19,984
examples. Each of these examples can have up to 20 labels.

"""

import os

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

from word_vec_fn import full_trn_tst, experiments

# Seed NumPy's global random number generator with a fixed value so that
# code drawing from np.random produces the same sequence on every run.
# NOTE(review): presumably the helpers imported from word_vec_fn rely on
# np.random for their sampling/splitting — confirm, otherwise this has no effect.
np.random.seed(0)

In [None]:
# Path to the Maluuba data frames, relative to the notebook's working directory.
data_in = './data/maluuba/data_frames.json'

# Build the three datasets used by the experiments below.
# NOTE(review): test_size=0.25 presumably holds out a quarter of the examples
# for testing, and num_labelled=10 presumably sets how many training examples
# start out labelled — confirm against word_vec_fn.full_trn_tst.
(trn_ds,
 tst_ds,
 fully_labeled_trn_ds) = full_trn_tst(data_in, test_size=0.25, num_labelled=10)



Sentences and labels loaded.


In [None]:
# Query budget passed to experiments(); it also fixes the length of the
# x-axis (1..quota) in the plotting cell.
# NOTE(review): the classifier is presumably retrained after each of these
# queries — confirm in word_vec_fn.experiments.
quota = 10

results = experiments(
    fully_labeled_trn_ds,
    trn_ds,
    tst_ds,
    quota,
)


before qs1
before qs2
before qs3


In [None]:
# x-axis values: the 1-based index of each query, 1..quota inclusive.
query_num = np.arange(1, quota + 1)

# Wide, short canvas so the six curves are easy to compare side by side.
fig, ax = plt.subplots(figsize=(12, 3))

# One curve per query strategy; the format string selects the line colour.
# NOTE(review): 'MMC' and 'Random' are plotted as stored, whereas the
# remaining entries take element [0] — confirm against experiments() that the
# result entries really have different shapes and this is intentional.
ax.plot(query_num, results['MMC'], 'g', label='MMC')
ax.plot(query_num, results['Random'], 'k', label='Random')
ax.plot(query_num, results['Aux_hlr'][0], 'r', label='AuxiliaryLearner_hlr')
ax.plot(query_num, results['Aux_shlr'][0], 'b', label='AuxiliaryLearner_shlr')
ax.plot(query_num, results['Aux_mmr'][0], 'c', label='AuxiliaryLearner_mmr')
ax.plot(query_num, results['BinMin'][0], 'm', label='BinaryMinimization')

# Title, axis labels and legend so the figure stands on its own.
ax.set_title('Experiment Result (Hamming Loss)', fontsize=12)
ax.set_xlabel('Number of Queries', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.legend(fontsize=10)
plt.show()

In [None]:
# 1) How many queries would it take to reach 90% of the (Hamming) loss that we would get if we had access to all labelled samples?
# 2) Add hyperparameters (taken as arguments at the command line) for removal of stop words


