In [1]:
import sys
sys.path.append('../..')

In [2]:
import re
import enum
import numpy as np

from spear.labeling import labeling_function, LFSet, ABSTAIN, preprocessor
from examples.TREC.preprocessor import convert_to_lower

In [3]:
# Short coarse-class tags mapped to the names of the ClassLabels members below.
label_map = dict(
    DESC="DESCRIPTION",
    ENTY="ENTITY",
    HUM="HUMAN",
    ABBR="ABBREVIATION",
    LOC="LOCATION",
    NUM="NUMERIC",
)


class ClassLabels(enum.Enum):
    """The six coarse question classes, each with its integer class id."""

    DESCRIPTION = 0
    ENTITY = 1
    HUMAN = 2
    ABBREVIATION = 3
    LOCATION = 4
    NUMERIC = 5

In [4]:
def load_rules(file_name='rules.txt'):
    """Build the TREC labeling-function set from a tab-separated rule file.

    Every non-blank line must hold a coarse label tag (a key of ``label_map``)
    followed by a tab and a regular-expression pattern. One labeling function
    is created per rule; it returns the rule's ``ClassLabels`` member when the
    pattern matches the (pre-processed) input and ``ABSTAIN`` otherwise.

    Args:
        file_name: Path of the rule file. It is read with latin1 encoding.

    Returns:
        The ``LFSet`` named "TREC_LFS" holding the labeling functions, named
        "rule0", "rule1", ... in file order.

    Raises:
        KeyError: If a line's tag is not a key of ``label_map``.
        IndexError: If a non-blank line has no tab-separated pattern.
    """
    rules = LFSet("TREC_LFS")

    # The handle is deliberately not called `f`: the inner function used to
    # carry that name and rebound the handle while the file was being read.
    with open(file_name, 'r', encoding='latin1') as rule_file:
        rule_index = 0
        for line in rule_file:
            fields = line.strip().split("\t")
            if fields == ['']:
                # Blank line (e.g. trailing newline at end of file): no rule.
                continue
            label = ClassLabels[label_map[fields[0]]]
            pattern = fields[1]
            rule_name = "rule" + str(rule_index)

            # Pattern and label are handed over through `resources`, so each
            # labeling function keeps its own values rather than closing over
            # the loop variables.
            @labeling_function(name=rule_name,resources=dict(pattern=pattern,output=label),pre=[convert_to_lower],label=label)
            def match_rule(x, **kwargs):
                # re.search is truthy exactly when re.findall would return a
                # non-empty list, without collecting every match.
                if re.search(kwargs["pattern"], x):
                    return kwargs["output"]
                return ABSTAIN

            rules.add_lf(match_rule)
            rule_index += 1
    return rules

# Build the labeling-function set from the default 'rules.txt', resolved
# relative to the current working directory.
rules = load_rules()

In [5]:
# Integer class id for every coarse class name.
LABEL_DICT = {"DESCRIPTION": 0, "ENTITY": 1, "HUMAN": 2, "ABBREVIATION": 3, "LOCATION": 4, "NUMERIC": 5}

def load_data(mode):
    """Read ``<mode>.txt`` and return a list of ``(sentence, label_id)`` pairs.

    Each non-blank line must start with a ``TAG:...`` token whose part before
    the first colon is one of DESC, ENTY, HUM, ABBR, LOC or NUM. The label id
    is looked up in ``LABEL_DICT`` via the class name of that tag.

    Sentence extraction differs by mode (kept exactly as before):
      * ``mode == "test"``: the first whitespace-separated token is dropped
        and the remaining tokens are joined with single spaces; case is kept.
      * any other mode: everything after the first colon is kept (further
        colons become spaces), lower-cased and stripped.
    NOTE(review): the two branches treat the text after the tag differently;
    confirm against the actual file formats whether that is intended.

    Args:
        mode: File stem; the file ``mode + '.txt'`` is read as latin1.

    Returns:
        List of ``(sentence, label_id)`` tuples in file order.

    Raises:
        KeyError: If a line's tag is not one of the six known tags.
    """
    # Tag -> class name. This used to map tags straight to ints while the
    # result was still used to index LABEL_DICT (string keys), so every line
    # raised KeyError.
    tag_to_name = {"DESC": "DESCRIPTION", "ENTY": "ENTITY", "HUM": "HUMAN",
                   "ABBR": "ABBREVIATION", "LOC": "LOCATION", "NUM": "NUMERIC"}
    data = []

    with open(mode + '.txt', 'r', encoding='latin1') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to label.
                continue
            label = LABEL_DICT[tag_to_name[tokens[0].split(":")[0]]]
            if mode == "test":
                sentence = " ".join(tokens[1:])
            else:
                sentence = (" ".join(line.split(":")[1:])).lower().strip()
            data.append((sentence, label))
    return data

In [6]:
from spear.labeling import preprocessor

# NOTE(review): this rebinds the `convert_to_lower` name that an earlier cell
# imports from examples.TREC.preprocessor; load_rules() has already been
# called by then when the cells run top to bottom.
@preprocessor()
def convert_to_lower(x):
    """Return ``x`` lower-cased, with leading and trailing whitespace removed."""
    lowered = x.lower()
    return lowered.strip()

In [7]:
from spear.labeling import PreLabels
from notebooks.TREC.utils import load_data_to_numpy

# Load the validation, test and training files. Each call is unpacked into
# three arrays; judging by the names these are presumably the raw inputs,
# their feature vectors and the gold labels — confirm against
# notebooks.TREC.utils.load_data_to_numpy.
X_V, X_feats_V, Y_V = load_data_to_numpy(file_name='valid.txt')
X_T, X_feats_T, Y_T = load_data_to_numpy(file_name='test.txt')
X, X_feats, Y = load_data_to_numpy(file_name='train.txt')

In [8]:
# The first 100 training examples form the labeled set (L) ...
X_L, X_feats_L, Y_L = X[:100], X_feats[:100], Y[:100]
# ... and all remaining ones form the unlabeled set (U).
X_U, X_feats_U, Y_U = X[100:], X_feats[100:], Y[100:]

In [9]:
# Sanity check: sizes of the labeled, unlabeled, validation and test splits.
X_L.shape, X_U.shape, X_V.shape, X_T.shape

((100,), (5352,), (500,), (500,))

In [10]:
# X, X_feats, Y = load_data_to_numpy()
# Y = np.array([ClassLabels[x].value for x in Y])

# trec_noisy_labels = PreLabels(name="trec",
#                                data=X,
#                                gold_labels=Y,
#                                data_feats=X_feats,
#                                rules=rules,
#                                labels_enum=ClassLabels,
#                                num_classes=6)
# L,S = trec_noisy_labels.get_labels()

In [11]:
# from helper.utils import load_data_to_numpy, get_various_data

# X_V = 

# X, X_feats, Y = load_data_to_numpy()

# validation_size = 152
# test_size = 500
# L_size = 100
# U_size = 4700
# n_lfs = len(rules.get_lfs())

# X_V, Y_V, X_feats_V,_, X_T, Y_T, X_feats_T,_, X_L, Y_L, X_feats_L,_, X_U, X_feats_U,_ = get_various_data(X, Y,\
#     X_feats, n_lfs, validation_size, test_size, L_size, U_size)

In [10]:
# JSON file written by PreLabels.generate_json and read back by get_classes.
path_json = "data_pipeline/trec_json.json"

# One pickle file per data split.
# Validation data - have true labels.
V_path_pkl = "data_pipeline/trec_pickle_V.pkl"
# Test data - have true labels.
T_path_pkl = "data_pipeline/trec_pickle_T.pkl"
# Labeled data - have true labels.
L_path_pkl = "data_pipeline/trec_pickle_L.pkl"
# Unlabelled data - don't have true labels.
U_path_pkl = "data_pipeline/trec_pickle_U.pkl"

# Log files, one per algorithm (CAGE and JL).
log_path_cage_1 = "log/trec_cage_log_1.txt"
log_path_jl_1 = "log/trec_jl_log_1.txt"

In [11]:
from spear.labeling import PreLabels

def _make_trec_prelabels(data, data_feats, gold_labels=None):
    """Create the ``PreLabels`` object for one TREC split.

    Args:
        data: Raw inputs of the split.
        data_feats: Feature vectors of the split.
        gold_labels: True labels of the split, or ``None`` for a split without
            labels. When ``None`` the argument is not passed at all, so
            ``PreLabels`` falls back to its own default.

    Returns:
        A ``PreLabels`` instance named "trec" using the notebook-level
        ``rules`` and ``ClassLabels`` with 6 classes.
    """
    kwargs = dict(name="trec",
                  data=data,
                  data_feats=data_feats,
                  rules=rules,
                  labels_enum=ClassLabels,
                  num_classes=6)
    if gold_labels is not None:
        kwargs["gold_labels"] = gold_labels
    return PreLabels(**kwargs)


# (data, features, gold labels, output pickle) for each split, in the order
# validation, test, labeled, unlabeled. The unlabeled split is built without
# gold labels.
_trec_splits = [
    (X_V, X_feats_V, Y_V, V_path_pkl),
    (X_T, X_feats_T, Y_T, T_path_pkl),
    (X_L, X_feats_L, Y_L, L_path_pkl),
    (X_U, X_feats_U, None, U_path_pkl),
]

_json_written = False
for _data, _feats, _gold, _pkl_path in _trec_splits:
    # `trec_noisy_labels` ends up bound to the last (unlabeled) split.
    trec_noisy_labels = _make_trec_prelabels(_data, _feats, _gold)
    trec_noisy_labels.generate_pickle(_pkl_path)
    if not _json_written:
        # Generating the json file once is enough; it is written right after
        # the first (validation) pickle.
        trec_noisy_labels.generate_json(path_json)
        _json_written = True

100%|██████████| 500/500 [00:00<00:00, 2652.77it/s]
100%|██████████| 500/500 [00:00<00:00, 4235.56it/s]
100%|██████████| 100/100 [00:00<00:00, 3447.73it/s]
100%|██████████| 5352/5352 [00:01<00:00, 4105.45it/s]


In [12]:
from spear.utils import get_data, get_classes

# Read the unlabeled-split pickle back to sanity-check what was written.
data_U = get_data(path = U_path_pkl, check_shapes=True)
# With check_shapes=True, get_data asserts that the arrays stored in the
# pickle file have mutually consistent shapes.
# The indices below (0, 1, 6, 9) are positions in the list returned by
# get_data; their meaning is as stated in the printed captions.
print("Number of elements in data list: ", len(data_U))
print("Shape of feature matrix: ", data_U[0].shape)
print("Shape of labels matrix: ", data_U[1].shape)
print("Shape of continuous scores matrix : ", data_U[6].shape)
print("Total number of classes: ", data_U[9])

# Read the class dictionary back from the JSON file.
classes = get_classes(path = path_json)
print("Classes dictionary in json file(modified to have integer keys): ", classes)

Number of elements in data list:  10
Shape of feature matrix:  (5352, 1024)
Shape of labels matrix:  (5352, 68)
Shape of continuous scores matrix :  (5352, 68)
Total number of classes:  6
Classes dictionary in json file(modified to have integer keys):  {0: 'DESCRIPTION', 1: 'ENTITY', 2: 'HUMAN', 3: 'ABBREVIATION', 4: 'LOCATION', 5: 'NUMERIC'}


In [13]:
# from spear.JL import JL

# n_features = 1024
# n_hidden = 512
# feature_model = 'lr'

# jl = JL(path_json = path_json, n_lfs = n_lfs, n_features = n_features, n_hidden = n_hidden, \
#         feature_model = feature_model)

In [14]:
from spear.jl import JL

# Mask over the loss functions that make up the JL training loss: put a 0 at
# a position to leave that loss function out of the final loss used in
# training. Refer to the documentation (spear.JL.core.JL) to see which index
# of loss_func_mask refers to which loss function.
# Note: this all-ones mask may not be the optimal mask for the TREC dataset.
loss_func_mask = [1,1,1,1,1,1,1]

# Training hyperparameters.
batch_size = 150
lr_fm = 0.0005
lr_gm = 0.01
use_accuracy_score = False

# Model configuration. n_features matches the width of the feature matrix
# reported when the unlabeled pickle was inspected (1024 columns).
feature_model = 'nn'
n_features = 1024
n_hidden = 512
n_lfs = len(rules.get_lfs())

jl = JL(
    path_json=path_json,
    n_lfs=n_lfs,
    n_features=n_features,
    n_hidden=n_hidden,
    feature_model=feature_model,
)

# Train on the four pickled splits; with return_gm=True the call returns two
# probability arrays (unpacked as probs_fm and probs_gm).
probs_fm, probs_gm = jl.fit_and_predict_proba(
    path_L=L_path_pkl,
    path_U=U_path_pkl,
    path_V=V_path_pkl,
    path_T=T_path_pkl,
    loss_func_mask=loss_func_mask,
    batch_size=batch_size,
    lr_fm=lr_fm,
    lr_gm=lr_gm,
    use_accuracy_score=use_accuracy_score,
    path_log=log_path_jl_1,
    return_gm=True,
    n_epochs=100,
    start_len=7,
    stop_len=10,
    is_qt=True,
    is_qc=True,
    qt=0.9,
    qc=0.85,
    metric_avg='macro',
)

# Index of the highest-probability column in each row of probs_fm.
labels = np.argmax(probs_fm, 1)
print("probs_fm shape: ", probs_fm.shape)
print("probs_gm shape: ", probs_gm.shape)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  3%|▎         | 3/100 [01:55<1:02:06, 38.42s/it]


KeyboardInterrupt: 