In [None]:
import numpy as np 
from scipy.sparse import dok_matrix, vstack, csr_matrix
from scipy.sparse import dok_matrix, vstack, csr_matrix
from snorkel.labeling import PandasLFApplier, LFApplier, LFAnalysis, labeling_function

In [256]:
# Read the tab-separated training file (token in the first column, IOB tag in
# the last column, blank line between sentences) into:
#   sents    - list of sentences, each a list of lower-cased tokens
#   iob_tags - flat list of lower-cased tags, one per token, in file order
#   text     - all first-column values joined with a leading space each
sents = []
iob_tags = []
filename = './re3d/train.txt'
write_filename = "./re3d/sentences.txt"

# Pieces of `text` are collected here and joined once, instead of growing a
# string with `+=` on every line.
text_parts = []

with open(filename, 'r') as file:
    lst = []
    # Iterating the file stops at the real end of file; an empty first column
    # can no longer be mistaken for EOF and cut the dataset short.
    for line in file:
        word_lst = line.lower().split("\t")
        word = word_lst[0]
        if not word:
            # Malformed row (line starts with a tab): skip it so tokens and
            # tags stay aligned.
            continue
        text_parts.append(" " + word)

        if word == '\n':
            # Blank line = sentence boundary. Empty sentences are not stored.
            if lst:
                sents.append(lst)
            lst = []
        else:
            lst.append(word)
            iob_tags.append(word_lst[-1].split('\n')[0])

    # Flush the last sentence when the file does not end with a blank line.
    if lst:
        sents.append(lst)

text = "".join(text_parts)

# The `with` block closes the file; no explicit close() is needed.
with open(write_filename, 'w') as f:
    f.write(text)

### Distinct Types of IOB Tokens

In [257]:
# Collect the distinct IOB tags that occur in the training data.
dist = set(iob_tags)

print(dist)

{'i-weapon', 'b-documentreference', 'i-militaryplatform', 'b-militaryplatform', 'b-weapon', 'b-money', 'b-temporal', 'i-location', 'b-nationality', 'b-organisation', 'i-temporal', 'o', 'i-quantity', 'b-quantity', 'i-person', 'i-money', 'i-organisation', 'i-documentreference', 'b-person', 'i-nationality', 'b-location'}


In [258]:
# Sanity check: the number of tokens stored in `sents` must equal the number of
# tags stored in `iob_tags`. The counts are printed and then enforced.
print('Number of iob tags are: ',len(iob_tags))

l = sum(len(sentence) for sentence in sents)

print('Number of tokens are: ',l)

assert l == len(iob_tags), "token count and IOB tag count differ"

Number of iob tags are:  20030
Number of tokens are:  20030


In [4]:
# converting iob tags to numerical tags
# Integer id of every IOB tag; the labeling functions refer to these names.
o = 0
b_org = 1
i_org = 2
b_quan = 3
i_quan = 4
b_weapon = 5
i_weapon = 6
b_doc = 7
i_doc = 8
b_person = 9
i_person = 10
b_nationality = 11
i_nationality = 12
b_money = 13
i_money = 14
b_temporal = 15
i_temporal = 16
b_loc = 17
i_loc = 18
b_milplat = 19
i_milplat = 20

# Tag string -> integer id.
iob_dict = {
    "o" : o,

    # Short aliases (kept so existing lookups by these names keep working).
    "b_org" : b_org,
    "i_org" : i_org,
    "b_quan" : b_quan,
    "i_quan" : i_quan,
    "b_weapon" : b_weapon,
    "i_weapon" : i_weapon,
    "b_doc" : b_doc,
    "i_doc" : i_doc,
    "b_person" : b_person,
    "i_person" : i_person,
    "b_nationality" : b_nationality,
    "i_nationality" : i_nationality,
    "b_money" : b_money,
    "i_money" : i_money,
    "b_temporal" : b_temporal,
    "i_temporal" : i_temporal,
    "b_loc" : b_loc,
    "i_loc" : i_loc,
    "b_milplat" : b_milplat,
    "i_milplat" : i_milplat,

    # Tag spellings as they appear (lower-cased) in the training file. Without
    # these entries looking up a tag read from the data raises KeyError.
    "b-organisation" : b_org,
    "i-organisation" : i_org,
    "b-quantity" : b_quan,
    "i-quantity" : i_quan,
    "b-weapon" : b_weapon,
    "i-weapon" : i_weapon,
    "b-documentreference" : b_doc,
    "i-documentreference" : i_doc,
    "b-person" : b_person,
    "i-person" : i_person,
    "b-nationality" : b_nationality,
    "i-nationality" : i_nationality,
    "b-money" : b_money,
    "i-money" : i_money,
    "b-temporal" : b_temporal,
    "i-temporal" : i_temporal,
    "b-location" : b_loc,
    "i-location" : i_loc,
    "b-militaryplatform" : b_milplat,
    "i-militaryplatform" : i_milplat,
}

# Translate every tag string into its integer id and store the ids as an array.
num_iob_tags = np.array([iob_dict[tag] for tag in iob_tags])

In [5]:
# helper functions
def dict_match(sentence, dictionary, max_ngrams=4):
   """Mark the tokens of ``sentence`` that belong to an n-gram found in ``dictionary``.

   Every contiguous run of 1..``max_ngrams`` tokens is joined with single
   spaces and looked up in ``dictionary`` with ``in``.

   Args:
      sentence: sequence of token strings.
      dictionary: container of space-joined terms supporting ``in``.
      max_ngrams: maximum number of tokens in a candidate n-gram.

   Returns:
      dict mapping token index -> 1 for every token covered by a match.
   """
   m = {}
   n_tokens = len(sentence)
   for i in range(n_tokens):
      # j is the exclusive end of the slice sentence[i:j].
      for j in range(i + 1, min(n_tokens, i + max_ngrams) + 1):
         term = ' '.join(sentence[i:j])
         if term in dictionary:
            # The match covers indices i..j-1; range(i, j + 1) would also mark
            # the token after the match (or an index past the sentence end).
            m.update({idx: 1 for idx in range(i, j)})
   return m
           
         
def create_token_L_mat(Xs, Ls, num_lfs):
   """
   Create token-level LF matrix from LFs indexed by sentence

   Args:
      Xs: list of sentences; only ``len(Xs[k])`` (the token count) is used.
      Ls: ``Ls[k][j]`` is the output of labeling function ``j`` on sentence
         ``k``: a dict mapping token index -> label, or ``None`` when the
         function produced nothing.
      num_lfs: number of labeling functions (columns of the result).

   Returns:
      CSR matrix with one row per token (sentences stacked in order) and one
      column per labeling function. Cells that were never assigned are 0.
   """
   # vstack() raises on an empty list, so return an empty matrix directly.
   if len(Xs) == 0:
      return csr_matrix((0, num_lfs))

   Yws = []
   for sent_i in range(len(Xs)):
      ys = dok_matrix((len(Xs[sent_i]), num_lfs))
      for lf_i in range(num_lfs):
         lf_labels = Ls[sent_i][lf_i]
         if lf_labels is None:
            # A labeling function without a result contributes no labels.
            continue
         for word_i, y in lf_labels.items():
            ys[word_i, lf_i] = y
      Yws.append(ys)
   return csr_matrix(vstack(Yws))


  
# labeling functions
def moneyLF(x):
   """Keyword labeling function for money mentions.

   Tokens found in the "begin" list get ``b_money`` and tokens found in the
   "inside" list get ``i_money``. Afterwards, scanning left to right, an
   unlabeled token directly after a ``b_money`` token becomes ``i_money`` and
   an unlabeled token directly before an ``i_money`` token becomes ``b_money``.

   Args:
      x: list of tokens of one sentence.

   Returns:
      dict mapping token index -> money tag id for the labeled tokens.
   """
   inside_words = ('dinar','dinars','dollars','million','millions','donor','jordanian','billion','billions')
   begin_words = ('$','£','wage','wages','salaries','money')

   labels = {}
   for pos, token in enumerate(x):
      # The begin list takes precedence over the inside list.
      if token in begin_words:
         labels[pos] = b_money
      elif token in inside_words:
         labels[pos] = i_money

   n_tokens = len(x)
   for pos in range(n_tokens):
      current = labels.get(pos)
      if current is None:
         continue

      # Extend a begin tag onto the following unlabeled token.
      if current == b_money and pos + 1 < n_tokens and labels.get(pos + 1) is None:
         labels[pos + 1] = i_money

      # Give an inside tag a begin tag on the preceding unlabeled token.
      if current == i_money and pos >= 1 and labels.get(pos - 1) is None:
         labels[pos - 1] = b_money

   return labels



def quantityLF(x):
   """Keyword labeling function for quantity mentions.

   Tokens found in the unit/magnitude list get ``i_quan``. Afterwards,
   scanning left to right, an unlabeled token directly before an ``i_quan``
   token becomes ``b_quan``.

   Args:
      x: list of tokens of one sentence.

   Returns:
      dict mapping token index -> quantity tag id for the labeled tokens.
   """
   unit_words = ('percent','miles','domains','million','millions','thousand','thousands','km','kms','mi','square','squares','m3','acre','acres','%','beds','plus','kilometers','kilometer')

   labels = {}
   for pos, token in enumerate(x):
      if token in unit_words:
         labels[pos] = i_quan

   # Position 0 has no predecessor, so the scan starts at 1.
   for pos in range(1, len(x)):
      if labels.get(pos) == i_quan and labels.get(pos - 1) is None:
         labels[pos - 1] = b_quan

   return labels

# document reference LF
def docLF(x):
   """Keyword labeling function for document-reference mentions.

   Tokens found in the "begin" list get ``b_doc`` and tokens found in the
   "inside" list get ``i_doc``. Afterwards, scanning left to right, an
   unlabeled token directly after a ``b_doc`` token becomes ``i_doc`` and an
   unlabeled token directly before an ``i_doc`` token becomes ``b_doc``.

   Args:
      x: list of tokens of one sentence.

   Returns:
      dict mapping token index -> document tag id for the labeled tokens.
   """
   inside_words = ('book','books','chapter','resolution','text','texts','report','reports')
   # Entries are compared against whole tokens, so the two-word entries only
   # match a token that itself contains the space.
   begin_words = ('constitution','un','united nation','united nations','executive','executives')

   labels = {}
   for pos, token in enumerate(x):
      # The begin list takes precedence over the inside list.
      if token in begin_words:
         labels[pos] = b_doc
      elif token in inside_words:
         labels[pos] = i_doc

   n_tokens = len(x)
   for pos in range(n_tokens):
      current = labels.get(pos)
      if current is None:
         continue

      # Extend a begin tag onto the following unlabeled token.
      if current == b_doc and pos + 1 < n_tokens and labels.get(pos + 1) is None:
         labels[pos + 1] = i_doc

      # Give an inside tag a begin tag on the preceding unlabeled token.
      if current == i_doc and pos >= 1 and labels.get(pos - 1) is None:
         labels[pos - 1] = b_doc

   return labels

def locLF(x):
   """Preposition-based labeling function for location mentions.

   Each token found in the preposition list gets ``b_loc`` and the token right
   after it, if there is one, gets ``i_loc``. A later preposition overwrites an
   ``i_loc`` written by the previous one.

   Args:
      x: list of tokens of one sentence.

   Returns:
      dict mapping token index -> location tag id for the labeled tokens.
   """
   prepositions = ('in', 'near', 'above', 'over', 'by', 'along', 'around')

   labels = {}
   last_pos = len(x) - 1
   for pos, token in enumerate(x):
      if token not in prepositions:
         continue
      labels[pos] = b_loc
      if pos < last_pos:
         labels[pos + 1] = i_loc

   return labels

def keywordMatching(s, words_list, iTAG, bTAG):
   """Tag the tokens of ``s`` that match a keyword, case-insensitively.

   The first token of a run of consecutive matches gets ``bTAG``; every
   following token of the same run gets ``iTAG``.

   Args:
      s: one sentence, either a list of tokens or a string (split on
         whitespace).
      words_list: iterable of keyword strings.
      iTAG: label for a matched token whose predecessor was also matched.
      bTAG: label for a matched token that starts a run.

   Returns:
      dict mapping token index -> ``bTAG``/``iTAG`` for the matched tokens.
   """
   keywords = {word.lower() for word in words_list}
   # Sentences are token lists; calling .lower() on the list itself fails, and
   # iterating a raw string would walk over characters instead of words.
   tokens = s.split() if isinstance(s, str) else s

   label_dict = dict()
   for i, token in enumerate(tokens):
      if token.lower() not in keywords:
         continue
      # Only matched tokens are stored, so "previous index present" means the
      # previous token was tagged (B or I) and this one continues the run.
      # Membership is used instead of label_dict[i-1], which raised KeyError
      # whenever the previous token was unlabeled.
      if (i - 1) in label_dict:
         label_dict[i] = iTAG
      else:
         label_dict[i] = bTAG
   return label_dict

def weaponLF(s):
   """Keyword labeling function for weapon mentions.

   Args:
      s: one sentence, passed unchanged to ``keywordMatching``.

   Returns:
      The result of ``keywordMatching`` for the weapon keyword list with
      ``i_weapon`` / ``b_weapon`` as the inside / begin tags.
   """
   weapon_words_list = ['weapons','strikes','chemical','rocket','artillery','munitions','strike','VBIED','gun','mortar','propelled','heavy',
                            'machine','VBIEDs','bombs','grenade','missile','bomb','barrel','launcher','front','end','weapon','systems','IED']
   # The result has to be returned; without it the function yields None and
   # its labels never reach the label matrix.
   return keywordMatching(s, weapon_words_list, i_weapon, b_weapon)


def nationalityLF(s):
   """Keyword labeling function for nationality mentions.

   Args:
      s: one sentence, passed unchanged to ``keywordMatching``.

   Returns:
      The result of ``keywordMatching`` for the nationality keyword list with
      ``i_nationality`` / ``b_nationality`` as the inside / begin tags.
   """
   nationality_words_list =['Syrian','Iraqi','Kurdish','Yazidi','Arabic','Australian','Shia','Arab',
                        'Islam','Shiite','German','Jordan','Israeli','Muslims','Yazidis','Kurmanji','Northern']
   # The result has to be returned; without it the function yields None and
   # its labels never reach the label matrix.
   return keywordMatching(s, nationality_words_list, i_nationality, b_nationality)

def orgLF(s):
   """Keyword labeling function for organisation mentions.

   Args:
      s: one sentence, passed unchanged to ``keywordMatching``.

   Returns:
      The result of ``keywordMatching`` for the organisation keyword list with
      ``i_org`` / ``b_org`` as the inside / begin tags.
   """
   organisation_keywords = [
      'forces', 'Iraq', 'Syrian', 'Coalition', 'United', 'people', 'Iraqi', 'group', 'government', 'regime',
      'tactical', 'Forces', 'States', 'Government', 'international', 'Security', 'coalition', 'partners', 'Council', 'UN',
      'ISIL', 'military', 'fighters', 'State', 'Department', 'units', 'UK', 'esh', 'community', 'U.S.',
      'Command', 'civilians', 'security', 'parties', 'members', 'Force', 'unit', 'SDF', 'terrorist', 'victims',
      'officials', 'Arab', 'terrorists', 'leaders', 'Central', 'residents', 'Levant', 'allies', 'Democratic', 'opposition',
      'Army', 'families', 'Assad', 'family', 'two', 'Asad', 'Iraqis', 'Islamic', 'Joint', 'Union',
      'Qaida', 'Nations', 'Syria', 'Group', 'Members', 'Air', 'Battalion', 'officers', 'organization', 'Marines',
      'enemy', 'Task', 'EU', 'women', 'nations', 'friends', 'European', 'Syrians', 'countries',
   ]
   matches = keywordMatching(s, organisation_keywords, i_org, b_org)
   return matches

def temporalLF(s):
   """Keyword labeling function for temporal mentions.

   Args:
      s: one sentence, passed unchanged to ``keywordMatching``.

   Returns:
      The result of ``keywordMatching`` for the temporal keyword list with
      ``i_temporal`` / ``b_temporal`` as the inside / begin tags.
   """
   temporal_keywords = [
      'year', 'hours', 'days', 'December', 'years', '2006', 'week', '2016', 'November', 'past',
      'last', 'day', 'months', 'Thanksgiving', 'Dec.', '15', '13', 'weeks', '2005', 'ago',
      'Day', 'night', '26', 'end', '27', 'February', '2015', 'four', 'morning', '6',
      '10', '19', 'following', '2007', '7', 'festive', 'just', 'before', 'Muslim', 'Eid',
      'celebration', '14th', 'later', '3', 'Z', 'Time', '2008', 'Sunday', 'May', '88',
      '91', 'first', 'long', '1979', 'January', 'eve', 'a', 'New', 'Year', '2013',
      'month', 'a.m.', '29', '5', '1', '16', '18', 'coming', 'and', '2014',
      'two', 'March', 'three', 'attack', 'minutes', 'June', 'from', 'late', 'to', 'early',
      'evening', 'September', '20', 'plus', '6th', '7th', '25', '2011', 'summer', '2017',
      'on', 'Thursday', '28', 'July', '2009', 'weekend', 'August', 'in', 'or', 'seven',
   ]
   matches = keywordMatching(s, temporal_keywords, i_temporal, b_temporal)
   return matches

# Labeling functions in column order of the label matrix.
lfs = [moneyLF, quantityLF, locLF, nationalityLF, weaponLF, docLF, orgLF, temporalLF]

# apply labeling functions and transform label matrix
# Step 1: one list per sentence holding the output of every labeling function.
lf_outputs_per_sentence = [[lf(s) for lf in lfs] for s in sents]

# Step 2: stack the per-sentence outputs into a token x LF sparse matrix.
token_label_matrix = create_token_L_mat(sents, lf_outputs_per_sentence, len(lfs))

# Step 3: dense int8 ndarray, the form passed to the label model below.
L = np.asarray(token_label_matrix.astype(np.int8).todense())

In [None]:
from snorkel.labeling.model import LabelModel

# The label matrix holds the tag ids defined in `iob_dict` (0..20), so the
# model needs one class per id; a smaller cardinality is rejected by fit().
num_classes = max(iob_dict.values()) + 1

# NOTE(review): cells of L that no labeling function filled are 0, which is
# also the id of the 'o' tag, while Snorkel's abstain value is -1 - confirm
# that treating "no label" as a vote for 'o' is intended.
label_model = LabelModel(cardinality=num_classes, verbose=True)
label_model.fit(L_train=L, n_epochs=500, log_freq=100, seed=123)

label_model.predict(L)


In [None]:
# finding accuracy
label_model_acc = label_model.score(L=L, Y=num_iob_tags, tie_break_policy="random")["accuracy"]
print(label_model_acc)