In [119]:
import pandas as pd
from snorkel.labeling import labeling_function,PandasLFApplier, LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter, LabelModel

In [120]:
import csv
names = ["qid", 'query', "did", 'url']
df = pd.read_csv("../data/input/orcas_small.tsv", sep='\t', names=names, quoting=csv.QUOTE_NONE)

In [121]:
df['query'] = df['query'].astype(str)

In [122]:
INSTRUMENTAL = 1
FACTUAL = 0
ABSTAIN = -1

In [123]:
df_train = df.sample(frac=0.5)
df_test = df[~df.index.isin(df_train.index)]

In [124]:
print(df_train.shape)
print(df_test.shape)
print(df.shape)

(50000, 4)
(50000, 4)
(100000, 4)


In [125]:
from snorkel.preprocess.nlp import SpacyPreprocessor

# The SpacyPreprocessor parses the text in text_field and
# stores the new enriched representation in doc_field
spacy = SpacyPreprocessor(text_field="query", doc_field="doc", memoize=True)

In [126]:
@labeling_function(pre=[spacy])
def lf_is_verb(x):
    if(x.doc[0].pos_ == "VERB" and x.doc[0].text == x.doc[0].lemma_):
        return INSTRUMENTAL
    else:
        return ABSTAIN

In [127]:
@labeling_function()
def lf_keyword_lookup(x):
  keywords = ["why","what","when","who","where","how"]
  return FACTUAL if any(word in x.query.lower() and "how to" not in x.query.lower() for word in keywords) else ABSTAIN

In [128]:
@labeling_function()
def lf_howto(x):
  keywords = ["how to"]
  return INSTRUMENTAL if any(word in x.query.lower() for word in keywords) else ABSTAIN

In [129]:
import re
@labeling_function()
def lf_digit(x):
    return FACTUAL if re.search(r"\d", x.query, flags=re.I) else ABSTAIN

In [130]:
from snorkel.preprocess.nlp import SpacyPreprocessor
spacy = SpacyPreprocessor(text_field="query", doc_field="doc", memoize=True)

In [131]:
lfs = [lf_keyword_lookup,lf_howto,lf_digit,lf_is_verb]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

100%|██████████████████████████████████| 100000/100000 [09:28<00:00, 175.80it/s]


In [132]:
L_train

array([[-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       ...,
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1]])

In [133]:
LFAnalysis(L=L_train,lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_keyword_lookup,0,[0],0.08428,0.00383,0.00042
lf_howto,1,[1],0.02085,0.00104,0.00098
lf_digit,2,[0],0.06255,0.00776,0.00435
lf_is_verb,3,[1],0.05793,0.0038,0.00374


In [134]:
# Build the model
label_model = LabelModel(cardinality=2,verbose=True)
label_model.fit(L_train=L_train,n_epochs=500,log_freq=100,seed=123)

In [135]:
df['Labels'] = label_model.predict(L=L_train,tie_break_policy="abstain")

In [136]:
df["Labels"].value_counts()

-1    78263
 0    13870
 1     7867
Name: Labels, dtype: int64

In [137]:
df[df.Labels == FACTUAL]

Unnamed: 0,qid,query,did,url,Labels
3,8991004,what happened to general electric,D119293,https://en.wikipedia.org/wiki/General_Electric,0
18,11286698,2018 printable philadelphia eagles schedule,D361329,http://printableteamschedules.com/NFL/philadel...,0
26,6905160,how is citric acid made,D354672,https://en.wikipedia.org/wiki/Citric_acid,0
45,4251659,hudson valley 10 day weather,D1164954,http://hudsonvalley.news12.com/weather,0
46,6220762,size 34 in us,D3510418,http://www.usatourist.com/english/traveltips/s...,0
...,...,...,...,...,...
99939,8879707,what did apple,D1728419,https://support.apple.com/apple-id,0
99970,11690984,what are edamame beans,D2953467,https://www.thespruce.com/what-is-edamame-3376830,0
99977,11841957,what does a cavity feel like,D2191432,https://www.livescience.com/44223-cavities-too...,0
99979,6637930,how many carbs in alcohol chart,D949888,http://getdrunknotfat.com/,0


In [138]:
df[df.Labels == INSTRUMENTAL]

Unnamed: 0,qid,query,did,url,Labels
7,8195949,convert mcg to mg,D218554,http://www.thecalculatorsite.com/conversions/c...,1
9,10385598,calculate my cumulative college gpa,D838604,http://gpacalculator.net/college-gpa-calculator/,1
20,3810622,boei,D3286390,https://acronyms.thefreedictionary.com/BOEI,1
28,12096848,setup new email,D761831,https://support.google.com/mail/answer/56256?h...,1
32,2138980,instigate meaning,D1653637,https://www.merriam-webster.com/dictionary/ins...,1
...,...,...,...,...,...
99934,6918133,how to put nanny on resume,D2377862,http://work.chron.com/add-nanny-experience-res...,1
99949,11721387,read my essay,D269418,https://ttsreader.com/,1
99953,3272120,asteroid names,D288198,https://en.wikipedia.org/wiki/Asteroid,1
99957,3293413,set up an email address,D789518,https://uk.mail.yahoo.com/,1


In [63]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("make nerve in neck")

if(doc[0].pos_ == "VERB" and doc[0].text == doc[0].lemma_):
    print(doc[0].text)

make
