In [None]:
import re

import pandas as pd
from snorkel.labeling import labeling_function,PandasLFApplier, LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter, LabelModel

In [None]:
import csv

# Column names are supplied explicitly through `names`.
names = ["qid", 'query', "did", 'url']

# Tab-separated input; quoting is switched off with csv.QUOTE_NONE.
df = pd.read_csv(
    "../data/input/orcas_small.tsv",
    sep='\t',
    names=names,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Make sure every value in the query column is a Python string, since the
# labeling functions call string methods such as .lower() on it.
df = df.assign(query=df['query'].astype(str))

In [None]:
# Integer codes returned by the labeling functions: ABSTAIN is returned when a
# rule does not fire, FACTUAL and INSTRUMENTAL are the two classes.
ABSTAIN, FACTUAL, INSTRUMENTAL = -1, 0, 1

In [None]:
# Split the rows 50/50 into a train and a test partition.
# random_state pins the sample so that Restart & Run All reproduces the same
# split (123 is the same seed value that is passed to label_model.fit).
df_train = df.sample(frac=0.5, random_state=123)
# Every row whose index was not drawn into df_train goes to the test partition.
df_test = df[~df.index.isin(df_train.index)]

In [None]:
# Sanity check: (rows, columns) of the train split, the test split and the
# full frame, one per line.
print(df_train.shape, df_test.shape, df.shape, sep="\n")

In [None]:
from snorkel.preprocess.nlp import SpacyPreprocessor

# Preprocessor handed to the spaCy-based labeling functions via `pre=[spacy]`:
# it parses the text found in the "query" field and stores the enriched
# representation in the "doc" field.
spacy = SpacyPreprocessor(
    text_field="query",
    doc_field="doc",
    memoize=True,
)

In [None]:
@labeling_function(pre=[spacy])
def lf_is_verb(x):
    """Vote INSTRUMENTAL when the query contains a verb in its base form.

    The ``spacy`` preprocessor attaches the parsed query as ``x.doc``. A token
    counts when its part-of-speech tag is ``"VERB"`` and its surface text is
    identical to its lemma.

    This cell used to define ``lf_is_verb`` twice. The first definition, which
    only inspected ``x.doc[0]``, was silently shadowed by this one and was
    therefore never the function collected into ``lfs``. It is dropped: this
    any-token rule fires whenever the first-token rule would have, and it does
    not index into a document that may be empty.

    Returns:
        INSTRUMENTAL if any token is a base-form verb, otherwise ABSTAIN.
    """
    # Generator instead of a list so `any` can stop at the first hit.
    has_base_verb = any(
        token.pos_ == "VERB" and token.text == token.lemma_ for token in x.doc
    )
    return INSTRUMENTAL if has_base_verb else ABSTAIN

In [None]:
# Interrogatives are matched as whole words, so queries such as "show ...",
# "whole ..." or "whatsapp ..." no longer fire on an embedded "how"/"who"/"what".
_INTERROGATIVE_RE = re.compile(r"\b(?:why|what|when|who|where|how)\b")


@labeling_function()
def lf_keyword_lookup(x):
  """Vote FACTUAL for queries that contain an interrogative word.

  The match is case-insensitive (the query is lower-cased first) and requires
  the interrogative to stand as its own word. Queries containing the phrase
  "how to" are skipped here; lf_howto labels those as INSTRUMENTAL.

  Returns:
      FACTUAL if an interrogative is present and "how to" is not, else ABSTAIN.
  """
  query = x.query.lower()
  # Checked once up front instead of once per keyword.
  if "how to" in query:
    return ABSTAIN
  return FACTUAL if _INTERROGATIVE_RE.search(query) else ABSTAIN

In [None]:
# A question opener only counts when it is the complete first word of the query
# (optionally followed by the contraction "n't"). A plain prefix test would
# also accept "download", "island", "canada", ...
_QUESTION_OPENER_RE = re.compile(r"(?:is|can|do|does)(?:n't)?\b")


@labeling_function()
def lf_question_words(x):
  """Vote FACTUAL for queries that open with "is", "can", "do" or "does".

  The query is lower-cased and must start with one of these words as a whole
  word; forms such as "isn't", "don't", "doesn't" and "can't" still match.

  Returns:
      FACTUAL if the query starts with a question opener, else ABSTAIN.
  """
  # .match anchors at the start of the string, like the original startswith.
  return FACTUAL if _QUESTION_OPENER_RE.match(x.query.lower()) else ABSTAIN

In [None]:
@labeling_function()
def lf_facts_lookup(x):
  """Vote FACTUAL when the lower-cased query contains a facts/statistics term.

  Matching is a plain substring test against each term.
  """
  text = x.query.lower()
  for term in ("facts", "statistics", "quantity", "quantities"):
    if term in text:
      return FACTUAL
  return ABSTAIN

In [None]:
@labeling_function()
def lf_finance_lookup(x):
  """Vote FACTUAL when the lower-cased query contains a finance-related term.

  Matching is a plain substring test against each term.
  """
  text = x.query.lower()
  finance_terms = ("average", "sum", "cost", "amount", "salary", "salaries", "pay")
  found = any(term in text for term in finance_terms)
  if found:
    return FACTUAL
  return ABSTAIN

In [None]:
@labeling_function()
def lf_phone(x):
  """Vote FACTUAL when the lower-cased query contains a number/code term.

  Matching is a plain substring test against each term.
  """
  lowered = x.query.lower()
  for needle in ("number", "phone", "code", "zip"):
    if needle in lowered:
      return FACTUAL
  return ABSTAIN

In [None]:
@labeling_function()
def lf_definition(x):
  """Vote FACTUAL when the lower-cased query asks for a definition or meaning.

  Matching is a plain substring test against each term.
  """
  lowered = x.query.lower()
  matches = [term for term in ("define", "definition", "meaning") if term in lowered]
  return FACTUAL if matches else ABSTAIN

In [None]:
@labeling_function()
def lf_howto(x):
  """Vote INSTRUMENTAL when the lower-cased query contains the phrase "how to"."""
  if "how to" in x.query.lower():
    return INSTRUMENTAL
  return ABSTAIN

In [None]:
import re
@labeling_function()
def lf_digit(x):
    """Vote FACTUAL when the query contains at least one digit, else ABSTAIN."""
    digit_match = re.search(r"\d", x.query, flags=re.I)
    if digit_match is None:
        return ABSTAIN
    return FACTUAL

In [None]:
# The SpacyPreprocessor is already imported and instantiated once, before the
# labeling functions are defined. Building it again here had no effect on
# lf_is_verb (its `pre=[spacy]` list was evaluated when the function was
# decorated) and only rebound `spacy` to a second, unused instance, so the
# duplicate import and construction were removed from this cell.

In [None]:
# Labeling functions handed to the applier.
lfs = [
    lf_keyword_lookup,
    lf_howto,
    lf_phone,
    lf_digit,
    lf_finance_lookup,
    lf_facts_lookup,
    lf_definition,
    lf_question_words,
    lf_is_verb,
]
applier = PandasLFApplier(lfs=lfs)
# NOTE: despite its name, L_train is computed over the full `df`, not
# `df_train`; the predictions derived from it are written back onto `df`.
L_train = applier.apply(df=df)

In [None]:
# Show the result of applier.apply(df=df) as this cell's output.
L_train

In [None]:
# Summary of the labeling functions in `lfs` over the applied labels.
LFAnalysis(
    L=L_train,
    lfs=lfs,
).lf_summary()

In [None]:
# Build the model: a two-class LabelModel with verbose output enabled.
label_model = LabelModel(cardinality=2, verbose=True)

# Fit it on the applied labels with a fixed seed.
label_model.fit(
    L_train=L_train,
    n_epochs=500,
    log_freq=100,
    seed=123,
)

In [None]:
# Attach the label model's prediction to every row of df; ties are resolved
# with the "abstain" policy.
df['Labels'] = label_model.predict(
    L=L_train,
    tie_break_policy="abstain",
)

In [None]:
# How many rows received each predicted label value.
df.Labels.value_counts()

In [None]:
# Rows whose predicted label is FACTUAL.
df.loc[df["Labels"] == FACTUAL]

In [None]:
# Rows whose predicted label is INSTRUMENTAL.
df.loc[df["Labels"] == INSTRUMENTAL]

In [None]:
import seaborn as sns

In [None]:
# Bar chart of how often each predicted label value occurs in df.
sns.countplot(data=df, x="Labels")

In [None]:
# Imported under an alias on purpose: a plain `import spacy` would rebind the
# notebook-level name `spacy`, which holds the SpacyPreprocessor referenced by
# `pre=[spacy]`. Re-running the labeling-function cell after this one would then
# pass the module instead of the preprocessor.
import spacy as spacy_lib

# Scratch check of the base-form-verb rule on a single example query.
nlp = spacy_lib.load("en_core_web_sm")
doc = nlp("make nerve in neck")

# Print the first token only when it is tagged VERB and equals its own lemma.
if doc[0].pos_ == "VERB" and doc[0].text == doc[0].lemma_:
    print(doc[0].text)