In [None]:
import pandas as pd
from snorkel.labeling import labeling_function,PandasLFApplier, LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter, LabelModel

In [None]:
import csv

# Column names are passed explicitly via `names`, so pandas does not take
# them from the first row of the file.
names = ["qid", "query", "did", "url"]

df = pd.read_csv(
    "../data/input/orcas_small.tsv",
    sep="\t",
    names=names,
    # QUOTE_NONE: quote characters inside a query are kept as literal text
    # instead of being interpreted as field delimiters.
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# The labeling functions call `.lower()` on the query, so every value must be a str.
# NOTE(review): missing queries, if any, are stringified rather than dropped —
# confirm that is intended.
df["query"] = df["query"].astype(str)

In [None]:
# Words checked against the first token of a query; lf_movie_name_lookup
# abstains when the query starts with one of them.
informational_start_words = [
    # wh-/how question words
    "why", "what", "when", "who", "where", "how",
    # auxiliary verbs that open yes/no questions
    "is", "can", "do", "does",
]


In [None]:
from snorkel.preprocess.nlp import SpacyPreprocessor

# Preprocessor used by the labeling functions declared with `pre=[spacy]`:
# it parses the text found in `text_field` and stores the enriched
# representation on the data point under `doc_field` (read as `x.doc`).
spacy = SpacyPreprocessor(
    memoize=True,
    text_field="query",
    doc_field="doc",
)


In [None]:
# Label values returned by the labeling functions.
# ABSTAIN is what a labeling function returns when its rule does not fire.
ABSTAIN = -1
NAVIGATIONAL = 0
TRANSACTIONAL = 1

In [None]:
# Split the queries into two disjoint halves.
# A fixed random_state makes the split identical on every Restart & Run All;
# without it, df.sample draws a different half on each execution.
df_train = df.sample(frac=0.5, random_state=123)
# Every row whose index label was not drawn into df_train.
df_test = df[~df.index.isin(df_train.index)]

In [None]:
# Shapes of the train split, the test split and the full frame, one per line.
print(*(frame.shape for frame in (df_train, df_test, df)), sep="\n")

In [None]:
import re

# "download", "obtain", "access" and "redeem" stay plain substring matches.
# "earn" is anchored to a word start (\b) because as a bare substring it also
# fires on "learn" / "learning", which are not transactional queries.
_DOWNLOAD_KEYWORDS_RE = re.compile(r"download|obtain|access|redeem|\bearn")


@labeling_function()
def lf_download_lookup(x):
    """Vote TRANSACTIONAL when the query contains an acquisition keyword.

    Args:
        x: data point exposing the query text as ``x.query`` (a str).

    Returns:
        TRANSACTIONAL if the lower-cased query matches one of the keywords
        described above, otherwise ABSTAIN.
    """
    return TRANSACTIONAL if _DOWNLOAD_KEYWORDS_RE.search(x.query.lower()) else ABSTAIN

In [None]:
@labeling_function()
def lf_audio_video_lookup(x):
    """Vote TRANSACTIONAL when the query mentions a media type, else ABSTAIN.

    Each keyword is tested as a substring of the lower-cased ``x.query``.
    """
    lowered_query = x.query.lower()
    for media_word in ("audio", "video", "image", "images"):
        if media_word in lowered_query:
            return TRANSACTIONAL
    return ABSTAIN

In [None]:
movies_df = pd.read_csv("../data/helpers/movies.csv")
movie_names_list = movies_df['title'].tolist()

# Titles normalised for matching. The query is lower-cased before the lookup,
# so the titles must be lower-cased too or a capitalised title can never match.
# Missing titles (NaN) are dropped because `float in str` raises TypeError,
# and empty strings are removed because "" is a substring of every query.
_movie_names_lower = sorted(
    {str(title).strip().lower() for title in movies_df["title"].dropna()} - {""}
)


@labeling_function(pre=[spacy])
def lf_movie_name_lookup(x):
    """Vote TRANSACTIONAL when the query contains a known movie title.

    Args:
        x: data point with the query text in ``x.query`` and the spaCy parse
            produced by the ``spacy`` preprocessor in ``x.doc``.

    Returns:
        ABSTAIN when the query starts with one of
        ``informational_start_words`` or has no tokens; TRANSACTIONAL when a
        title occurs in the lower-cased query; ABSTAIN otherwise.
    """
    # Guard the index: a doc without tokens would raise IndexError on doc[0].
    if len(x.doc) == 0 or x.doc[0].text.lower() in informational_start_words:
        return ABSTAIN

    query_text = x.query.lower()
    # NOTE(review): this is a substring test, so a very short title can match
    # inside a longer word — check this function's coverage in the LF summary.
    if any(movie_name in query_text for movie_name in _movie_names_lower):
        return TRANSACTIONAL
    return ABSTAIN




In [None]:
@labeling_function()
def lf_extension_lookup(x):
    """Vote TRANSACTIONAL when a file-extension word appears as a whole token.

    The lower-cased query is split on whitespace and each token is compared
    for equality with the extension names; ABSTAIN when none is present.
    """
    extensions = {"jpeg", "zip", "rar", "png", "mp3"}
    query_tokens = x.query.lower().split()
    if extensions.isdisjoint(query_tokens):
        return ABSTAIN
    return TRANSACTIONAL

In [None]:
@labeling_function()
def lf_transaction_lookup(x):
    """Vote TRANSACTIONAL when the query contains a purchase-related term.

    Terms are tested as substrings of the lower-cased ``x.query``; ABSTAIN
    when none of them occurs.
    """
    transaction_terms = (
        "online",
        "free",
        "transaction",
        "buy",
        "chat",
        "purchase",
        "shop for",
        "procure",
        "complimentary",
        "gratuitous",
        "payment",
    )
    query_text = x.query.lower()
    first_hit = next((term for term in transaction_terms if term in query_text), None)
    return ABSTAIN if first_hit is None else TRANSACTIONAL

In [None]:
@labeling_function()
def lf_www(x):
    """Vote NAVIGATIONAL when the query contains a URL marker, else ABSTAIN.

    Markers are tested as substrings of the lower-cased ``x.query``.
    """
    lowered = x.query.lower()
    if not any(marker in lowered for marker in ("www", "http", "https")):
        return ABSTAIN
    return NAVIGATIONAL

In [None]:
@labeling_function()
def lf_login(x):
    """Vote NAVIGATIONAL when the query contains a login phrase, else ABSTAIN.

    Phrases are tested as substrings of the lower-cased ``x.query``.
    """
    text = x.query.lower()
    label = ABSTAIN
    for phrase in ("login", "signin", "log in", "sign in"):
        if phrase in text:
            label = NAVIGATIONAL
            break
    return label


In [None]:
@labeling_function(pre=[spacy])
def lf_has_ner(x):
    """Vote NAVIGATIONAL when the query names an organisation or a person.

    Args:
        x: data point carrying the spaCy parse of the query in ``x.doc``.

    Returns:
        NAVIGATIONAL if ``x.doc.ents`` contains an entity labelled "ORG" or
        "PERSON" and the query does not start with one of
        ``informational_start_words``; ABSTAIN otherwise.
    """
    doc = x.doc
    if not any(ent.label_ in ("ORG", "PERSON") for ent in doc.ents):
        return ABSTAIN
    # An entity was found, so the doc has at least one token and doc[0] is safe.
    # The first token is lower-cased so "What ..." is treated like "what ...",
    # matching the check in lf_movie_name_lookup, and the shared word list is
    # used instead of a second hard-coded copy.
    if doc[0].text.lower() in informational_start_words:
        return ABSTAIN
    return NAVIGATIONAL


In [None]:
# Apply every labeling function to the data and collect the label matrix.
lfs = [
    lf_download_lookup,
    lf_audio_video_lookup,
    lf_movie_name_lookup,
    lf_extension_lookup,
    lf_transaction_lookup,
    lf_www,
    lf_login,
    lf_has_ner,
]
applier = PandasLFApplier(lfs=lfs)
# NOTE(review): despite its name, L_train is built from the full `df`,
# not from `df_train` — confirm that is intended.
L_train = applier.apply(df=df)

In [None]:
# Display the label matrix returned by applier.apply.
L_train

In [None]:
# Summary statistics for each labeling function over L_train.
LFAnalysis(L=L_train,lfs=lfs).lf_summary()

In [None]:
# NOTE(review): this cell repeats the lf_summary() call of the preceding cell
# with identical arguments — likely a leftover that can be removed.
LFAnalysis(L=L_train,lfs=lfs).lf_summary()

In [None]:
# Fit the label model on the labeling-function outputs.
# cardinality=2 matches the two non-abstain labels (NAVIGATIONAL, TRANSACTIONAL).
label_model = LabelModel(verbose=True, cardinality=2)
label_model.fit(
    L_train=L_train,
    n_epochs=500,
    log_freq=100,
    seed=123,
)


In [None]:

# Store the label model's prediction for every row of df in a new column;
# the tie-break policy is set to "abstain".
df["Labels"] = label_model.predict(
    L=L_train,
    tie_break_policy="abstain",
)

In [None]:
# Number of rows assigned to each predicted label value.
df["Labels"].value_counts()

In [None]:
# Rows whose predicted label is TRANSACTIONAL.
df.loc[df["Labels"] == TRANSACTIONAL]


In [None]:
# Rows whose predicted label is NAVIGATIONAL.
df.loc[df["Labels"] == NAVIGATIONAL]