Step 1: install dependencies

In [156]:
!pip3 install tira snorkel wows-eval textdistance rank-bm25 rapidfuzz



In [157]:
# Fetch the inputs of the TIRA dataset; later cells read the `query`,
# `relevant` and `unknown` columns of the resulting frame.
from tira.rest_api_client import Client
tira = Client()
# Dataset identifier; reused when the predictions are evaluated.
DATASET_ID = 'wows-eval/pairwise-smoke-test-20250210-training'
input_data = tira.pd.inputs(DATASET_ID)
# Bare last expression so the notebook renders the frame.
input_data

Unnamed: 0,id,query,relevant,unknown
0,3d080873-98a1-4388-af86-fe2c8b47ebca,who sings monk theme song,exists and is an alternate of . The Monk theme...,Randy Newman (album) Randy Newman is the debut...
1,468a9e92-467f-47c9-810b-fe6fa9dca634,who sings monk theme song,exists and is an alternate of . The Monk theme...,One of Monk's most important contributions to ...
2,846a69d0-0c0e-4d86-baf2-c3e8d31fdc86,who sings monk theme song,exists and is an alternate of . The Monk theme...,"Singing elegant, melancholic songs in a glamor..."
3,83c0e22c-b00f-4570-b1cb-027199c673d4,who sings monk theme song,exists and is an alternate of . The Monk theme...,"Walker, Texas Ranger. Chuck Norris thought “Ey..."
4,a88a0a31-4795-4c59-830b-848de52a7fd6,who sings monk theme song,exists and is an alternate of . The Monk theme...,This is a reference to the minor controversy t...
5,19f3efb5-e3ba-460a-b630-9ed3c20ada6f,who sings monk theme song,exists and is an alternate of . The Monk theme...,What is the music in chapter 33 of House of Ca...
6,50131d5a-6415-43bd-8850-a41fcabf54e2,who sings monk theme song,exists and is an alternate of . The Monk theme...,"However, as Brave 's soundtrack reveals, the m..."
7,bbf50ee5-9021-494f-8a4c-14e3355d989a,who sings monk theme song,It's a Jungle Out There is a song written by R...,Randy Newman (album) Randy Newman is the debut...
8,c8f4b93c-982e-4898-a8d8-d4b9dcdbd353,who sings monk theme song,It's a Jungle Out There is a song written by R...,One of Monk's most important contributions to ...
9,51053bb7-1f15-46fa-8891-440937d8b915,who sings monk theme song,It's a Jungle Out There is a song written by R...,"Singing elegant, melancholic songs in a glamor..."


Step 2: load all the libraries

In [158]:
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel
from tira.rest_api_client import Client
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from wows_eval import evaluate as wows_evaluate
from tira_measure import Environment
from rapidfuzz import fuzz
import numpy as np
import textdistance
import pandas as pd
# Creates another Client and rebinds the `tira` name that the data-loading cell already set.
tira = Client()
# Module-level TF-IDF vectorizer; tfidf_cosine_similarity_prob_diff refits it on every call.
vectorizer = TfidfVectorizer()
import re

In [159]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# Download the stopword corpus (NLTK reports it as up-to-date when already
# present), then build the English stop-word set used by tokenize_and_lemmatize.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
# Download the WordNet data before creating the lemmatizer shared by the helpers.
# NOTE(review): the imported `wordnet` name itself is not referenced in the visible cells.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 3: define global variables

In [160]:
# Global Variables
# NOTE(review): none of these three names is read or reassigned in the visible
# cells; presumably placeholders for a shared BM25 model and its score range,
# confirm they are still needed.
min_bm25 = None
max_bm25 = None
bm25 = None


In [161]:
# Global Min/Max Values (These should be computed dynamically)
# NOTE(review): GLOBAL_MIN / GLOBAL_MAX are not referenced by any visible cell;
# presumably intended as bounds for normalizing score differences, confirm.
GLOBAL_MIN = -1
GLOBAL_MAX = 1


Step 4: define helper functions

In [162]:

def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of regex word characters (letters, digits and
    underscore), so punctuation and whitespace only act as separators.
    """
    lowered = text.lower()
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(lowered)

def tokenize_and_lemmatize(text):
    """Return the set of lemmatized, lower-cased word tokens of ``text`` minus stop words.

    The result is a set, so token order and duplicates are not preserved.
    Stop words are removed after lemmatization, i.e. the lemma is what gets
    compared against ``stop_words``.
    """
    lemmas = set()
    for word in re.findall(r'\w+', text.lower()):
        lemmas.add(lemmatizer.lemmatize(word))
    return lemmas - stop_words

def get_ngrams(text, n=2):
    """Return the set of word n-grams of ``text``.

    The text is lower-cased, split into word tokens, lemmatized and stripped of
    stop words while keeping the reading order, so every n-gram is a run of
    ``n`` consecutive remaining tokens. Tokens must not be taken from a set
    here: set iteration order is arbitrary, which would make the n-grams
    unrelated to the text.

    Args:
        text: Input string.
        n: Number of tokens per n-gram (default 2).

    Returns:
        A set of ``n``-tuples of lemmas; empty when fewer than ``n`` tokens
        remain or when ``n`` is not positive.
    """
    tokens = []
    for word in re.findall(r'\w+', text.lower()):
        lemma = lemmatizer.lemmatize(word)
        # Filter on the lemma, matching tokenize_and_lemmatize.
        if lemma not in stop_words:
            tokens.append(lemma)
    # zip stops at the shortest slice, so too-short inputs yield no n-grams.
    return set(zip(*(tokens[i:] for i in range(n))))

Step 5: Create Snorkel labeling functions

In [163]:
from snorkel.labeling import labeling_function
from rank_bm25 import BM25Okapi

@labeling_function()
def bm25_prob_diff(input_data):
    """Return the BM25 score of the relevant document minus that of the unknown one.

    ``input_data`` is one row providing ``query``, ``relevant`` and ``unknown``
    texts. Each document is scored against the query with its own
    single-document BM25Okapi model; the raw, unnormalized difference is returned.

    NOTE(review): this returns a float, whereas Snorkel labeling functions
    conventionally return integer class labels (or -1 to abstain). Confirm how
    the applier and LabelModel are meant to interpret the value.
    """
    query_tokens = tokenize(input_data["query"])

    def _single_doc_score(document):
        # The corpus holds only this document, so the score list has one entry.
        model = BM25Okapi([tokenize(document)])
        return model.get_scores(query_tokens)[0]

    score_relevant = _single_doc_score(input_data["relevant"])
    score_unknown = _single_doc_score(input_data["unknown"])
    return score_relevant - score_unknown



@labeling_function()
def jaccard_similarity_prob_diff(input_data):
    """Return the Jaccard similarity difference between the relevant and unknown documents.

    For each document the query is compared with the string "query + document"
    on whitespace-separated tokens (case-sensitive, punctuation kept). The
    result is the relevant similarity minus the unknown similarity, as a raw float.
    """
    query = input_data["query"]
    query_terms = set(query.split())

    def _jaccard_with_query(document):
        # The compared text is the query concatenated with the document.
        combined_terms = set(f"{query} {document}".split())
        union_size = len(query_terms | combined_terms)
        if union_size == 0:
            return 0
        return len(query_terms & combined_terms) / union_size

    similarity_relevant = _jaccard_with_query(input_data["relevant"])
    similarity_unknown = _jaccard_with_query(input_data["unknown"])
    return similarity_relevant - similarity_unknown


@labeling_function()
def word_level_levenshtein_prob_diff(input_data):
    """Return the ``fuzz.ratio`` similarity difference between the relevant and unknown documents.

    Each document is compared with the query through ``fuzz.ratio``; the score
    is divided by 100 and the unknown value is subtracted from the relevant
    one. The raw float difference is returned.

    NOTE(review): despite the name, the strings are passed to ``fuzz.ratio``
    whole rather than split into words. Confirm whether a word-level
    comparison was intended.
    """
    query = input_data["query"]
    similarity_relevant = fuzz.ratio(query, input_data["relevant"]) / 100
    similarity_unknown = fuzz.ratio(query, input_data["unknown"]) / 100
    return similarity_relevant - similarity_unknown


@labeling_function()
def tfidf_cosine_similarity_prob_diff(input_data):
    """Return the TF-IDF cosine similarity difference between the relevant and unknown documents.

    The module-level ``vectorizer`` is refitted on three texts: the query,
    "query + relevant" and "query + unknown". The result is the cosine
    similarity of the query to the relevant text minus its similarity to the
    unknown text, as a raw float.
    """
    query = input_data["query"]
    corpus = [query]
    for column in ("relevant", "unknown"):
        corpus.append(f"{query} {input_data[column]}")

    # Row 0 is the query, row 1 the relevant text, row 2 the unknown text.
    weights = vectorizer.fit_transform(corpus)
    query_vector = weights[0]

    similarity_relevant = cosine_similarity(query_vector, weights[1])[0][0]
    similarity_unknown = cosine_similarity(query_vector, weights[2])[0][0]
    return similarity_relevant - similarity_unknown

# Sentence-embedding model, loaded once when this cell runs and reused by
# bert_cosine_similarity_prob_diff for every row.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')


@labeling_function()
def bert_cosine_similarity_prob_diff(input_data):
    """Return the embedding cosine similarity difference between the relevant and unknown documents.

    The query, "query [SEP] relevant" and "query [SEP] unknown" are encoded in
    one ``bert_model.encode`` call. The result is the cosine similarity of the
    query embedding to the relevant embedding minus its similarity to the
    unknown embedding, as a raw float.
    """
    query = input_data["query"]
    texts = [
        query,
        f"{query} [SEP] {input_data['relevant']}",
        f"{query} [SEP] {input_data['unknown']}",
    ]
    embeddings = bert_model.encode(texts)

    def _cosine(left, right):
        # Plain cosine similarity: dot product over the product of the norms.
        return np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))

    similarity_relevant = _cosine(embeddings[0], embeddings[1])
    similarity_unknown = _cosine(embeddings[0], embeddings[2])
    return similarity_relevant - similarity_unknown



Step 6: apply the labeling functions and train the Snorkel label model

In [164]:
# Labeling functions applied to every row; each one yields one column of L_train.
lfs = [bm25_prob_diff, word_level_levenshtein_prob_diff,tfidf_cosine_similarity_prob_diff, bert_cosine_similarity_prob_diff, jaccard_similarity_prob_diff]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(input_data)

# Fixed seed for the label model so that Restart & Run All reproduces the same
# probabilities; without it Snorkel chooses the training seed randomly.
LABEL_MODEL_SEED = 42

# NOTE(review): cardinality=4 and the use of column 1 as "relevant" below are
# not explained by the labeling functions, which return raw float differences
# rather than class ids. Confirm the intended label scheme.
label_model = LabelModel(cardinality=4, verbose=True)
label_model.fit(L_train, n_epochs=100, log_freq=10, seed=LABEL_MODEL_SEED)

train_prob = label_model.predict_proba(L_train)
train_prob_relevant = train_prob[:, 1]

# Stored on input_data because the evaluation cells copy this frame as the predictions.
input_data["probability_relevant"] = train_prob_relevant



100%|██████████| 39/39 [00:13<00:00,  2.87it/s]
100%|██████████| 100/100 [00:00<00:00, 478.69epoch/s]


In [165]:
# Convert L_train to a DataFrame with LF names as columns
df_L = pd.DataFrame(L_train, columns=[lf.name for lf in lfs])
#print("Sample Labeling Function Outputs:\n", df_L.head())

# Summary statistics (count, mean, quantiles, ...) of each labeling function's
# column. Nothing is displayed here; the result is only kept in `lf_stats`.
lf_stats = df_L.describe().T  # Transpose so LFs are rows
#print("\nLabeling Function Statistics:\n", lf_stats)

Step 7: Use wows_evaluate

In [166]:
# Run the block under tira_measure's Environment; the resulting
# `tracked_experiment` handle is passed to wows_evaluate afterwards.
with Environment().measure() as tracked_experiment:
    # now we do the "computation"
    # NOTE(review): only this DataFrame copy runs inside the measured block; the
    # labeling-function application and label-model training ran in earlier
    # cells, outside of it. Confirm that this is the intended scope.
    predictions = input_data.copy()

In [167]:
# Evaluate the predictions for DATASET_ID. With upload=True the run is also
# submitted to TIRA under the system name and description given here.
wows_evaluate(
    predictions,
    DATASET_ID,
    environment=tracked_experiment,
    upload=True,
    system_name='snorkel-pairwise',
    system_description='A snorkel pairwise approach'
)

Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/28613c50-2d13-42ec-93f2-64dda0c3c98c


Unnamed: 0,system,tau_ap,kendall,spearman,pearson
0,snorkel-pairwise,0.641667,0.561905,0.685714,0.685714
