Step 1: install dependencies

In [1]:
!pip3 install tira snorkel wows-eval textdistance rank-bm25

Collecting tira
  Downloading tira-0.0.144-py3-none-any.whl.metadata (4.6 kB)
Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting wows-eval
  Downloading wows_eval-0.0.2-py3-none-any.whl.metadata (1.3 kB)
Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting docker==7.*,>=7.1.0 (from tira)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Collecting auto-ir-metadata==0.0.0 (from wows-eval)
  Downloading auto_ir_metadata-0.0.0-py3-none-any.whl.metadata (806 bytes)
Collecting trectools (from wows-eval)
  Downloading trectools-0.0.50.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tira-measure==0.0.1 (from auto-ir-metadata==0.0.0->wows-eval)
  Downloading tira_meas

Step 2: Load the data

In [2]:
from tira.rest_api_client import Client
tira = Client()
DATASET_ID = 'wows-eval/pointwise-smoke-test-20250128-training'
input_data = tira.pd.inputs(DATASET_ID)
input_data

Download: 5.21kiB [00:00, 3.53MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/wows-eval/pointwise-smoke-test-20250128-training/





Unnamed: 0,id,query,unknown
0,32d23068-7440-4891-9958-42325f98a604,who sings monk theme song,This is a reference to the minor controversy t...
1,cde83146-ac3e-4bc5-a959-f2006ac7b8de,who sings monk theme song,"Walker, Texas Ranger. Chuck Norris thought “Ey..."
2,cb7b20d0-def6-46c4-ae44-a78f00b47735,who sings monk theme song,"However, as Brave 's soundtrack reveals, the m..."
3,4a68c86f-64ea-4293-bda5-7a0130c13864,who sings monk theme song,"Singing elegant, melancholic songs in a glamor..."
4,3e550de5-a104-44ae-bc3f-7ab556cc1018,who sings monk theme song,One of Monk's most important contributions to ...
5,e4b67a23-92ce-478a-887d-ca8170361145,who sings monk theme song,What is the music in chapter 33 of House of Ca...
6,66fd5e47-1928-45fb-812f-9b32e3f87c65,who sings monk theme song,Randy Newman (album) Randy Newman is the debut...
7,1fc0289f-2ac3-49ac-a612-535d2e43a378,what is the most popular food in switzerland,All cuisines have their most popular dishes. E...
8,638bc1bc-c26d-46ae-b401-f8b0b4794d6f,what is the most popular food in switzerland,"Originating in Switzerland, gruyere gruyère ch..."
9,f474a0a7-3422-4fc2-aeb2-4af24d038622,what is the most popular food in switzerland,St. Nicholas Day Foods! My dad is from Switzer...


Step 3: load all the libraries

In [7]:
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel
from tira.rest_api_client import Client
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from wows_eval import evaluate as wows_evaluate
from tira_measure import Environment
import numpy as np
import textdistance
import pandas as pd
tira = Client()
vectorizer = TfidfVectorizer()
import re

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Step 4: define global variables

In [88]:
# Global Variables
min_bm25 = None
max_bm25 = None
bm25 = None


Step 5: do a min-max normalisation for BM25, functions that tokenise and precompute n-grams

In [89]:

def tokenize(text):
    """Tokenizes text into lowercase words without punctuation."""
    return re.findall(r'\w+', text.lower())

def compute_global_bm25_stats(df):
    """Computes min, max, and avg BM25 scores for normalization."""
    global min_bm25, max_bm25, avg_bm25, bm25

    tokenized_corpus = [tokenize(doc) for doc in df["unknown"]]
    bm25 = BM25Okapi(tokenized_corpus)

    # Compute BM25 scores for all queries in dataset
    all_scores = []
    for query in df["query"]:
        score = max(bm25.get_scores(tokenize(query)))
        all_scores.append(score)

    min_bm25 = min(all_scores)
    max_bm25 = max(all_scores)

def normalize_bm25(bm25_score):
    """Normalizes BM25 using min-max scaling."""
    if min_bm25 == max_bm25:
        return 0.0
    return (bm25_score - min_bm25) / (max_bm25 - min_bm25)


def tokenize_and_lemmatize(text):
    """Tokenizes text into lowercase words without punctuation and lemmatizes words."""
    tokens = re.findall(r'\w+', text.lower())  # Tokenization
    return {lemmatizer.lemmatize(word) for word in tokens} - stop_words

def get_ngrams(text, n=2):
    """Generate n-grams from text"""
    tokens = list(tokenize_and_lemmatize(text))
    return set(zip(*[tokens[i:] for i in range(n)])) if len(tokens) >= n else set()

Step 6: Create Snorkel labelling functions

In [90]:

@labeling_function()
def bm25_score_prob(x):
    """Computes BM25 score and normalizes it to a probability."""
    global bm25
    bm25_score = max(bm25.get_scores(tokenize(x.query)))
    return normalize_bm25(bm25_score)


@labeling_function()
def boolean_match_prob(x):
    """Returns 1 if at least one query term appears in the document."""
    return 1.0 if set(tokenize(x.query)) & set(tokenize(x.unknown)) else 0.0


@labeling_function()
def word_level_levenshtein_prob(x):
    """Computes a boosted word-level Levenshtein similarity"""
    query_words = tokenize_and_lemmatize(x.query)
    doc_words = tokenize_and_lemmatize(x.unknown)

    if not query_words or not doc_words:
        return 0.0  # Avoid division by zero

    matched_scores = []
    for q in query_words:
        similarities = [textdistance.levenshtein.normalized_similarity(q, d) for d in doc_words]
        best_match = max(similarities) if similarities else 0.0
        matched_scores.append(best_match)

    # Compute final score
    avg_similarity = sum(matched_scores) / len(matched_scores) if matched_scores else 0.0
    boosted_score = avg_similarity * (1.5 if avg_similarity > 0.8 else 1.0)

    return min(boosted_score, 1.0) #Scores in range [0,1]

@labeling_function()
def relaxed_bigram_match_prob(x):
    """Computes the proportion of query bigrams where at least one word appears in the document."""
    query_bigrams = get_ngrams(x.query, n=2)
    doc_bigrams = get_ngrams(x.unknown, n=2)

    if not query_bigrams:
        return 0.0  # Avoid division by zero

    matched_bigrams = set()
    unmatched_bigrams = set()

    for q_bigram in query_bigrams:
        # If either word in the bigram appears in the document, count it as a partial match
        if any(word in x.unknown for word in q_bigram):
            matched_bigrams.add(q_bigram)
        else:
            unmatched_bigrams.add(q_bigram)
    return len(matched_bigrams) / len(query_bigrams)  # Fraction of bigrams with at least one match



Step 7: Train Snorkel

In [91]:

compute_global_bm25_stats(input_data)

lfs = [bm25_score_prob, boolean_match_prob, relaxed_bigram_match_prob, word_level_levenshtein_prob]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(input_data)

print("Unique Values in L_train:", np.unique(L_train))

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, log_freq=10)

train_prob = label_model.predict_proba(L_train)
train_prob_relevant = train_prob[:, 1]

input_data["probability_relevant"] = train_prob_relevant
print(input_data)

print(input_data["probability_relevant"].describe())



100%|██████████| 13/13 [00:00<00:00, 516.65it/s]


Unique Values in L_train: [0 1]


100%|██████████| 100/100 [00:00<00:00, 1161.99epoch/s]

                                      id  \
0   32d23068-7440-4891-9958-42325f98a604   
1   cde83146-ac3e-4bc5-a959-f2006ac7b8de   
2   cb7b20d0-def6-46c4-ae44-a78f00b47735   
3   4a68c86f-64ea-4293-bda5-7a0130c13864   
4   3e550de5-a104-44ae-bc3f-7ab556cc1018   
5   e4b67a23-92ce-478a-887d-ca8170361145   
6   66fd5e47-1928-45fb-812f-9b32e3f87c65   
7   1fc0289f-2ac3-49ac-a612-535d2e43a378   
8   638bc1bc-c26d-46ae-b401-f8b0b4794d6f   
9   f474a0a7-3422-4fc2-aeb2-4af24d038622   
10  9919aa1d-97b9-4e3f-ac73-63c1109d2f30   
11  5e75f3c7-6494-47f0-9656-1738b00aadc7   
12  27b5fa4a-a82e-4b66-9e35-6d4db703b808   

                                           query  \
0                      who sings monk theme song   
1                      who sings monk theme song   
2                      who sings monk theme song   
3                      who sings monk theme song   
4                      who sings monk theme song   
5                      who sings monk theme song   
6                  




Step 8: Check the results for different functions

In [92]:
# Convert L_train to a DataFrame with LF names as columns
df_L = pd.DataFrame(L_train, columns=[lf.name for lf in lfs])
print("Sample Labeling Function Outputs:\n", df_L.head())

lf_stats = df_L.describe().T  # Transpose so LFs are rows
print("\nLabeling Function Statistics:\n", lf_stats)

Sample Labeling Function Outputs:
    bm25_score_prob  boolean_match_prob  relaxed_bigram_match_prob  \
0                1                   1                          0   
1                1                   1                          0   
2                1                   0                          0   
3                1                   0                          0   
4                1                   1                          0   

   word_level_levenshtein_prob  
0                            0  
1                            0  
2                            0  
3                            0  
4                            0  

Labeling Function Statistics:
                              count      mean       std  min  25%  50%  75%  \
bm25_score_prob               13.0  0.538462  0.518875  0.0  0.0  1.0  1.0   
boolean_match_prob            13.0  0.692308  0.480384  0.0  0.0  1.0  1.0   
relaxed_bigram_match_prob     13.0  0.000000  0.000000  0.0  0.0  0.0  0.0   
word_lev

Step 9: Use wows_evaluate

In [93]:
with Environment().measure() as tracked_experiment:
    # now we do the "computation"
    predictions = input_data.copy()

In [94]:
wows_evaluate(
    predictions,
    DATASET_ID,
    environment=tracked_experiment,
    upload=True,
    system_name='snorkel-pointwise',
    system_description='A snorkel pointwise approach'
)

Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/f2905141-d775-4bd7-859d-b89fef00da24


Unnamed: 0,system,tau_ap,kendall,spearman,pearson
0,snorkel-pointwise,0.522222,0.6,0.632143,0.632143
