Step 1: install dependencies

In [1]:
!pip3 install tira snorkel wows-eval textdistance rank-bm25

Collecting tira
  Downloading tira-0.0.147-py3-none-any.whl.metadata (4.7 kB)
Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting wows-eval
  Downloading wows_eval-0.0.2-py3-none-any.whl.metadata (1.3 kB)
Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting docker==7.*,>=7.1.0 (from tira)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Collecting auto-ir-metadata==0.0.0 (from wows-eval)
  Downloading auto_ir_metadata-0.0.0-py3-none-any.whl.metadata (806 bytes)
Collecting trectools (from wows-eval)
  Downloading trectools-0.0.50.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tira-measure==0.0.1 (from auto-ir-metadata==0.0.0->wows-eval)
  Downloading tira_meas

In [2]:
# Create a TIRA REST client and load the inputs of the smoke-test training dataset.
from tira.rest_api_client import Client
tira = Client()
# Dataset identifier on TIRA; it is passed again to the evaluation call at the end of the notebook.
DATASET_ID = 'wows-eval/pointwise-smoke-test-20250128-training'
# Loads the dataset inputs as a table; the rendered output shows the columns id, query and unknown.
input_data = tira.pd.inputs(DATASET_ID)
# Bare last expression so the notebook renders the table.
input_data

Download: 5.21kiB [00:00, 15.2MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/wows-eval/pointwise-smoke-test-20250128-training/





Unnamed: 0,id,query,unknown
0,32d23068-7440-4891-9958-42325f98a604,who sings monk theme song,This is a reference to the minor controversy t...
1,cde83146-ac3e-4bc5-a959-f2006ac7b8de,who sings monk theme song,"Walker, Texas Ranger. Chuck Norris thought “Ey..."
2,cb7b20d0-def6-46c4-ae44-a78f00b47735,who sings monk theme song,"However, as Brave 's soundtrack reveals, the m..."
3,4a68c86f-64ea-4293-bda5-7a0130c13864,who sings monk theme song,"Singing elegant, melancholic songs in a glamor..."
4,3e550de5-a104-44ae-bc3f-7ab556cc1018,who sings monk theme song,One of Monk's most important contributions to ...
5,e4b67a23-92ce-478a-887d-ca8170361145,who sings monk theme song,What is the music in chapter 33 of House of Ca...
6,66fd5e47-1928-45fb-812f-9b32e3f87c65,who sings monk theme song,Randy Newman (album) Randy Newman is the debut...
7,1fc0289f-2ac3-49ac-a612-535d2e43a378,what is the most popular food in switzerland,All cuisines have their most popular dishes. E...
8,638bc1bc-c26d-46ae-b401-f8b0b4794d6f,what is the most popular food in switzerland,"Originating in Switzerland, gruyere gruyère ch..."
9,f474a0a7-3422-4fc2-aeb2-4af24d038622,what is the most popular food in switzerland,St. Nicholas Day Foods! My dad is from Switzer...


Step 2: load all the libraries

In [3]:
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model.label_model import LabelModel
from tira.rest_api_client import Client
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from wows_eval import evaluate as wows_evaluate
from tira_measure import Environment
import numpy as np
import textdistance
import pandas as pd
# NOTE(review): `tira` was already created in the first code cell; this rebinds it to a new Client.
tira = Client()
# NOTE(review): `vectorizer` is not referenced anywhere else in the visible notebook
# (the TF-IDF labelling function uses its own `tfidf_vectorizer`) - candidate for removal.
vectorizer = TfidfVectorizer()
import re

In [4]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# Download the NLTK stopword corpus (this call runs on every execution of the cell)
nltk.download("stopwords")
# English stopwords as a set, so they can be removed from token sets with a set difference
stop_words = set(stopwords.words("english"))
# Download the WordNet corpus used by the `wordnet` reader imported above
# (presumably also required by WordNetLemmatizer - verify against the NLTK docs)
nltk.download("wordnet")
# Single shared lemmatizer instance used by the helper and labelling functions below
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Step 3: define global variables

In [5]:
# Global variables shared by the BM25 helpers below.
# They start as None and are assigned by compute_global_bm25_stats().
min_bm25 = None  # lower bound used for min-max scaling of BM25 scores
max_bm25 = None  # upper bound used for min-max scaling of BM25 scores
bm25 = None      # BM25Okapi index built over the tokenised documents


Step 4: define helper functions that tokenise text, min-max normalise BM25 scores, and build n-grams

In [6]:

def tokenize(text):
    """Lowercase ``text`` and return its runs of word characters, in order.

    Punctuation and whitespace act purely as separators and are dropped, so an
    empty or punctuation-only string yields an empty list.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

def compute_global_bm25_stats(df):
    """Build the BM25 index over the documents and record the score range for normalisation.

    Side effects: assigns the module-level ``bm25`` (a ``BM25Okapi`` index over the
    tokenised ``df["unknown"]`` column) and ``min_bm25`` / ``max_bm25`` (the smallest
    and largest per-query best score). No average is computed.

    Args:
        df: table with a ``"query"`` column and an ``"unknown"`` (document text) column.

    Returns:
        None.
    """
    global min_bm25, max_bm25, bm25

    tokenized_corpus = [tokenize(doc) for doc in df["unknown"]]
    bm25 = BM25Okapi(tokenized_corpus)

    # Rows that share a query yield the same best score, so each distinct query is
    # scored once; the resulting minimum and maximum are unchanged.
    best_scores = [
        max(bm25.get_scores(tokenize(query)))
        for query in df["query"].unique()
    ]

    min_bm25 = min(best_scores)
    max_bm25 = max(best_scores)

def normalize_bm25(bm25_score):
    """Min-max scale ``bm25_score`` using the module-level ``min_bm25`` and ``max_bm25``.

    Returns 0.0 when the two bounds compare equal, which avoids dividing by a
    zero-width range.
    """
    bounds_coincide = (min_bm25 == max_bm25)
    if not bounds_coincide:
        span = max_bm25 - min_bm25
        return (bm25_score - min_bm25) / span
    return 0.0


def tokenize_and_lemmatize(text):
    """Return the set of lemmatized, lowercased word tokens of ``text`` without stopwords.

    The result is a set, so token order and duplicates are not preserved.
    """
    lemmas = set()
    for token in re.findall(r'\w+', text.lower()):
        lemmas.add(lemmatizer.lemmatize(token))
    # Stopwords are removed after lemmatization, via set difference.
    return lemmas - stop_words

def get_ngrams(text, n=2):
    """Return the set of word n-grams of ``text`` as tuples of lemmatized tokens.

    Tokens are lowercased, lemmatized and filtered for stopwords while keeping
    their order of appearance, so every n-gram consists of tokens that are adjacent
    in the filtered text. Building the n-grams from a set of tokens instead would
    lose that order and make the result depend on set iteration order.

    Args:
        text: input string.
        n: n-gram length (default 2).

    Returns:
        A set of ``n``-tuples; empty when fewer than ``n`` tokens remain.
    """
    lemmas = (lemmatizer.lemmatize(word) for word in re.findall(r'\w+', text.lower()))
    tokens = [lemma for lemma in lemmas if lemma not in stop_words]
    if len(tokens) < n:
        return set()
    # zip over n shifted views of the token list yields the sliding windows.
    return set(zip(*(tokens[i:] for i in range(n))))

Step 5: Create Snorkel labelling functions

In [7]:

@labeling_function()
def bm25_score_prob(x):
    """Return the min-max normalised best BM25 score for the query of row ``x``.

    Reads the module-level ``bm25`` index, so compute_global_bm25_stats() must
    have been called first.

    NOTE(review): only ``x.query`` is read and the maximum over everything
    ``bm25.get_scores`` returns is used, so rows sharing a query receive the same
    value regardless of ``x.unknown`` - confirm this is intended.
    """
    query_terms = tokenize(x.query)
    best_score = max(bm25.get_scores(query_terms))
    return normalize_bm25(best_score)


@labeling_function()
def boolean_match_prob(x):
    """Return 1.0 when the query and the document share at least one token, else 0.0."""
    query_terms = set(tokenize(x.query))
    doc_terms = set(tokenize(x.unknown))
    if query_terms.isdisjoint(doc_terms):
        return 0.0
    return 1.0


@labeling_function()
def word_level_levenshtein_prob(x):
    """Score how closely each query word is matched by some document word.

    For every lemmatized query word the best normalised Levenshtein similarity
    against any lemmatized document word is taken; the mean of those best values
    is multiplied by 1.5 when it exceeds 0.8 and finally capped at 1.0.
    """
    query_words = tokenize_and_lemmatize(x.query)
    doc_words = tokenize_and_lemmatize(x.unknown)

    # Guard clause: with either side empty there is nothing to compare.
    if not (query_words and doc_words):
        return 0.0

    similarity = textdistance.levenshtein.normalized_similarity
    best_per_query_word = [
        max(similarity(q_word, d_word) for d_word in doc_words)
        for q_word in query_words
    ]

    mean_best = sum(best_per_query_word) / len(best_per_query_word)
    boost = 1.5 if mean_best > 0.8 else 1.0

    # Cap so the result stays in [0, 1].
    return min(mean_best * boost, 1.0)

@labeling_function()
def relaxed_bigram_match_prob(x):
    """Return the fraction of query bigrams with at least one word present in the document.

    Query bigrams come from get_ngrams(), so their words are lowercased and
    lemmatized. They are compared against the document's lemmatized token set
    rather than the raw document string. A substring test on the raw text is
    case-sensitive and also matches fragments inside longer words.

    Returns:
        A float in [0, 1]; 0.0 when the query yields no bigrams.
    """
    query_bigrams = get_ngrams(x.query, n=2)
    if not query_bigrams:
        return 0.0  # Avoid division by zero

    doc_words = tokenize_and_lemmatize(x.unknown)

    # A bigram counts as a (partial) match when either of its words occurs in the document.
    matched = sum(1 for bigram in query_bigrams if not doc_words.isdisjoint(bigram))
    return matched / len(query_bigrams)


@labeling_function()
def percent_matching_query_words(x):
    """Return the fraction of non-stopword query words that also occur in the document.

    Words are extracted with ``re.findall(r'\\w+', ...)``. Splitting on ``\\W+``
    instead yields an empty-string token whenever the text starts or ends with
    punctuation, which would be counted as a query word and could even "match"
    an empty token from the document.

    Returns:
        A float in [0, 1]; 0.0 when the query or document is empty/missing or
        when the query contains only stopwords.
    """
    if not x.query or not x.unknown:
        return 0.0  # Handle missing data

    query_words = set(re.findall(r'\w+', x.query.lower())) - stop_words
    doc_words = set(re.findall(r'\w+', x.unknown.lower())) - stop_words

    if not query_words:  # Avoid division by zero
        return 0.0

    # Fraction of query words found in the document; already in [0, 1].
    return len(query_words & doc_words) / len(query_words)


# Load the sentence-transformers checkpoint "all-mpnet-base-v2" once at module level so the
# labelling function below can reuse it for every row.
# Author's note: chosen over MiniLM for ranking quality (not verified in this notebook).
bert_model = SentenceTransformer("all-mpnet-base-v2")

@labeling_function()
def bert_cosine_similarity(x):
    """Return a boosted, [0, 1]-capped relevance score from embedding cosine similarity.

    The query and document are embedded with the module-level ``bert_model``.
    Their cosine similarity is shifted from [-1, 1] to [0, 1], multiplied by 1.5
    above 0.8 or by 1.2 above 0.5, and finally capped at 1.0. Missing text or a
    zero-norm embedding yields 0.0.
    """
    if not (x.query and x.unknown):
        return 0.0  # Missing query or document

    query_vec = bert_model.encode(x.query, convert_to_numpy=True)
    doc_vec = bert_model.encode(x.unknown, convert_to_numpy=True)

    query_norm = np.linalg.norm(query_vec)
    doc_norm = np.linalg.norm(doc_vec)
    if query_norm == 0 or doc_norm == 0:
        return 0.0  # Cosine similarity is undefined for a zero vector

    # The small epsilon keeps the denominator strictly positive.
    cosine_sim = np.dot(query_vec, doc_vec) / (query_norm * doc_norm + 1e-9)

    # Shift from [-1, 1] to [0, 1].
    probability = (cosine_sim + 1) / 2

    # Apply the first boost whose threshold is exceeded (high, then medium).
    for threshold, factor in ((0.8, 1.5), (0.5, 1.2)):
        if probability > threshold:
            probability *= factor
            break

    return min(probability, 1.0)



@labeling_function()
def query_term_coverage(x):
    """Return the boosted share of query terms covered by the document.

    A lemmatized, non-stopword query term counts as covered when it, or any
    WordNet lemma name of one of its synsets, appears among the lemmatized
    document words. The share is multiplied by 1.8 above 0.7, by 1.5 above 0.5,
    or by 1.2 above 0.2, and then capped at 1.0.
    """
    if not (x.query and x.unknown):
        return 0.0  # Missing query or document

    word_pattern = r'\b\w+\b'
    query_words = {
        lemmatizer.lemmatize(word) for word in re.findall(word_pattern, x.query.lower())
    } - stop_words  # stopwords are dropped on the query side only
    doc_words = {
        lemmatizer.lemmatize(word) for word in re.findall(word_pattern, x.unknown.lower())
    }

    if not query_words:
        return 0.0  # Nothing left to cover; avoids division by zero

    def is_covered(word):
        """True on an exact match or when any WordNet lemma name of ``word`` is in the document."""
        if word in doc_words:
            return True
        synonyms = set()
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                synonyms.add(lemma.name())
        return not synonyms.isdisjoint(doc_words)

    covered_terms = len([word for word in query_words if is_covered(word)])
    coverage = covered_terms / len(query_words)

    # Apply the first boost whose threshold is exceeded (strong, moderate, mild).
    for threshold, factor in ((0.7, 1.8), (0.5, 1.5), (0.2, 1.2)):
        if coverage > threshold:
            coverage *= factor
            break

    return min(coverage, 1.0)


# Module-level TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
# tfidf_cosine_similarity() refits it on each (query, document) pair it scores.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1)

@labeling_function()
def tfidf_cosine_similarity(x):
    """Return a boosted, [0, 1]-capped TF-IDF cosine similarity between query and document.

    The module-level ``tfidf_vectorizer`` is refitted on just the two texts of the
    row. The similarity is clamped at 0, multiplied by 1.8 above 0.7, by 1.5 above
    0.5, or by 1.2 above 0.2, and then capped at 1.0.
    """
    if not (x.query and x.unknown):
        return 0.0  # Missing query or document

    # Row 0 is the query, row 1 the document.
    tfidf_matrix = tfidf_vectorizer.fit_transform([x.query, x.unknown])

    n_rows = tfidf_matrix.shape[0]
    if n_rows < 2 or tfidf_matrix.nnz == 0:
        return 0.0  # No usable vectors to compare

    query_row, doc_row = tfidf_matrix[0], tfidf_matrix[1]
    tfidf_sim = cosine_similarity(query_row, doc_row)[0, 0]

    tfidf_prob = max(tfidf_sim, 0.0)

    # Apply the first boost whose threshold is exceeded (strong, moderate, mild).
    for threshold, factor in ((0.7, 1.8), (0.5, 1.5), (0.2, 1.2)):
        if tfidf_prob > threshold:
            tfidf_prob *= factor
            break

    return min(tfidf_prob, 1.0)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Step 7: Train Snorkel

In [8]:

# Build the BM25 index and score range that bm25_score_prob reads from module globals.
compute_global_bm25_stats(input_data)

# Labelling functions whose outputs form the columns of the label matrix.
lfs = [bm25_score_prob, boolean_match_prob, word_level_levenshtein_prob, tfidf_cosine_similarity, bert_cosine_similarity]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(input_data)

# NOTE(review): the recorded output of this cell shows only the values 0 and 1 in L_train,
# although several labelling functions return fractional scores in [0, 1] - confirm that
# those scores are not being truncated to integers when the label matrix is built.
print("Unique Values in L_train:", np.unique(L_train))

label_model = LabelModel(cardinality=2, verbose=True)
# Fix the training seed so that re-running the notebook reproduces the same label model.
label_model.fit(L_train, n_epochs=100, log_freq=10, seed=42)

# Column 1 of the predicted probabilities is the probability of the "relevant" class.
train_prob = label_model.predict_proba(L_train)
train_prob_relevant = train_prob[:, 1]

# The evaluation cell further down copies input_data, so the column is added in place here.
input_data["probability_relevant"] = train_prob_relevant
print(input_data)

print(input_data["probability_relevant"].describe())



100%|██████████| 13/13 [00:08<00:00,  1.53it/s]


Unique Values in L_train: [0 1]


100%|██████████| 100/100 [00:00<00:00, 743.79epoch/s]

                                      id  \
0   32d23068-7440-4891-9958-42325f98a604   
1   cde83146-ac3e-4bc5-a959-f2006ac7b8de   
2   cb7b20d0-def6-46c4-ae44-a78f00b47735   
3   4a68c86f-64ea-4293-bda5-7a0130c13864   
4   3e550de5-a104-44ae-bc3f-7ab556cc1018   
5   e4b67a23-92ce-478a-887d-ca8170361145   
6   66fd5e47-1928-45fb-812f-9b32e3f87c65   
7   1fc0289f-2ac3-49ac-a612-535d2e43a378   
8   638bc1bc-c26d-46ae-b401-f8b0b4794d6f   
9   f474a0a7-3422-4fc2-aeb2-4af24d038622   
10  9919aa1d-97b9-4e3f-ac73-63c1109d2f30   
11  5e75f3c7-6494-47f0-9656-1738b00aadc7   
12  27b5fa4a-a82e-4b66-9e35-6d4db703b808   

                                           query  \
0                      who sings monk theme song   
1                      who sings monk theme song   
2                      who sings monk theme song   
3                      who sings monk theme song   
4                      who sings monk theme song   
5                      who sings monk theme song   
6                  




Step 8: Check the results for different functions

In [9]:
# Wrap the label matrix in a DataFrame with one column per labelling function (named by lf.name)
df_L = pd.DataFrame(L_train, columns=[lf.name for lf in lfs])
# To inspect the raw labelling-function outputs, display df_L.head() in its own cell.

# Per-function summary statistics, transposed so each labelling function is a row
lf_stats = df_L.describe().T  # Transpose so LFs are rows
# To inspect the statistics, display lf_stats in its own cell.

Step 9: Use wows_evaluate

In [10]:
# Run the "computation" inside a tira_measure context; the resulting object is passed to
# wows_evaluate as `environment` in the next cell.
# NOTE(review): only the DataFrame copy below runs inside the measured block; the labelling
# functions and label-model training were executed in an earlier cell, outside of it.
with Environment().measure() as tracked_experiment:
    # now we do the "computation"
    predictions = input_data.copy()

In [11]:
# Evaluate the predictions against the dataset identified by DATASET_ID.
# With upload=True the run is also submitted to TIRA (the recorded output prints a claim link).
# The rendered result table reports tau_ap, kendall, spearman and pearson for the system.
wows_evaluate(
    predictions,
    DATASET_ID,
    environment=tracked_experiment,
    upload=True,
    system_name='snorkel-pointwise',
    system_description='A snorkel pointwise approach'
)

Download: 6.13kiB [00:00, 8.03MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/wows-eval/pointwise-smoke-test-20250128-training/
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/7d32c6e8-0bcc-4872-865d-36c591762950


Unnamed: 0,system,tau_ap,kendall,spearman,pearson
0,snorkel-pointwise,0.605556,0.6,0.685714,0.685714
