In [13]:
import re
import wandb
from datasets import load_dataset
from snorkel.labeling import labeling_function, LFAnalysis, PandasLFApplier
from snorkel.labeling.model import MajorityLabelVoter
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import sys

# Snorkel Label Mapping
# 0: ABSTAIN (No label)
# 1-4: The four entity types (simplified)
ABSTAIN = 0
ORG = 1
MISC = 2
PER = 3
LOC = 4

# Data Loading (Using the stable Parquet revision)
dataset_name = "conll2003"
parquet_revision = "refs/convert/parquet"
print("Loading dataset...")
raw_datasets = load_dataset(dataset_name, revision=parquet_revision)
TAG_NAMES = raw_datasets['train'].features['ner_tags'].feature.names
print(f"Dataset loaded successfully: {raw_datasets}")

# Helper Function for Snorkel Data Format
def prepare_snorkel_data(data_split):
    records = []
    for example in data_split:
        tokens = example['tokens']
        ner_tags = example['ner_tags']
        for token, tag_id in zip(tokens, ner_tags):
            tag_name = TAG_NAMES[tag_id]
            target_label = ABSTAIN 
            if 'PER' in tag_name:
                target_label = PER
            elif 'LOC' in tag_name:
                target_label = LOC
            elif 'ORG' in tag_name:
                target_label = ORG
            elif 'MISC' in tag_name:
                target_label = MISC
            records.append({'token': token, 'target': target_label, 'token_text': token})
    return pd.DataFrame(records)

# Prepare the dataframes
df_dev = prepare_snorkel_data(raw_datasets['validation'])
df_train = prepare_snorkel_data(raw_datasets['train'])
print(f"Prepared {len(df_dev)} dev tokens and {len(df_train)} train tokens.")

Loading dataset...
Dataset loaded successfully: DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Prepared 51362 dev tokens and 203621 train tokens.


In [14]:
# 1. Initialize Weights & Biases Project
wandb.init(project="Q1-weak-supervision-ner")
print("W&B project initialized for Q1, Q2, and Q3 metrics.")

# 2. Calculate and Log Dataset Statistics

# Calculate Sample Counts

dataset_stats = {
    'train_samples': len(raw_datasets['train']),
    'validation_samples': len(raw_datasets['validation']),
    'test_samples': len(raw_datasets['test']),
}
dataset_stats['total_samples'] = sum(dataset_stats.values())
print("\n[Q1] Dataset Sample Counts:", dataset_stats)

# Calculate Entity Distribution
entity_counts = {}
for split in raw_datasets:
    for tags in raw_datasets[split]['ner_tags']:
        for tag_id in tags:
            tag_name = TAG_NAMES[tag_id]
            if tag_name.startswith(('B-', 'I-')):
                entity_type = tag_name.split('-')[-1]
                entity_counts[entity_type] = entity_counts.get(entity_type, 0) + 1

entity_df = pd.DataFrame(
    list(entity_counts.items()),
    columns=['Entity Type', 'Count']
).sort_values(by='Count', ascending=False)
print("[Q1] Entity Distribution:")
print(entity_df)

# Log Statistics to W&B
wandb.run.summary.update(dataset_stats)
print("Logged dataset sample counts to W&B summary.")
entity_table = wandb.Table(dataframe=entity_df)
wandb.log({"Q1/Entity Distribution": entity_table})
print("Logged entity distribution table to W&B run.")

W&B project initialized for Q1, Q2, and Q3 metrics.

[Q1] Dataset Sample Counts: {'train_samples': 14041, 'validation_samples': 3250, 'test_samples': 3453, 'total_samples': 20744}
[Q1] Entity Distribution:
  Entity Type  Count
2         PER  17050
0         ORG  14613
3         LOC  12316
1        MISC   6779
Logged dataset sample counts to W&B summary.
Logged entity distribution table to W&B run.


In [15]:
# 1. Implement Labeling Functions

# a. LF for Years (MISC)
YEAR_REGEX = re.compile(r'\b(19|20)\d{2}\b')
@labeling_function()
def lf_year_as_misc(x):
    """Labels four-digit numbers (1900-2099) as MISC."""
    if YEAR_REGEX.match(x.token_text):
        return MISC
    return ABSTAIN
print("[Q2a] LF for Years/MISC defined.")

# b. LF for Organization Suffixes (ORG)
ORG_SUFFIXES = ['Inc.', 'Corp.', 'Ltd.', 'Group', 'Co.', 'S.A.', 'A.G.', 'B.V.']
@labeling_function()
def lf_org_suffix(x):
    """Labels tokens ending with common organizational suffixes as ORG."""
    if x.token_text.strip().endswith(tuple(ORG_SUFFIXES)):
        return ORG
    return ABSTAIN
print("[Q2b] LF for Organization Suffixes/ORG defined.")

# 2. Apply LFs and Prepare for Analysis
lfs = [lf_year_as_misc, lf_org_suffix]
applier = PandasLFApplier(lfs=lfs)
L_dev = applier.apply(df=df_dev)
Y_dev = df_dev.target.values

# 3. Calculate and Log LF Metrics (Robust Manual Fix for API issues)
print("[Q2] Calculating and Logging LF Metrics (Coverage & Accuracy)")

for i, lf in enumerate(lfs):
    # Determine the LF name safely
    lf_name_str = lf.name if hasattr(lf, 'name') else lf.f.__name__
    
    L_i = L_dev[:, i]
    
    # Coverage: tokens where LF did NOT abstain
    coverage = (L_i != ABSTAIN).sum() / len(L_i)
    
    # Accuracy: tokens where LF labeled correctly (excluding abstains)
    covered_indices = L_i != ABSTAIN
    Y_true_covered = Y_dev[covered_indices]
    L_i_covered = L_i[covered_indices]
    
    accuracy = accuracy_score(Y_true_covered, L_i_covered) if len(Y_true_covered) > 0 else 0.0

    # Log to W&B
    wandb.log({
        f"Q2/LF_Metrics/{lf_name_str}_Coverage": coverage,
        f"Q2/LF_Metrics/{lf_name_str}_Accuracy": accuracy
    })
    
    print(f"  {lf_name_str}: Coverage={coverage:.4f}, Accuracy={accuracy:.4f} (Logged to W&B)")

print("[Q2] Successfully logged coverage and accuracy for both LFs.")


[Q2a] LF for Years/MISC defined.
[Q2b] LF for Organization Suffixes/ORG defined.


100%|█████████████████████████████████████████████████████████████████████████| 51362/51362 [00:01<00:00, 40612.17it/s]


[Q2] Calculating and Logging LF Metrics (Coverage & Accuracy)
  lf_year_as_misc: Coverage=0.0073, Accuracy=0.0187 (Logged to W&B)
  lf_org_suffix: Coverage=0.0004, Accuracy=0.8947 (Logged to W&B)
[Q2] Successfully logged coverage and accuracy for both LFs.


In [16]:
# 1. Apply LFs to the Training Set
L_train = applier.apply(df=df_train)
print("[Q3] Applied LFs to the training set.")

# 2. Implement Label Aggregation (Majority Label Voter)
voter = MajorityLabelVoter(cardinality=5) 
Y_train_pred = voter.predict(L=L_train)
print(f"[Q3] Generated {len(Y_train_pred)} aggregated weak labels for the training set.")

# 3. Evaluate and Log Aggregation Metrics
Y_train_true = df_train.target.values

# Coverage
coverage = (Y_train_pred != ABSTAIN).sum() / len(Y_train_pred)

# Accuracy (on covered tokens)
covered_indices = Y_train_pred != ABSTAIN
Y_pred_covered = Y_train_pred[covered_indices]
Y_true_covered = Y_train_true[covered_indices]
accuracy = accuracy_score(Y_true_covered, Y_pred_covered)

print(f"[Q3] Majority Label Voter Results (Training Set)")
print(f"Coverage (Labeled Tokens): {coverage:.4f}")
print(f"Accuracy (on Covered Tokens): {accuracy:.4f}")

# Log Aggregation Metrics to W&B
wandb.log({
    "Q3/Voter_Coverage": coverage,
    "Q3/Voter_Accuracy": accuracy
})

# 4. Finish the W&B run (must be done only once at the end)
wandb.finish()
print("W&B run finished.")

100%|███████████████████████████████████████████████████████████████████████| 203621/203621 [00:05<00:00, 34753.52it/s]


[Q3] Applied LFs to the training set.


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[Q3] Generated 203621 aggregated weak labels for the training set.
[Q3] Majority Label Voter Results (Training Set)
Coverage (Labeled Tokens): 0.0076
Accuracy (on Covered Tokens): 0.0000


0,1
Q2/LF_Metrics/lf_org_suffix_Accuracy,▁
Q2/LF_Metrics/lf_org_suffix_Coverage,▁
Q2/LF_Metrics/lf_year_as_misc_Accuracy,▁
Q2/LF_Metrics/lf_year_as_misc_Coverage,▁
Q3/Voter_Accuracy,▁
Q3/Voter_Coverage,▁

0,1
Q2/LF_Metrics/lf_org_suffix_Accuracy,0.89474
Q2/LF_Metrics/lf_org_suffix_Coverage,0.00037
Q2/LF_Metrics/lf_year_as_misc_Accuracy,0.01867
Q2/LF_Metrics/lf_year_as_misc_Coverage,0.0073
Q3/Voter_Accuracy,0.0
Q3/Voter_Coverage,0.00762
test_samples,3453.0
total_samples,20744.0
train_samples,14041.0
validation_samples,3250.0


W&B run finished.
