# Comment Extraction Tests: Rule-based, NER, and Hybrid

This notebook tests and compares algorithms that extract structured match events from free-text comments (e.g., goals, cards, substitutions). It includes:
- Rule-based extraction using regex and spaCy Matcher
- Model-based extraction using spaCy NER + gazetteers
- A hybrid pipeline with conflict resolution
- Evaluation (precision/recall/F1), error analysis, and performance benchmarking


In [1]:
# 1) Configure Environment and Imports
import sys, os, json, re, math, time, random, pathlib, subprocess
from time import perf_counter

import numpy as np
import pandas as pd
from tqdm import tqdm

# Optional: spaCy & sklearn (will install if missing)
try:
    import spacy
    from spacy.matcher import Matcher
    from spacy.pipeline import EntityRuler
except ImportError:
    spacy = None
    Matcher = None
    EntityRuler = None

try:
    from sklearn.metrics import classification_report
except ImportError:
    classification_report = None

# Reproducibility
random.seed(42)
np.random.seed(42)

# Paths
WS = pathlib.Path.cwd().parents[1] if (pathlib.Path.cwd().name == 'notebooks') else pathlib.Path.cwd()
DATA_DIR = WS / 'sports-ai' / 'data'
OUT_DIR = WS / 'sports-ai' / 'share' / 'extraction_tests'
OUT_DIR.mkdir(parents=True, exist_ok=True)

print('Workspace:', WS)
print('Data dir :', DATA_DIR)
print('Out dir  :', OUT_DIR)

# Helper to ensure packages
def ensure_package(pkg, import_name=None):
    import importlib
    name = import_name or pkg
    try:
        importlib.import_module(name)
        return True
    except Exception:
        print(f'Installing {pkg}...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        return True

# Optionally install spaCy small model
if spacy is None:
    ensure_package('spacy')
    import spacy
    from spacy.matcher import Matcher
    from spacy.pipeline import EntityRuler

# Load spaCy model (fallback to blank if missing)
try:
    nlp = spacy.load('en_core_web_sm')
except Exception:
    print('en_core_web_sm not found, using blank English model')
    nlp = spacy.blank('en')

print('spaCy pipeline:', nlp.pipe_names)


Workspace: c:\Users\lnipu\Projects\Sports-Analysis
Data dir : c:\Users\lnipu\Projects\Sports-Analysis\sports-ai\data
Out dir  : c:\Users\lnipu\Projects\Sports-Analysis\sports-ai\share\extraction_tests
Installing spacy...
en_core_web_sm not found, using blank English model
en_core_web_sm not found, using blank English model
spaCy pipeline: []
spaCy pipeline: []


## 2) Load and Inspect Comment Dataset

In [2]:
# Create a small sample comments DataFrame
sample_comments = [
    {"id": 1, "timestamp": "00:12:34", "text": "45' GOAL! Messi scores after a quick one-two. Barcelona lead 1-0."},
    {"id": 2, "timestamp": "00:14:10", "text": "Yellow card shown to Ramos for a late tackle."},
    {"id": 3, "timestamp": "00:17:02", "text": "Substitution: F. Valverde ON for L. Modric."},
    {"id": 4, "timestamp": "00:21:41", "text": "Red card! The goalkeeper is sent off for handling outside the box."},
    {"id": 5, "timestamp": "00:26:00", "text": "Another substitution: Haaland replaces Alvarez."},
    {"id": 6, "timestamp": "00:29:39", "text": "Corner for Real Madrid; well defended."},
    {"id": 7, "timestamp": "00:31:15", "text": "Offside flagged against Mbappé."},
]

df = pd.DataFrame(sample_comments)
df

Unnamed: 0,id,timestamp,text
0,1,00:12:34,45' GOAL! Messi scores after a quick one-two. ...
1,2,00:14:10,Yellow card shown to Ramos for a late tackle.
2,3,00:17:02,Substitution: F. Valverde ON for L. Modric.
3,4,00:21:41,Red card! The goalkeeper is sent off for handl...
4,5,00:26:00,Another substitution: Haaland replaces Alvarez.
5,6,00:29:39,Corner for Real Madrid; well defended.
6,7,00:31:15,Offside flagged against Mbappé.


## 3) Define Target Schemas and Label Taxonomy
We'll extract spans and events with the following labels: `PLAYER`, `TEAM`, `ACTION`, `CARD`, `SUBSTITUTION`, `SCORE`, `TIME`.

In [3]:
from dataclasses import dataclass, asdict
from typing import List, Optional, Tuple

LABELS = ["PLAYER", "TEAM", "ACTION", "CARD", "SUBSTITUTION", "SCORE", "TIME"]

@dataclass
class Span:
    start: int
    end: int
    label: str
    text: str

@dataclass
class Event:
    minute: Optional[str]
    type: str
    description: str
    player_in: Optional[str] = None
    player_out: Optional[str] = None
    player: Optional[str] = None
    team: Optional[str] = None
    tags: Optional[List[str]] = None


def validate_span(s: Span) -> bool:
    return 0 <= s.start < s.end and s.label in LABELS


def validate_event(e: Event) -> bool:
    return bool(e.type)

print('Labels:', LABELS)

Labels: ['PLAYER', 'TEAM', 'ACTION', 'CARD', 'SUBSTITUTION', 'SCORE', 'TIME']


## 4) Text Preprocessing Pipeline
Functions to normalize, tokenize, and sentence-split comments.

In [4]:
def normalize_text(s: str) -> str:
    s = s.replace('\u200b', ' ').strip()
    s = re.sub(r"\s+", " ", s)
    return s


def sentence_split(s: str) -> List[str]:
    s = normalize_text(s)
    parts = re.split(r"(?<=[\.!?])\s+", s)
    return [p for p in parts if p]


def tokenize(s: str) -> List[str]:
    return re.findall(r"\w+|[^\w\s]", s)


# Quick test
for t in df['text'].head(2):
    print(sentence_split(t))

["45' GOAL!", 'Messi scores after a quick one-two.', 'Barcelona lead 1-0.']
['Yellow card shown to Ramos for a late tackle.']


## 5) Gazetteers: Teams and Players
Load team and player lists if available; otherwise use a small demo list.

In [5]:
# Demo gazetteers; replace with real CSV/JSON as needed
teams = {"barcelona", "real madrid", "manchester city"}
players = {"lionel messi", "sergio ramos", "federico valverde", "luka modric", "erling haaland", "julian alvarez", "kylian mbappé"}

# Normalize helper
norm = lambda s: re.sub(r"\s+", " ", s.replace('.', '')).strip().lower()

team_set = {norm(t) for t in teams}
player_set = {norm(p) for p in players}

print('Teams:', team_set)
print('Players:', player_set)

Teams: {'barcelona', 'real madrid', 'manchester city'}
Players: {'erling haaland', 'luka modric', 'federico valverde', 'kylian mbappé', 'julian alvarez', 'lionel messi', 'sergio ramos'}


## 6) Rule-Based Extraction with Regex and spaCy Matcher
We implement regex patterns for scores, time markers, cards, and substitutions. We also add an optional spaCy Matcher for phrases.

In [6]:
TIME_RE = re.compile(r"(?:(\d+)(?:\+(\d+))?\s*')|\b(?:min(?:ute)?s?)\b", re.I)
SCORE_RE = re.compile(r"\b(\d{1,2})\s*[-:]\s*(\d{1,2})\b")
YELLOW_RE = re.compile(r"\byellow card\b|\bbooked\b", re.I)
RED_RE = re.compile(r"\bred card\b|\bsent off\b", re.I)
GOAL_RE = re.compile(r"\bgoal\b|\bscor(?:e|ed|es)\b|\bpenalty\b", re.I)
SUB_ON_FOR_RE = re.compile(r"\b(.+?)\s+(?:on|in)\s+for\s+(.+?)\b", re.I)
SUB_REPLACES_RE = re.compile(r"\b(.+?)\s+replaces\s+(.+?)\b", re.I)
REPLACED_BY_RE = re.compile(r"\b(.+?)\s+replaced\s+by\s+(.+?)\b", re.I)


def parse_minute(text: str) -> Optional[str]:
    m = TIME_RE.search(text)
    if not m: return None
    if m.group(1):
        base = int(m.group(1))
        extra = int(m.group(2) or 0)
        return f"{base}+{extra}" if extra else str(base)
    return None


def detect_tags(text: str) -> List[str]:
    t = text.lower()
    tags = []
    if GOAL_RE.search(t): tags.append('goal')
    if YELLOW_RE.search(t): tags.append('yellow card')
    if RED_RE.search(t): tags.append('red card')
    if SUB_ON_FOR_RE.search(t) or SUB_REPLACES_RE.search(t) or REPLACED_BY_RE.search(t): tags.append('substitution')
    if SCORE_RE.search(t): tags.append('score')
    return tags


def parse_substitution_players(text: str) -> Tuple[Optional[str], Optional[str]]:
    t = text.strip()
    m = SUB_ON_FOR_RE.search(t) or SUB_REPLACES_RE.search(t)
    if not m:
        m2 = REPLACED_BY_RE.search(t)
    else:
        m2 = None
    if m:
        return m.group(1).strip(), m.group(2).strip()
    if m2:
        return m2.group(2).strip(), m2.group(1).strip()
    return None, None


# Optional spaCy Matcher example (phrase-based for teams/players)
matcher = Matcher(nlp.vocab) if hasattr(nlp, 'vocab') else None
if matcher is not None and 'ner' in nlp.pipe_names:
    # Simple pattern: proper nouns followed by verb like 'scores' could be a GOAL trigger
    pattern = [{"POS": "PROPN", "OP": "+"}, {"LOWER": {"IN": ["scores", "scored"]}}]
    try:
        matcher.add("GOAL_PHRASE", [pattern])
    except Exception:
        pass

print('Rule patterns initialized')

Rule patterns initialized


## 7) Model-Based Extraction with spaCy NER
Run NER, then map model labels to target schema. Optionally add an EntityRuler with gazetteers.

In [7]:
# Build an EntityRuler with gazetteers to help NER
if 'entity_ruler' not in nlp.pipe_names:
    ruler = nlp.add_pipe('entity_ruler')
else:
    ruler = nlp.get_pipe('entity_ruler')

patterns = []
for t in team_set:
    patterns.append({"label": "ORG", "pattern": t})
for p in player_set:
    patterns.append({"label": "PERSON", "pattern": p})

try:
    ruler.add_patterns(patterns)
except Exception:
    pass

ner_label_map = {"PERSON": "PLAYER", "ORG": "TEAM"}

def spacy_extract_spans(text: str) -> List[Span]:
    doc = nlp(text)
    spans = []
    for ent in doc.ents:
        label = ner_label_map.get(ent.label_, None)
        if not label:
            continue
        spans.append(Span(start=ent.start_char, end=ent.end_char, label=label, text=ent.text))
    return spans

print('EntityRuler added with', len(patterns), 'patterns')

EntityRuler added with 10 patterns


## 8) Hybrid Extraction Pipeline and Conflict Resolution
Compose rule-based and model-based outputs. Provide a unified `extract(comment)` API.

In [8]:
def rule_based_extract_spans(text: str) -> List[Span]:
    spans: List[Span] = []
    # TIME
    for m in TIME_RE.finditer(text):
        s, e = m.span()
        spans.append(Span(s, e, 'TIME', text[s:e]))
    # SCORE
    for m in SCORE_RE.finditer(text):
        s, e = m.span()
        spans.append(Span(s, e, 'SCORE', text[s:e]))
    # CARDS
    if YELLOW_RE.search(text):
        s, e = YELLOW_RE.search(text).span()
        spans.append(Span(s, e, 'CARD', text[s:e]))
    if RED_RE.search(text):
        s, e = RED_RE.search(text).span()
        spans.append(Span(s, e, 'CARD', text[s:e]))
    # ACTION/GOAL
    if GOAL_RE.search(text):
        s, e = GOAL_RE.search(text).span()
        spans.append(Span(s, e, 'ACTION', text[s:e]))
    # SUBSTITUTION
    m = SUB_ON_FOR_RE.search(text) or SUB_REPLACES_RE.search(text) or REPLACED_BY_RE.search(text)
    if m:
        s, e = m.span()
        spans.append(Span(s, e, 'SUBSTITUTION', text[s:e]))
    return spans


def merge_spans(spans_a: List[Span], spans_b: List[Span]) -> List[Span]:
    out: List[Span] = []
    seen = set()
    for s in spans_a + spans_b:
        key = (s.start, s.end, s.label)
        if key in seen: continue
        # conflict resolution: prefer longer spans when overlapping
        conflict = False
        for i, t in enumerate(out):
            if not (s.end <= t.start or s.start >= t.end):
                # overlap – keep the longer span
                if (s.end - s.start) > (t.end - t.start):
                    out[i] = s
                conflict = True
                break
        if not conflict:
            out.append(s)
        seen.add(key)
    return sorted(out, key=lambda x: x.start)


def extract(text: str) -> Tuple[List[Span], Optional[Event]]:
    textN = normalize_text(text)
    minute = parse_minute(textN)
    tags = detect_tags(textN)
    spans_rules = rule_based_extract_spans(textN)
    spans_ner = spacy_extract_spans(textN)
    spans = merge_spans(spans_rules, spans_ner)

    # Derive a coarse event
    etype = None
    if 'substitution' in tags:
        etype = 'substitution'
    elif 'red card' in tags:
        etype = 'red card'
    elif 'yellow card' in tags:
        etype = 'yellow card'
    elif 'goal' in tags:
        etype = 'goal'
    elif 'score' in tags:
        etype = 'score'

    ev = None
    if etype:
        player_in = player_out = None
        if etype == 'substitution':
            player_in, player_out = parse_substitution_players(textN)
        ev = Event(minute=minute, type=etype, description=textN, player_in=player_in, player_out=player_out, tags=tags)

    return spans, ev

# Quick demo on sample comments
for _, row in df.iterrows():
    spans, ev = extract(row['text'])
    print('\nTEXT:', row['text'])
    print('SPANS:', [asdict(s) for s in spans])
    print('EVENT:', asdict(ev) if ev else None)


TEXT: 45' GOAL! Messi scores after a quick one-two. Barcelona lead 1-0.
SPANS: [{'start': 0, 'end': 3, 'label': 'TIME', 'text': "45'"}, {'start': 4, 'end': 8, 'label': 'ACTION', 'text': 'GOAL'}, {'start': 61, 'end': 64, 'label': 'SCORE', 'text': '1-0'}]
EVENT: {'minute': '45', 'type': 'goal', 'description': "45' GOAL! Messi scores after a quick one-two. Barcelona lead 1-0.", 'player_in': None, 'player_out': None, 'player': None, 'team': None, 'tags': ['goal', 'score']}

TEXT: Yellow card shown to Ramos for a late tackle.
SPANS: [{'start': 0, 'end': 11, 'label': 'CARD', 'text': 'Yellow card'}]
EVENT: {'minute': None, 'type': 'yellow card', 'description': 'Yellow card shown to Ramos for a late tackle.', 'player_in': None, 'player_out': None, 'player': None, 'team': None, 'tags': ['yellow card']}

TEXT: Substitution: F. Valverde ON for L. Modric.
SPANS: [{'start': 0, 'end': 34, 'label': 'SUBSTITUTION', 'text': 'Substitution: F. Valverde ON for L'}]
EVENT: {'minute': None, 'type': 'substi

## 9) Evaluation Dataset Split and Ground Truth Loader
For this demo, we will generate a tiny labeled subset. Replace with real annotations when available.

In [None]:
# Tiny ground truth (for demo).
# Format: id -> {type, minute, player_in, player_out}
GT = {
    1: {"type": "goal"},
    2: {"type": "yellow card"},
    3: {"type": "substitution", "player_in": "F. Valverde", "player_out": "L. Modric"},
    4: {"type": "red card"},
}

# Train/Val/Test split (toy)
ids = df['id'].tolist()
random.shuffle(ids)
train_ids = ids[:4]
val_ids = ids[4:6]
test_ids = ids[6:]
print('Splits:', {'train': train_ids, 'val': val_ids, 'test': test_ids})

Splits: {'train': [2, 4, 5, 3], 'val': [7, 1], 'test': [6]}


## 10) Evaluate Extraction Quality (P/R/F1)
Compute simple event-type accuracy and show a small report. Replace with span-level metrics as needed.

In [10]:
y_true, y_pred = [], []
for _, row in df[df['id'].isin(train_ids + val_ids + test_ids)].iterrows():
    gt = GT.get(row['id'])
    spans, ev = extract(row['text'])
    y_true.append(gt['type'] if gt else 'none')
    y_pred.append(ev.type if ev else 'none')

from collections import Counter
print('Counts true:', Counter(y_true))
print('Counts pred:', Counter(y_pred))

if classification_report:
    from sklearn.metrics import classification_report as _cr
    print(_cr(y_true, y_pred))
else:
    # Simple accuracy
    acc = np.mean([a == b for a, b in zip(y_true, y_pred)])
    print('Accuracy:', round(acc, 3))

Counts true: Counter({'none': 3, 'goal': 1, 'yellow card': 1, 'substitution': 1, 'red card': 1})
Counts pred: Counter({'substitution': 2, 'none': 2, 'goal': 1, 'yellow card': 1, 'red card': 1})
              precision    recall  f1-score   support

        goal       1.00      1.00      1.00         1
        none       1.00      0.67      0.80         3
    red card       1.00      1.00      1.00         1
substitution       0.50      1.00      0.67         1
 yellow card       1.00      1.00      1.00         1

    accuracy                           0.86         7
   macro avg       0.90      0.93      0.89         7
weighted avg       0.93      0.86      0.87         7



## 11) Error Analysis: FP/FN Drill-down
Show mismatches and save to CSV for review.

In [11]:
rows = []
for _, row in df.iterrows():
    gt = GT.get(row['id'], {}).get('type')
    spans, ev = extract(row['text'])
    pred = ev.type if ev else None
    if gt != pred:
        rows.append({
            'id': row['id'],
            'text': row['text'],
            'gt_type': gt,
            'pred_type': pred,
            'spans': [asdict(s) for s in spans]
        })

err_df = pd.DataFrame(rows)
err_df.to_csv(OUT_DIR / 'errors.csv', index=False)
err_df

Unnamed: 0,id,text,gt_type,pred_type,spans
0,5,Another substitution: Haaland replaces Alvarez.,,substitution,"[{'start': 0, 'end': 46, 'label': 'SUBSTITUTIO..."


## 12) Parameter Tuning and Ablation Experiments
Toggle regex/NER/gazetteers and compare metrics across configurations.

In [12]:
def evaluate_config(use_rules=True, use_ner=True):
    y_true, y_pred = [], []
    for _, row in df.iterrows():
        text = normalize_text(row['text'])
        gt = GT.get(row['id'])
        minute = parse_minute(text)
        tags = detect_tags(text)
        spans_rules = rule_based_extract_spans(text) if use_rules else []
        spans_ner = spacy_extract_spans(text) if use_ner else []
        spans = merge_spans(spans_rules, spans_ner)
        etype = None
        if 'substitution' in (tags if use_rules else []):
            etype = 'substitution'
        elif 'red card' in (tags if use_rules else []):
            etype = 'red card'
        elif 'yellow card' in (tags if use_rules else []):
            etype = 'yellow card'
        elif 'goal' in (tags if use_rules else []):
            etype = 'goal'
        elif 'score' in (tags if use_rules else []):
            etype = 'score'
        y_true.append(gt['type'] if gt else 'none')
        y_pred.append(etype or 'none')
    if classification_report:
        from sklearn.metrics import classification_report as _cr
        return _cr(y_true, y_pred, output_dict=True)
    else:
        acc = np.mean([a == b for a, b in zip(y_true, y_pred)])
        return {'accuracy': float(acc)}

print('Rules only:', evaluate_config(use_rules=True, use_ner=False))
print('NER only  :', evaluate_config(use_rules=False, use_ner=True))
print('Hybrid    :', evaluate_config(use_rules=True, use_ner=True))

Rules only: {'goal': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1.0}, 'none': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 3.0}, 'red card': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1.0}, 'substitution': {'precision': 0.5, 'recall': 1.0, 'f1-score': 0.6666666666666666, 'support': 1.0}, 'yellow card': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1.0}, 'accuracy': 0.8571428571428571, 'macro avg': {'precision': 0.9, 'recall': 0.9333333333333332, 'f1-score': 0.8933333333333333, 'support': 7.0}, 'weighted avg': {'precision': 0.9285714285714286, 'recall': 0.8571428571428571, 'f1-score': 0.8666666666666668, 'support': 7.0}}
NER only  : {'goal': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, 'none': {'precision': 0.42857142857142855, 'recall': 1.0, 'f1-score': 0.6, 'support': 3.0}, 'red card': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, 'substitution': {'precisi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 13) Performance Benchmarking
Measure throughput on a larger synthetic set.

In [13]:
big = df.sample(n=len(df), replace=True, random_state=42)
N = 2000
big = pd.concat([big]* (N // len(df) + 1)).head(N).reset_index(drop=True)

start = perf_counter()
for t in tqdm(big['text'], total=len(big)):
    extract(t)
sec = perf_counter() - start
print(f"Processed {len(big)} comments in {sec:.2f}s; {(len(big)/sec):.1f} comments/sec")

100%|██████████| 2000/2000 [00:00<00:00, 7079.71it/s]

Processed 2000 comments in 0.29s; 6876.8 comments/sec





## 14) Persist Outputs and Export
Write extracted events to JSONL for downstream use.

In [None]:
# out_path = OUT_DIR / 'extracted_events.jsonl'
# with open(out_path, 'w', encoding='utf-8') as f:
#     for _, row in df.iterrows():
#         spans, ev = extract(row['text'])
#         rec = {
#             'id': int(row['id']),
#             'text': row['text'],
#             'spans': [asdict(s) for s in spans],
#             'event': (asdict(ev) if ev else None)
#         }
#         f.write(json.dumps(rec, ensure_ascii=False) + '\n')
# print('Wrote', out_path)

Wrote c:\Users\lnipu\Projects\Sports-Analysis\sports-ai\share\extraction_tests\extracted_events.jsonl
