# Feature Tests (AllSports)

This notebook exercises the three analytics features using the AllSports provider where possible:
1) Event tagging (adds `predicted_tags` to timeline items via `event.get` + `augment_tags`),
2) Player hot-streak analytics (`player.performance_analytics`),
3) Multimodal highlights extraction (`highlights.multimodal.extract`).

Checklist:
- [ ] Backend server running locally (FastAPI /collect endpoint).
- [ ] ALLSPORTS_API_KEY available in environment (or configured in backend).
- [ ] Replace placeholders (EVENT_ID, PLAYER_ID, YOUTUBE_URL) with real values if desired.

Assumptions:
- The backend `/collect` endpoint accepts an `intent` and an `args` dict.
- To prefer AllSports we pass an explicit provider hint in args (e.g. `provider: 'allsports'`). If your backend uses a different key, adjust the `collect` helper below.
- The notebook focuses on quick smoke tests; adjust timeouts and settings for longer tasks (e.g., multimodal extraction).

In [12]:
# Cell 2: Setup and helpers
import os, requests, json, pprint, sys
# Root URL of the backend under test; the BACKEND_BASE env var overrides it.
BASE = os.getenv('BACKEND_BASE', default='http://127.0.0.1:8000')
# AllSports API key as seen by this process (None when the variable is unset).
ALLSPORTS_KEY = os.getenv('ALLSPORTS_API_KEY')
# Shared pretty-printer used by later cells to dump response dicts.
pp = pprint.PrettyPrinter(indent=2)

def collect(intent, args=None, provider_hint='allsports', timeout=30):
    """POST an intent to the backend ``/collect`` endpoint and return the decoded JSON.

    Args:
        intent: Intent name, sent as the payload's ``intent`` field.
        args: Optional dict of intent arguments. It is copied before the
            provider hint is added, so the caller's dict is never modified.
        provider_hint: Provider name stored under both ``provider`` and
            ``_provider`` unless the caller already set those keys. A falsy
            value disables the hint.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The decoded JSON body on success. On any failure (connection error,
        HTTP error status, undecodable body) the error is printed to stderr
        and ``{'ok': False, 'error': <message>}`` is returned instead of
        raising.
    """
    # Shallow copy: working on the caller's dict directly would leak the
    # provider keys added below into it.
    call_args = dict(args) if args else {}
    if provider_hint:
        # Two spellings so backend adapters reading either key can pick it up.
        call_args.setdefault('provider', provider_hint)
        call_args.setdefault('_provider', provider_hint)
    payload = {'intent': intent, 'args': call_args}
    try:
        r = requests.post(f'{BASE}/collect', json=payload, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        # Deliberately broad: the notebook cells rely on this helper never raising.
        print('Collect error for', intent, '->', e, file=sys.stderr)
        return {'ok': False, 'error': str(e)}

# Probe the backend's /health endpoint once; any failure (connection refused,
# timeout, non-JSON body) is folded into an {'ok': False, ...} dict.
try:
    h = requests.get(BASE + '/health', timeout=5).json()
except Exception as err:
    h = dict(ok=False, error=str(err))
print('Backend health:', h)

Backend health: {'ok': True, 'service': 'Sports Collector HM (Unified)', 'version': '0.3.0'}


In [13]:
# Cell 3: Auto-discover sample IDs from the API (prefer AllSports)
#
# Walks fixtures -> team -> players -> highlight videos through the backend and
# keeps the first usable id of each kind. Every step is best-effort: a failure
# is printed and the corresponding variable simply stays None.
from datetime import datetime, timedelta, timezone

print('Attempting to auto-discover sample EVENT_ID, TEAM_ID, PLAYER_ID and TEST_YT_URL from backend (AllSports preferred)')

EVENT_ID = None
TEAM_ID = None
PLAYER_ID = None
TEST_YT_URL = None
# Bound up front: step 2 reads these even when step 1 failed or found no
# fixtures, a path on which they would otherwise be undefined.
home_name = None
away_name = None


def _first_value(record, keys):
    """Return the first truthy ``record[key]`` for ``keys`` (like an ``or`` chain of ``.get`` calls)."""
    value = None
    for key in keys:
        value = record.get(key)
        if value:
            break
    return value


def _extract_records(resp, container_keys):
    """Return the list of records carried by a ``/collect`` response.

    The payload is ``resp['data']`` (or ``resp['result']``). It may be a bare
    list, a dict holding the list under one of ``container_keys``, or a dict of
    lists, which is flattened into a single list.
    """
    payload = resp.get('data') or resp.get('result') or {}
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    for key in container_keys:
        candidate = payload.get(key)
        if isinstance(candidate, list) and candidate:
            return candidate
    # object-of-arrays (AllSports) -> flatten
    return [item for group in payload.values() if isinstance(group, list) for item in group]


# 1) Try to fetch recent fixtures and pick the first event
try:
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    end = datetime.now(timezone.utc).date()
    start = end - timedelta(days=14)
    args = {'from': start.isoformat(), 'to': end.isoformat()}
    fixtures_resp = collect('fixtures.list', args, provider_hint='allsports', timeout=20)
    print('fixtures.list ->', fixtures_resp.get('ok'))
    events = _extract_records(fixtures_resp, ('result', 'events', 'fixtures', 'data', 'results'))

    if events:
        ev = events[0]
        EVENT_ID = _first_value(ev, ('event_key', 'idEvent', 'match_id', 'matchId', 'eventId'))
        # try to get team ids/names
        TEAM_ID = _first_value(ev, ('home_team_key', 'home_team_id', 'homeId', 'homeTeamId'))
        home_name = _first_value(ev, ('event_home_team', 'strHomeTeam', 'home_team'))
        away_name = _first_value(ev, ('event_away_team', 'strAwayTeam', 'away_team'))
        print('Picked event sample:', EVENT_ID, home_name, 'vs', away_name)
    else:
        print('No fixtures found automatically')
except Exception as e:
    print('fixtures discovery error:', e)

# 2) Resolve team id if missing by calling team.get with teamName
if not TEAM_ID and (home_name or away_name):
    try:
        candidate = home_name or away_name
        tresp = collect('team.get', {'teamName': candidate}, provider_hint='allsports')
        print('team.get ->', tresp.get('ok'))
        tdata = tresp.get('data') or tresp.get('result') or {}
        team_obj = None
        if isinstance(tdata, dict) and isinstance(tdata.get('result'), list) and tdata.get('result'):
            team_obj = tdata['result'][0]
        elif isinstance(tdata, list) and tdata:
            team_obj = tdata[0]
        if team_obj:
            TEAM_ID = _first_value(team_obj, ('team_key', 'team_id', 'idTeam'))
            print('Resolved TEAM_ID ->', TEAM_ID)
    except Exception as e:
        print('team resolve error:', e)

# 3) Try to find a player for the resolved team
if TEAM_ID:
    try:
        presp = collect('players.list', {'teamId': TEAM_ID}, provider_hint='allsports')
        print('players.list ->', presp.get('ok'))
        players = _extract_records(presp, ('result', 'players', 'data', 'results'))
        if players:
            p = players[0]
            PLAYER_ID = _first_value(p, ('player_key', 'player_id', 'idPlayer', 'playerId'))
            print('Picked PLAYER_ID ->', PLAYER_ID)
    except Exception as e:
        print('players discovery error:', e)

# 4) Try to get a YouTube link from video.highlights for the event
if EVENT_ID:
    try:
        vresp = collect('video.highlights', {'eventId': EVENT_ID}, provider_hint='allsports')
        print('video.highlights ->', vresp.get('ok'))
        vids = _extract_records(vresp, ('videos', 'results', 'data', 'result'))
        if vids:
            first = vids[0]
            TEST_YT_URL = _first_value(first, ('strYoutube', 'url', 'video_url', 'link'))
            print('Found video link ->', TEST_YT_URL)
    except Exception as e:
        print('video highlights discovery error:', e)

# Fallbacks: if we still don't have values, try broader probes
if not EVENT_ID:
    try:
        r = collect('events.live', {}, provider_hint='allsports')
        d = r.get('data') or r.get('result') or {}
        events = d if isinstance(d, list) else (d.get('result') or [])
        if events:
            ev = events[0]
            EVENT_ID = _first_value(ev, ('event_key', 'idEvent'))
    except Exception as e:
        # Still best-effort, but report the failure instead of hiding it.
        print('events.live fallback error:', e)

# Expose as variables for the rest of the notebook
print('\nDiscovered:')
print(' EVENT_ID =', EVENT_ID)
print(' TEAM_ID  =', TEAM_ID)
print(' PLAYER_ID=', PLAYER_ID)
print(' TEST_YT_URL =', TEST_YT_URL)

# Set in os.environ so later cells can pick them up
if EVENT_ID: os.environ['TEST_EVENT_ID'] = str(EVENT_ID)
if TEAM_ID: os.environ['TEST_TEAM_ID'] = str(TEAM_ID)
if PLAYER_ID: os.environ['TEST_PLAYER_ID'] = str(PLAYER_ID)
if TEST_YT_URL: os.environ['TEST_YT_URL'] = str(TEST_YT_URL)
print('\nEnvironment variables set for this session (not persisted).')

Attempting to auto-discover sample EVENT_ID, TEAM_ID, PLAYER_ID and TEST_YT_URL from backend (AllSports preferred)


  end = datetime.utcnow().date()


fixtures.list -> True
Picked event sample: 1661859 Wigan vs Salford
players.list -> True
Picked PLAYER_ID -> 1620413658
players.list -> True
Picked PLAYER_ID -> 1620413658
video.highlights -> True

Discovered:
 EVENT_ID = 1661859
 TEAM_ID  = 3123
 PLAYER_ID= 1620413658
 TEST_YT_URL = None

Environment variables set for this session (not persisted).
video.highlights -> True

Discovered:
 EVENT_ID = 1661859
 TEAM_ID  = 3123
 PLAYER_ID= 1620413658
 TEST_YT_URL = None

Environment variables set for this session (not persisted).


## 1) Event tagging (augment timeline tags)
Call `event.get` with `augment_tags=true` and inspect `timeline` items for `predicted_tags`. Replace EVENT_ID below or export `TEST_EVENT_ID` in environment.

In [14]:
# Cell 4: Event tagging test
# Requests one event with augment_tags enabled and prints up to 20 timeline
# items together with whatever tags each item carries.
EVENT_ID = os.environ.get('TEST_EVENT_ID', '1633104')  # replace with a real event id if available
resp = collect('event.get', {'eventId': EVENT_ID, 'augment_tags': True, 'model_path': os.environ.get('EVENT_TAG_MODEL_PATH')}, provider_hint='allsports')
print('\n== event.get response summary ==')
pp.pprint({k: resp.get(k) for k in ('ok','intent') if k in resp})

# Try to find the timeline in many possible shapes. Each candidate container
# is checked independently, so a shape that is present but carries no timeline
# no longer hides a timeline stored in one of the other places.
data = resp.get('data') or resp.get('result') or {}
candidates = []
if isinstance(data, dict):
    nested = data.get('result')
    if isinstance(nested, list) and nested:
        candidates.append(nested[0])   # data['result'][0]['timeline']
    candidates.append(data)            # data['timeline']
top_level = resp.get('result')
if isinstance(top_level, list) and top_level:
    candidates.append(top_level[0])    # resp['result'][0]['timeline']

timeline = []
for cand in candidates:
    if isinstance(cand, dict) and isinstance(cand.get('timeline'), list) and cand['timeline']:
        timeline = cand['timeline']
        break

print(f'Found {len(timeline)} timeline items')
for t in timeline[:20]:
    if not isinstance(t, dict):
        # Unstructured entry: show it as-is rather than failing on .get().
        print(t)
        continue
    minute = t.get('minute') or t.get('time') or t.get('timestamp')
    desc = t.get('description') or t.get('event') or t.get('text')
    tags = t.get('predicted_tags') or t.get('tags')
    # str() guards against non-string descriptions before slicing.
    print(minute, '->', str(desc or '')[:80], ' | predicted_tags:', tags)


== event.get response summary ==
{'intent': 'event.get', 'ok': True}
Found 0 timeline items


## 2) Player hot-streak analytics
Call `player.performance_analytics` with `playerId` or `playerName`. The backend will use AllSports where applicable to fetch recent matches and compute a streak signal.

In [15]:
# Cell 6: Player analytics test
# Builds the request from TEST_PLAYER_ID / TEST_PLAYER_NAME / TEST_TEAM_ID,
# prints the raw response and then a short summary of the streak metrics.
PLAYER_ID = os.environ.get('TEST_PLAYER_ID', '')
PLAYER_NAME = os.environ.get('TEST_PLAYER_NAME', '')
TEAM_ID = os.environ.get('TEST_TEAM_ID', '')
args = {'recent_games': 10}
if PLAYER_ID:
    args['playerId'] = PLAYER_ID
elif PLAYER_NAME:
    args['playerName'] = PLAYER_NAME
else:
    print('Warning: neither TEST_PLAYER_ID nor TEST_PLAYER_NAME is set; the request identifies no player.')
if TEAM_ID:
    args['teamId'] = TEAM_ID
resp = collect('player.performance_analytics', args, provider_hint='allsports')
print('\n== player.performance_analytics response ==')
pp.pprint(resp)
# Common fields to inspect
out = resp.get('data') or resp.get('result')
if resp.get('ok') and isinstance(out, dict):
    # The sample response nests the metrics under data['analytics']; fall back
    # to the payload itself for backends that return them flat.
    analytics = out.get('analytics')
    summary = analytics if isinstance(analytics, dict) else out
    print('\nSummary:')
    print('label:', summary.get('label'))
    print('z_score:', summary.get('z_score'))
    recent = summary.get('recent_games')
    if recent:
        # recent_games is a plain count in the sample response, but may be a
        # list of games in other shapes.
        recent_count = len(recent) if hasattr(recent, '__len__') else recent
        print('recent_games_count:', recent_count)


== player.performance_analytics response ==
{ 'args_resolved': { '_provider': 'allsports',
                     'playerId': '1620413658',
                     'provider': 'allsports',
                     'recent_games': 10,
                     'teamId': '3123'},
  'data': { 'analytics': { 'distribution': [0, 0],
                           'goals_per_game_all': 0.0,
                           'goals_per_game_recent': 0.0,
                           'label': 'NORMAL',
                           'recent_games': 2,
                           'recent_goals': 0,
                           'z_score': 0.0}},
  'intent': 'player.performance_analytics',
  'meta': { 'source': {'fallback': 'allsports', 'primary': 'tsdb'},
            'trace': [ { 'intent': 'player.performance_analytics',
                         'ok': False,
                         'provider': 'tsdb',
                         'step': 'primary'},
                       { 'intent': 'player.performance_analytics',
               

## 3) Multimodal highlight extraction
Call `highlights.multimodal.extract` with a YouTube URL to extract short clips and scoring metadata. For quick testing use a short clip or the first minute of a highlight video.

In [16]:
# Cell 8: Multimodal highlights test
# Runs highlights.multimodal.extract for TEST_YT_URL and lists up to 10 clips.
# Optional env knobs: TEST_CLIP_DURATION (seconds per clip, default 15) and
# TEST_MULTIMODAL_TIMEOUT (request timeout in seconds, default 30).
YOUTUBE_URL = os.environ.get('TEST_YT_URL', '')
if not YOUTUBE_URL:
    print('No YOUTUBE_URL provided in env (TEST_YT_URL). Skipping multimodal extraction.\nSet TEST_YT_URL to run this cell.')
else:
    args = {'youtube_url': YOUTUBE_URL, 'clip_duration': int(os.environ.get('TEST_CLIP_DURATION', '15'))}
    # Extraction can be slow, so the timeout is adjustable without editing the cell.
    extract_timeout = int(os.environ.get('TEST_MULTIMODAL_TIMEOUT', '30'))
    print('Requesting multimodal extraction for', YOUTUBE_URL)
    multi = collect('highlights.multimodal.extract', args, provider_hint='allsports', timeout=extract_timeout)
    pp.pprint({k: multi.get(k) for k in ('ok','intent') if k in multi})
    data = multi.get('data') or multi.get('result') or {}
    if isinstance(data, list):
        # Payload is already the list of clips.
        clips = data
    elif isinstance(data, dict):
        clips = data.get('clips') or data.get('results') or data.get('videos') or []
    else:
        clips = []
    print('\nClips found:', len(clips))
    for c in clips[:10]:
        if not isinstance(c, dict):
            # e.g. a bare path/URL string
            print(c, '->', {})
            continue
        print(c.get('path') or c.get('url') or c.get('video_url') or c.get('youtube') or c.get('strYoutube'), '->', c.get('scores') or c.get('score') or {})

No YOUTUBE_URL provided in env (TEST_YT_URL). Skipping multimodal extraction.
Set TEST_YT_URL to run this cell.


---
Run instructions:
1) Start the backend server (example):
```powershell
& .venv\Scripts\Activate.ps1; python run_server.py --port 8000
```
2) Optionally export environment variables (PowerShell examples):
```powershell
$env:ALLSPORTS_API_KEY = 'your_key'
$env:TEST_EVENT_ID = '1608398'
$env:TEST_PLAYER_ID = '535135'
$env:TEST_TEAM_ID = '1767'
$env:TEST_YT_URL = 'https://www.youtube.com/watch?v=...'
```
3) Run notebook cells in order. If a cell errors, open the corresponding 'Show raw' debug output in the frontend to inspect provider responses and paste them here for mapping updates.

# Automated Highlight Tagging — dataset collection and training

This section helps you collect timeline events from the backend, produce a small labeled dataset, train a simple text classifier (TF‑IDF + MultinomialNB) on event descriptions, and run inference to add `predicted_tags`.

Workflow:
1. Run the data collection cell to gather timeline items into `data_collected/timeline_dataset_unlabeled.csv`.
2. Open that CSV, add a `label` column for a subset of rows (examples: HEADER_GOAL, FREE_KICK_GOAL, PENALTY_GOAL, COUNTER_ATTACK_GOAL, OWN_GOAL, YELLOW_CARD, RED_CARD, SUBSTITUTION). Save as `data_collected/timeline_labeled.csv`, which is where the training cell looks for it.
3. Run the training cell to fit and save a model at `models/event_tag_model.pkl`.
4. Run the inference demo cell to apply the model to an event and see predicted tags.

Notes:
- The notebook re-uses the `collect` helper defined earlier in this notebook to call your backend `/collect` endpoint. Ensure your backend is running.
- The collection step will try to synthesize a timeline from common fields (scorers, goals, comments) when a provider doesn't supply a `timeline`.
- If scikit-learn is not installed in your environment, install it before running the training cell: `pip install scikit-learn pandas`.

In [None]:
# Collect timeline events from recent fixtures and save to CSV
import csv, os
from datetime import datetime, timedelta

# Directory that receives the CSV written by this cell; created on first run
# and left untouched when it already exists.
OUT_DIR = 'data_collected'
os.makedirs(name=OUT_DIR, exist_ok=True)

def synth_event_timeline_from_resp(resp):
    """Extract (or synthesize) a list of timeline items from a ``/collect`` response.

    Lookup order:
      1. A ready-made, non-empty ``timeline`` list at ``data['result'][0]``,
         ``data`` or ``resp['result'][0]``. Each location is tried
         independently, so a container without a timeline does not hide a
         timeline stored elsewhere.
      2. If ``data`` is a dict: every list value is scanned; items contribute
         their ``timeline`` or, failing that, entries synthesized from their
         ``scorers`` (``minute`` plus ``description``/``text``).
      3. If ``data`` is a list: the ``timeline`` lists of its items are
         concatenated.

    Args:
        resp: Decoded response dict; ``data`` is ``resp['data']`` or, when
            that is empty, ``resp['result']``.

    Returns:
        A new list of timeline items (never a list owned by ``resp``); empty
        when nothing usable is found or ``resp`` is not a dict.
    """
    if not isinstance(resp, dict):
        return []
    data = resp.get('data') or resp.get('result') or {}

    # 1) direct shapes, in priority order
    candidates = []
    if isinstance(data, dict):
        nested = data.get('result')
        if isinstance(nested, list) and nested:
            candidates.append(nested[0])
        candidates.append(data)
    top_level = resp.get('result')
    if isinstance(top_level, list) and top_level:
        candidates.append(top_level[0])
    for cand in candidates:
        if isinstance(cand, dict) and isinstance(cand.get('timeline'), list) and cand['timeline']:
            # Copy so callers can never mutate the response through the result.
            return list(cand['timeline'])

    timeline = []
    if isinstance(data, dict):
        # 2) flatten AllSports object-of-arrays
        for group in data.values():
            if not isinstance(group, list):
                continue
            for item in group:
                if not isinstance(item, dict):
                    continue
                item_timeline = item.get('timeline')
                if isinstance(item_timeline, list) and item_timeline:
                    timeline.extend(item_timeline)
                    continue
                # synth from scorers
                scorers = item.get('scorers')
                if isinstance(scorers, list):
                    for s in scorers:
                        if isinstance(s, dict):
                            timeline.append({'minute': s.get('minute'), 'description': s.get('description') or s.get('text')})
    elif isinstance(data, list):
        # 3) list payload: gather the timelines of the individual items
        for item in data:
            if isinstance(item, dict) and isinstance(item.get('timeline'), list):
                timeline.extend(item['timeline'])
    return timeline

# Gather timeline items for every fixture of the last COLLECT_DAYS days and
# write them to an unlabeled CSV (one row per timeline item).
# Local import: this cell's import line only brings in datetime and timedelta.
from datetime import timezone

COLLECT_DAYS = 30
# Timezone-aware replacement for the deprecated datetime.utcnow().
end = datetime.now(timezone.utc).date()
start = end - timedelta(days=COLLECT_DAYS)
args = {'from': start.isoformat(), 'to': end.isoformat()}
fixtures_resp = collect('fixtures.list', args, provider_hint='allsports', timeout=30)
print('fixtures.list ok?', fixtures_resp.get('ok'))

# build dataset rows
rows = []
if fixtures_resp.get('ok'):
    data = fixtures_resp.get('data') or fixtures_resp.get('result') or {}
    # gather events list
    events = []
    if isinstance(data, list):
        events = data
    elif isinstance(data, dict):
        for k in ('result','events','fixtures','data','results'):
            if isinstance(data.get(k), list) and data.get(k):
                events = data.get(k)
                break
        if not events:
            # object-of-arrays -> flatten
            events = [item for v in data.values() if isinstance(v, list) for item in v]
    for ev in events:
        if not isinstance(ev, dict):
            continue
        # Same id keys as the auto-discovery cell (including 'matchId').
        ev_id = ev.get('event_key') or ev.get('idEvent') or ev.get('match_id') or ev.get('matchId') or ev.get('eventId')
        if not ev_id:
            continue
        # call event.get to get richer shape (one request per fixture)
        resp = collect('event.get', {'eventId': ev_id}, provider_hint='allsports', timeout=15)
        timeline = synth_event_timeline_from_resp(resp)
        if not timeline and isinstance(ev.get('scorers'), list):
            # try to synthesize minimal timeline from the fixture's own scorers
            timeline = [
                {'minute': s.get('minute'), 'description': s.get('description') or s.get('text')}
                for s in ev['scorers'] if isinstance(s, dict)
            ]
        for t in timeline:
            if not isinstance(t, dict):
                continue
            # Explicit None/'' test so that minute 0 is kept instead of being
            # treated as "missing" by an `or` chain.
            minute = next((t[k] for k in ('minute', 'time') if t.get(k) not in (None, '')), '')
            desc = t.get('description') or t.get('event') or t.get('text') or ''
            rows.append({'event_id': ev_id, 'minute': minute, 'description': desc, 'raw': t})

OUT_CSV = os.path.join(OUT_DIR, 'timeline_dataset_unlabeled.csv')
with open(OUT_CSV, 'w', newline='', encoding='utf-8') as f:
    # 'raw' stays available in `rows` for inspection but is not written out.
    writer = csv.DictWriter(f, fieldnames=['event_id','minute','description'], extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows)

print('Wrote', len(rows), 'timeline rows to', OUT_CSV)


In [None]:
# Training cell: fit TF-IDF + MultinomialNB on labeled timeline data
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Input (hand-labeled CSV) and output (pickled pipeline) locations.
LABELED_CSV = 'data_collected/timeline_labeled.csv'
MODEL_DIR = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, 'event_tag_model.pkl')

if not os.path.exists(LABELED_CSV):
    print('Labeled CSV not found at', LABELED_CSV)
else:
    df = pd.read_csv(LABELED_CSV)
    if 'label' not in df.columns:
        print('Please add a `label` column to', LABELED_CSV)
    elif 'description' not in df.columns:
        # dropna(subset=...) below would raise KeyError without this column.
        print('No `description` column found in', LABELED_CSV)
    else:
        # Keep only rows that have both the text and its label.
        df = df.dropna(subset=['description','label'])
        if len(df) < 2:
            print('Need at least 2 labeled rows to train; found', len(df))
        else:
            X = df['description'].astype(str)
            y = df['label'].astype(str)
            try:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
            except ValueError as exc:
                # Stratification fails on small datasets (e.g. a label that
                # occurs only once); fall back to a plain random split.
                print('Stratified split not possible (', exc, '); using a random split instead.')
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # Uni- and bi-gram TF-IDF features feeding a multinomial Naive Bayes classifier.
            pipeline = make_pipeline(TfidfVectorizer(ngram_range=(1,2), max_features=10000), MultinomialNB())
            print('Training on', len(X_train), 'examples...')
            pipeline.fit(X_train, y_train)
            preds = pipeline.predict(X_test)
            print('Accuracy:', accuracy_score(y_test, preds))
            # zero_division=0 keeps the same numbers but drops the warning for
            # labels that never get predicted.
            print(classification_report(y_test, preds, zero_division=0))
            joblib.dump(pipeline, MODEL_PATH)
            print('Saved model to', MODEL_PATH)


In [None]:
# Inference / demo: load model and label a few sample timeline items
import joblib
MODEL_PATH = 'models/event_tag_model.pkl'
if not os.path.exists(MODEL_PATH):
    print('Model not found at', MODEL_PATH, '\nRun the training cell first.')
else:
    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # produced by the training cell (or otherwise trusted).
    pipeline = joblib.load(MODEL_PATH)
    # load a few examples from the collected CSV
    import csv
    from itertools import islice
    SAMPLE_IN = 'data_collected/timeline_dataset_unlabeled.csv'
    samples = []
    if os.path.exists(SAMPLE_IN):
        with open(SAMPLE_IN, newline='', encoding='utf-8') as f:
            descriptions = (row.get('description') for row in csv.DictReader(f))
            # First 10 non-empty descriptions; .get() tolerates a CSV that
            # lacks the column altogether.
            samples = list(islice((d for d in descriptions if d), 10))
    if not samples:
        # CSV missing, empty or without usable text: fall back to built-in
        # examples, because predict() rejects an empty batch.
        samples = [
            '45\' GOAL! Header from corner by John Doe',
            'Penalty scored by Smith from the spot',
            'Long-range strike from outside the box, unstoppable',
            'Own goal after a deflected cross'
        ]
    preds = pipeline.predict(samples)
    for s,p in zip(samples, preds):
        print(p, '->', s)

    # Example: call backend to augment an event using the trained model path
    # The backend agent will need to support a `model_path` argument. Example collect call:
    # resp = collect('event.get', {'eventId': '<id>', 'augment_tags': True, 'model_path': MODEL_PATH}, provider_hint='allsports')
    # pp.pprint(resp)
