In [20]:
import pandas as pd
import json
from pathlib import Path

EVENTS_PATH = Path("../../data/processed/events.json")
REPORTS_PATH = Path("../../data/processed/reports.json")
OUTPUT_PATH = Path("../../data/processed/dataset.json")

events_df = pd.read_json(EVENTS_PATH)
reports_df = pd.read_json(REPORTS_PATH)

events_df['timestamp'] = pd.to_datetime(events_df['timestamp'], errors='coerce')
reports_df['timestamp'] = pd.to_datetime(reports_df['timestamp'], errors='coerce')

# We asumme events.json and reports.json are already sorted
# events_df = events_df.sort_values('timestamp')
# reports_df = reports_df.sort_values('timestamp')

In [None]:
import re

N_EVENTS = 2

def unique_sentences(text):
    if not text:
        return ""
    
    # Use regex to split on ., !, ? followed by space or end of string
    raw_sentences = re.split(r'(?<=[.!?;])\s+(?=[A-ZŠŽČĆĐ])', text.strip())

    seen = set()
    unique = []

    for s in raw_sentences:
        normalized = s.strip().rstrip('.!?').strip().lower()
        if normalized and normalized not in seen:
            seen.add(normalized)
            unique.append(s.strip())

    return " ".join(unique)

def get_last_n_events(n, before_time):
    filtered = events_df[events_df['timestamp'] < before_time].tail(n)
    combined_texts = []

    for _, row in filtered.iterrows():
        notice = str(row.get('priority_notices', '')).strip()
        summary = str(row.get('general_summary', '')).strip()

        if notice:
            combined = f"{notice}. {summary}" if summary else notice
        else:
            combined = summary

        combined_texts.append(combined)

    return "\n".join(combined_texts)

training_dataset = []
for _, report in reports_df.iterrows():
    event_text = get_last_n_events(N_EVENTS, report['timestamp'])
    cleaned_input = unique_sentences(event_text)
    cleaned_output = report['report'].strip()
    if cleaned_input and cleaned_output:
        training_dataset.append({
            "input": cleaned_input,
            "groundtruth": cleaned_output
        })

training_dataset[0]



{'input': 'Vreme Ponekod po Sloveniji megla v pasovih zmanjšuje vidljivost. Prilagodite hitrost! Omejitve za tovorna vozila Po Sloveniji velja med prazniki omejitev za tovorna vozila z največjo dovoljeno maso nad 7,5 ton: - danes, 1. 1., od 8. do 22. ure; - v nedeljo, 2. 1., od 8. do 22. ure. Od 30. decembra je v veljavi sprememba omejitve za tovorna vozila nad 7,5 ton. Več. Dela Na primorski avtocesti je ponovno odprt priključek Črni Kal v obe smeri. Omejitve za tovorna vozila Po Sloveniji velja med prazniki omejitev za tovorna vozila z največjo dovoljeno maso nad 7,5 ton: - danes, od 8. do 22. ure; - v nedeljo, 2. 1., od 8. do 22. ure.',
 'groundtruth': 'Podatki o prometu.\nPonekod po državi megla zmanjšuje vidljivost.\nZaradi del je na vzhodni mariborski obvoznici v obe smeri zaprt prehitevalni pas med razcepom Dragučova in priključkom Pesnica.\n\x00'}

In [25]:
import json

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(training_dataset, f, ensure_ascii=False, indent=2)

print(f"Exported {len(training_dataset)} samples to {OUTPUT_PATH}")

Exported 27377 samples to ..\..\data\processed\dataset.json
