In [None]:
import json
import re
from transformers import AutoTokenizer

# Load the tokenizer used below to count tokens while chunking.
# NOTE(review): the original comment said this is for an IBM Granite model, but the
# checkpoint loaded here is "bert-base-multilingual-cased" (described as a local
# fallback). Token counts may differ from the target model's tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  # Local fallback for syntax compatibility

# --- Helpers ---

def regex_sent_tokenize(text):
    """Break text into sentences.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Every piece is stripped of surrounding whitespace,
    and pieces that end up empty are discarded.
    """
    result = []
    for piece in re.split(r'(?<=[.!?])(?:\s+|\n+)', text):
        cleaned = piece.strip()
        if cleaned:
            result.append(cleaned)
    return result

def tokenizer_safe_chunking(text, max_tokens=450):
    """
    Split text into chunks that each fit within ``max_tokens`` tokens.

    ``max_tokens`` is the total per-chunk limit *including* the special tokens
    the module-level ``tokenizer`` adds when encoding with
    ``add_special_tokens=True``. Room for those special tokens is reserved up
    front, so a chunk filled to the limit still passes a later length check
    done with special tokens enabled.

    Chunks are built from whole sentences where possible. A chunk that is
    still too long (for example a single very long sentence) is hard-split on
    token boundaries.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk, special tokens included.

    Returns:
        A list of chunk strings, in their original order.

    Raises:
        ValueError: If ``max_tokens`` leaves no room for content once the
            special tokens are accounted for.
    """
    # Number of tokens the tokenizer adds around a single sequence.
    special_tokens = tokenizer.num_special_tokens_to_add(pair=False)
    content_budget = max_tokens - special_tokens
    if content_budget < 1:
        raise ValueError(
            f"max_tokens={max_tokens} leaves no room for content after "
            f"reserving {special_tokens} special tokens"
        )

    chunks = []
    current_sentences = []
    current_tokens = 0

    for sentence in regex_sent_tokenize(text):
        token_len = len(tokenizer.encode(sentence, add_special_tokens=False))

        # If adding this sentence would overflow the content budget, close the
        # current chunk and start a new one with this sentence.
        if current_sentences and current_tokens + token_len > content_budget:
            chunks.append(" ".join(current_sentences))
            current_sentences = [sentence]
            current_tokens = token_len
        else:
            current_sentences.append(sentence)
            current_tokens += token_len

    if current_sentences:
        chunks.append(" ".join(current_sentences))

    # Safety pass: re-measure each joined chunk with special tokens included
    # and hard-split anything that still exceeds the limit.
    safe_chunks = []
    for chunk in chunks:
        if len(tokenizer.encode(chunk, add_special_tokens=True)) <= max_tokens:
            safe_chunks.append(chunk)
            continue

        # Slice by content budget (not max_tokens) so each piece still fits
        # once the special tokens are added back.
        # NOTE: decode followed by re-encode is not guaranteed to reproduce the
        # exact same token count, so callers may still want a final check.
        split_ids = tokenizer.encode(chunk, add_special_tokens=False)
        for start in range(0, len(split_ids), content_budget):
            part_ids = split_ids[start:start + content_budget]
            piece = tokenizer.decode(part_ids, skip_special_tokens=True).strip()
            if piece:
                safe_chunks.append(piece)

    return safe_chunks

# Per-chunk token ceiling (special tokens included). Used both for the chunker
# call and for the final length check so the two cannot drift apart.
MAX_TOKENS = 450

# Metadata fields copied from each course record onto every chunk made from it.
METADATA_KEYS = (
    "programme",
    "course_title",
    "url",
    "language",
    "ects",
    "type",
    "level",
    "study_board",
)

# --- Load input ---
with open("data/english_courses.json", "r", encoding="utf-8") as f:
    data = json.load(f)

chunked_data = []

for entry in data:
    raw_description = entry.get("description")
    # A JSON null arrives as None; str(None) would yield the literal text
    # "None" and produce a bogus chunk, so treat it as a missing description.
    if raw_description is None:
        continue

    description = str(raw_description).strip()
    if not description:
        continue

    chunks = tokenizer_safe_chunking(description, max_tokens=MAX_TOKENS)

    for idx, chunk in enumerate(chunks, start=1):
        # Final token safety check, done before building the output record so
        # no work is spent on a chunk that will be discarded.
        token_length = len(tokenizer.encode(chunk, add_special_tokens=True))
        if token_length > MAX_TOKENS:
            print(f"Still too long ({token_length} tokens): {entry.get('course_title', 'Unknown')}")
            continue

        new_entry = {
            "chunk_number": idx,
            "chunk_text": chunk,
        }
        # Include original metadata; missing fields default to an empty string.
        for key in METADATA_KEYS:
            new_entry[key] = entry.get(key, "")

        chunked_data.append(new_entry)

# --- Save result ---
with open("data/chunked_courses.json", "w", encoding="utf-8") as f:
    json.dump(chunked_data, f, ensure_ascii=False, indent=2)

print(f"Created {len(chunked_data)} chunks and saved to 'chunked_courses.json'.")
