In [2]:
import pandas as pd
from docx import Document
import os
import re
import json

def divide_into_sentences(text):
    """Split *text* into stripped sentence strings.

    A sentence ends at any of the characters ``.``, ``!``, ``?`` or ``:``.
    The terminator stays attached to the sentence it closes. A candidate
    that is at most one character long after stripping (for example a lone
    ``.``) is not emitted; it is carried over and becomes the start of the
    next sentence instead. Any non-blank text left after the final
    terminator is returned as the last sentence.

    Args:
        text: The string to split.

    Returns:
        A list of sentences in order of appearance, each stripped of
        surrounding whitespace.
    """
    pieces = []
    start = 0  # index where the sentence currently being built begins

    for position, symbol in enumerate(text):
        if symbol not in ".!?:":
            continue

        candidate = text[start:position + 1].strip()
        # Too-short candidates are not flushed: leaving `start` untouched
        # makes them part of whatever sentence comes next.
        if len(candidate) > 1:
            pieces.append(candidate)
            start = position + 1

    remainder = text[start:].strip()
    if remainder:
        pieces.append(remainder)

    return pieces

def is_valid_sentence(sentence):
    """Return True when *sentence* should be kept.

    A sentence is rejected when any of the following holds:

    * it contains no character in the range U+0590-U+05FF (Hebrew block);
    * it consists solely of characters that are neither word characters
      nor in that range;
    * it contains ``...`` or ``- - -``.

    Args:
        sentence: The candidate sentence string.

    Returns:
        ``True`` if none of the rejection rules apply, ``False`` otherwise.
    """
    has_hebrew = re.search(r"[\u0590-\u05FF]", sentence) is not None
    # Once has_hebrew is True this can no longer match (the class excludes
    # the Hebrew range); it is kept so the rule set mirrors the spec.
    only_symbols = re.fullmatch(r"[^\w\u0590-\u05FF]+", sentence) is not None
    has_elision = re.search(r"\.\.\.|- - -", sentence) is not None

    return has_hebrew and not only_symbols and not has_elision

# Alternatives are tried left to right, so a known abbreviation (with its
# trailing dot) wins over the generic "run of word characters" rule.
# The look-behind keeps the abbreviation from matching inside a longer word
# (e.g. the "dr." at the end of "Adr.").
_TOKEN_PATTERN = re.compile(
    r"(?<!\w)(?:e\.g\.|i\.e\.|Mr\.|Mrs\.|Dr\.|Prof\.)|\w+|[^\w\s]",
    re.IGNORECASE,
)


def tokenize_sentence(sentence):
    """Split *sentence* into word and punctuation tokens.

    Each run of word characters becomes one token and every other
    non-whitespace character becomes its own token. The abbreviations
    ``e.g.``, ``i.e.``, ``Mr.``, ``Mrs.``, ``Dr.`` and ``Prof.``
    (case-insensitive) are kept as single tokens including their dots.

    The previous implementation swapped abbreviations for ``{abbrev_N}``
    placeholders before tokenizing, but the tokenizer split the placeholder
    itself into ``{``, ``abbrev_N`` and ``}``, so the abbreviation was never
    restored. Its pattern also required a word character right after the
    final dot, so ``"Dr. Cohen"`` was never recognised. Matching
    abbreviations directly in the token pattern avoids both problems.

    Args:
        sentence: The sentence string to tokenize.

    Returns:
        A list of token strings in order of appearance; empty for an empty
        or whitespace-only sentence.
    """
    return _TOKEN_PATTERN.findall(sentence)

# Folder holding the protocol .docx files, and the files found in it.
folder_path = "C:/Users/Alpha/Downloads/knesset_protocols/protocol_for_hw1/"
protocol_files = [f for f in os.listdir(folder_path) if f.endswith(".docx")]

# One dict per accepted sentence, across all files; written out as JSONL below.
jsonl_data = []

# Patterns are compiled once here rather than on every file / paragraph.
_KNESSET_NUMBER_RE = re.compile(r'(\d+)_pt')
_PROTOCOL_NUMBER_RE = re.compile(r"פרוטוקול מס'? (\d+)")
_SPEAKER_RE = re.compile(r"^([\u0590-\u05FF\w\s\(\)]+):")
_PARENTHETICAL_RE = re.compile(r"\s*\(.*?\)")
# NOTE(review): _SPEAKER_RE does not allow "." in the captured name, so the
# dotted alternatives below cannot match as written; only "Chairman" can.
_TITLE_PREFIX_RE = re.compile(r"^(Chairman|Dr\.|Mr\.|Ms\.|Mrs\.)\s*")


def _append_sentence_records(records, text, speaker, metadata):
    """Append one record to *records* for every usable sentence in *text*.

    A sentence is usable when ``is_valid_sentence`` accepts it and it
    tokenizes into at least four tokens. Records are appended one by one so
    that anything collected before an error is kept.

    Args:
        records: List that receives the record dicts.
        text: Paragraph text (or the part after the speaker prefix).
        speaker: Value stored under ``"speaker_name"``.
        metadata: Per-file fields copied into every record.
    """
    for sentence in divide_into_sentences(text):
        if not is_valid_sentence(sentence):
            continue
        tokens = tokenize_sentence(sentence)
        if len(tokens) < 4:
            continue
        records.append({
            **metadata,
            "speaker_name": speaker,
            "sentence_text": " ".join(tokens),
        })


print("found files:")
for file in protocol_files:
    # List each file under the header printed above.
    print(file)

    # Knesset number: the digits immediately before "_pt" in the file name.
    match = _KNESSET_NUMBER_RE.search(file)
    knesset_number = int(match.group(1)) if match else -1

    if "ptm" in file:
        protocol_type = "plenary"
    elif "ptv" in file:
        protocol_type = "committee"
    else:
        protocol_type = "undefined"

    protocol_number = None

    try:
        doc_path = os.path.join(folder_path, file)
        doc = Document(doc_path)
        # Read the paragraph list once; it is used for both passes below.
        paragraphs = doc.paragraphs

        # The protocol number is looked for in the first ten paragraphs only.
        for paragraph in paragraphs[:10]:
            match = _PROTOCOL_NUMBER_RE.search(paragraph.text)
            if match:
                protocol_number = int(match.group(1))
                break
        if protocol_number is None:
            protocol_number = -1

        metadata = {
            "protocol_name": file,
            "knesset_number": knesset_number,
            "protocol_type": protocol_type,
            "protocol_number": protocol_number,
        }

        # Speaker attribution carries over to following paragraphs until the
        # next "name:" prefix; it is reset for every file.
        last_speaker = None
        for paragraph in paragraphs:
            text = paragraph.text.strip()

            if not text:
                continue

            speaker_match = _SPEAKER_RE.match(text)
            if speaker_match:
                raw_name = speaker_match.group(1)
                # Drop parenthesised parts, then a leading title.
                name = _PARENTHETICAL_RE.sub("", raw_name).strip()
                name = _TITLE_PREFIX_RE.sub("", name)

                spoken_text = text[len(speaker_match.group(0)):].strip()
                last_speaker = name

                _append_sentence_records(jsonl_data, spoken_text, name, metadata)
            elif last_speaker:
                # Continuation paragraph: attribute it to the last named speaker.
                _append_sentence_records(jsonl_data, text, last_speaker, metadata)

    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing file {file}: {e}")

output_file = "C:/Users/Alpha/Downloads/knesset_protocols/protocol_data.jsonl"
with open(output_file, "w", encoding="utf-8") as jsonl_file:
    for entry in jsonl_data:
        # ensure_ascii=False keeps the Hebrew text readable in the output file.
        jsonl_file.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"JSONL file saved at: {output_file}")


found files:
JSONL file saved at: C:/Users/Alpha/Downloads/knesset_protocols/protocol_data.jsonl
