In [5]:
import os
import re
import json
from docx import Document

# function to split text into sentences
def divide_into_sentences(text):
    """
    Break the input text into sentences at '.', '!' and '?'.

    A terminator closes the current sentence only when the text gathered
    since the previous cut is longer than one character once stripped.
    Otherwise scanning continues and that terminator ends up inside the
    next, longer sentence. Any text left after the last cut is returned
    as a final sentence if it is not blank.

    Args:
        text (str): The text to split into sentences.

    Returns:
        List[str]: The stripped sentences, in order of appearance.
    """
    terminators = (".", "!", "?")
    pieces = []
    start = 0  # index of the first character not yet emitted

    for position, symbol in enumerate(text):
        if symbol not in terminators:
            continue
        candidate = text[start:position + 1].strip()
        # a bare terminator is not a sentence of its own; keep accumulating
        if len(candidate) > 1:
            pieces.append(candidate)
            start = position + 1

    leftover = text[start:].strip()
    if leftover:
        pieces.append(leftover)

    return pieces

# function to check if a sentence is valid
def is_valid_sentence(sentence):
    """
    Decide whether a sentence should be kept.

    A sentence is valid when it contains at least one character from the
    Unicode Hebrew block (U+0590-U+05FF) and does not contain either of
    the placeholder sequences "..." or "---".

    Args:
        sentence (str): The sentence to validate.

    Returns:
        bool: True if the sentence is valid, False otherwise.
    """
    # reject sentences without any Hebrew-block character
    if not re.search(r"[\u0590-\u05FF]", sentence):
        return False

    # No separate "only special characters" test is needed: a sentence that
    # reaches this point contains a Hebrew-block character, so it can never
    # consist solely of characters outside that block.

    # reject sentences carrying placeholder symbols
    return "..." not in sentence and "---" not in sentence

# function to tokenize a sentence into words and symbols
def tokenize_sentence(sentence):
    """
    Break a sentence into word tokens and single punctuation tokens.

    The sentence is first cut on whitespace; each resulting chunk is then
    divided into runs of word characters and individual characters that
    are neither word characters nor whitespace.

    Args:
        sentence (str): The sentence to tokenize.

    Returns:
        List[str]: The tokens in their original order.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]")
    return [
        piece
        for chunk in sentence.split()
        for piece in token_pattern.findall(chunk)
    ]

# main processing script
# Patterns are compiled once here instead of on every file/paragraph.
_KNESSET_NUMBER_RE = re.compile(r'(\d+)_pt')
_PROTOCOL_NUMBER_RE = re.compile(r"פרוטוקול מס'? (\d+)")
_SPEAKER_RE = re.compile(r"^([\u0590-\u05FF\w\s\(\)]+):")
_SPEAKER_NOTE_RE = re.compile(r"\s*\(.*?\)")

# Sentences with fewer tokens than this are not stored.
_MIN_TOKENS = 4


def _protocol_type(file_name):
    """Map a protocol file name to 'plenary', 'committee' or 'undefined'."""
    if "ptm" in file_name:
        return "plenary"
    if "ptv" in file_name:
        return "committee"
    return "undefined"


def _find_protocol_number(doc):
    """Return the protocol number found in the first 10 paragraphs, or -1."""
    for paragraph in doc.paragraphs[:10]:
        match = _PROTOCOL_NUMBER_RE.search(paragraph.text)
        if match:
            return int(match.group(1))
    return -1


def _sentence_records(text, speaker, metadata):
    """Yield one output record per valid, long-enough sentence in text."""
    for sentence in divide_into_sentences(text):
        if not is_valid_sentence(sentence):
            continue
        tokens = tokenize_sentence(sentence)
        if len(tokens) >= _MIN_TOKENS:
            # metadata keys come first so the JSON field order is stable
            yield {
                **metadata,
                "speaker_name": speaker,
                "sentence_text": " ".join(tokens),
            }


def process_protocol_files(folder_path):
    """
    Processes .docx files in a given folder, extracting and tokenizing Hebrew text.

    For every file ending in ".docx" the Knesset number and protocol type
    are derived from the file name, and the protocol number is searched
    for in the first 10 paragraphs (-1 when a number cannot be found).
    Each non-empty paragraph is attributed either to the speaker named
    before its first colon or, when no speaker prefix matches, to the most
    recent speaker. Paragraphs seen before any speaker are skipped. Valid
    sentences with at least 4 tokens are written, one JSON object per
    line, to "protocol_data.jsonl" inside folder_path.

    A file that raises while being read is reported on stdout and
    processing continues with the next file; records collected from that
    file before the error are kept.

    Args:
        folder_path (str): Path to the folder containing .docx files.
    """
    # os.listdir gives no ordering guarantee; sort so output is reproducible
    protocol_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(".docx")
    )
    jsonl_data = []

    print("Found files:", protocol_files)

    for file in protocol_files:
        match = _KNESSET_NUMBER_RE.search(file)
        metadata = {
            "protocol_name": file,
            "knesset_number": int(match.group(1)) if match else -1,
            "protocol_type": _protocol_type(file),
        }

        try:
            doc = Document(os.path.join(folder_path, file))
            metadata["protocol_number"] = _find_protocol_number(doc)

            last_speaker = None

            for paragraph in doc.paragraphs:
                text = paragraph.text.strip()
                if not text:  # skip empty paragraphs
                    continue

                speaker_match = _SPEAKER_RE.match(text)
                if speaker_match:
                    # drop parenthesised extra info from the speaker's name
                    last_speaker = _SPEAKER_NOTE_RE.sub(
                        "", speaker_match.group(1)
                    ).strip()
                    text = text[speaker_match.end():].strip()
                elif not last_speaker:
                    # nobody to attribute this paragraph to yet
                    continue

                jsonl_data.extend(
                    _sentence_records(text, last_speaker, metadata)
                )

        except Exception as e:
            # deliberate best-effort: one unreadable file must not stop the run
            print(f"Error processing file {file}: {e}")

    # save the results to a JSONL file
    output_file = os.path.join(folder_path, "protocol_data.jsonl")
    with open(output_file, "w", encoding="utf-8") as jsonl_file:
        for entry in jsonl_data:
            jsonl_file.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"JSONL file saved at: {output_file}")

# define the folder path and start processing
if __name__ == "__main__":
    # Hard-coded local input directory; process_protocol_files also writes
    # its protocol_data.jsonl output into this same folder.
    folder_path = "C:/Users/Alpha/Downloads/knesset_protocols/protocol_for_hw1/"
    process_protocol_files(folder_path)


Found files: ['13_ptm_532058.docx', '13_ptm_532066.docx', '13_ptm_532240.docx', '13_ptm_532389.docx', '14_ptm_532484.docx', '14_ptm_532608.docx', '14_ptm_532731.docx', '15_ptm_532756.docx', '15_ptm_532855.docx', '15_ptm_533086.docx', '15_ptv_490845.docx', '15_ptv_490916.docx', '15_ptv_494321.docx', '15_ptv_494780.docx', '15_ptv_495206.docx', '15_ptv_495295.docx', '15_ptv_496915.docx', '15_ptv_496944.docx', '15_ptv_498215.docx', '16_ptm_128954.docx', '16_ptm_128968.docx', '16_ptm_129080.docx', '16_ptm_129137.docx', '16_ptm_129202.docx', '16_ptm_533215.docx', '16_ptm_533607.docx', '16_ptv_386758.docx', '16_ptv_386822.docx', '16_ptv_386833.docx', '16_ptv_489839.docx', '16_ptv_491962.docx', '16_ptv_493376.docx', '16_ptv_499021.docx', '16_ptv_499045.docx', '16_ptv_548123.docx', '16_ptv_548530.docx', '16_ptv_549100.docx', '16_ptv_572718.docx', '16_ptv_577443.docx', '16_ptv_577758.docx', '16_ptv_581836.docx', '16_ptv_71595.docx', '17_ptm_129748.docx', '17_ptm_533398.docx', '17_ptm_533401.docx