In [1]:
import pandas as pd
from docx import Document
import os
import re

def divide_into_sentences(text):
    """
    Split a block of text into sentences on the terminators ``. ! ? :``.

    The same delimiters are applied to Hebrew and English text.

    Rules:
    - A run of consecutive terminators (``...``, ``?!``) stays attached to
      the sentence it closes instead of being cut into separate fragments.
    - A ``.`` or ``:`` that sits directly between two digits (``3.5``,
      ``10:30``) is treated as part of a number, not as a boundary.
    - A candidate that is a single character after stripping is not emitted
      on its own; it is carried over into the following sentence.

    Args:
        text: The text to split.

    Returns:
        A list of stripped, non-empty sentences in their original order.
        Any text after the last terminator is returned as a final sentence.
    """
    terminators = ".!?:"
    sentences = []
    length = len(text)
    start = 0  # index where the sentence currently being built begins
    pos = 0

    while pos < length:
        char = text[pos]
        if char not in terminators:
            pos += 1
            continue

        # "3.5" / "10:30": punctuation inside a number is not a sentence end.
        if (
            char in ".:"
            and 0 < pos < length - 1
            and text[pos - 1].isdigit()
            and text[pos + 1].isdigit()
        ):
            pos += 1
            continue

        # Swallow the whole terminator run so "..." or "?!" is kept intact.
        end = pos + 1
        while end < length and text[end] in terminators:
            end += 1

        candidate = text[start:end].strip()
        if len(candidate) > 1:
            sentences.append(candidate)
            start = end
        # Otherwise leave `start` alone: the lone character joins what follows.
        pos = end

    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)

    return sentences

def is_valid_sentence(sentence):
    """
    Decide whether a sentence should be kept.

    A sentence is valid when both conditions hold:
    - It contains at least one Hebrew *letter*. Characters from the Hebrew
      Unicode block that are not letters (niqqud, maqaf, geresh, ...) do not
      count, so a sentence made up only of symbols is rejected.
    - It does not contain an incompleteness marker: ``...`` or ``- - -``.

    Args:
        sentence: The sentence text to check.

    Returns:
        True if the sentence passes both checks, False otherwise.
    """
    # Range-check first, then isalpha(), so Hebrew punctuation and vowel
    # points inside U+0590-U+05FF are not mistaken for textual content.
    has_hebrew_letter = any(
        "\u0590" <= char <= "\u05FF" and char.isalpha() for char in sentence
    )
    if not has_hebrew_letter:
        return False

    # Markers of a trailing-off or cut-off sentence.
    return "..." not in sentence and "- - -" not in sentence

# Token pattern, tried in priority order at each position:
#   1. a known English abbreviation including its periods (not preceded by a
#      word character, so it cannot match the tail of a longer word),
#   2. a standalone number with optional '.'/',' separated digit groups,
#   3. a run of word characters (Hebrew letters are word characters),
#   4. any single non-space, non-word character (punctuation).
_TOKEN_PATTERN = re.compile(
    r"(?<!\w)(?:e\.g\.|i\.e\.|Mr\.|Mrs\.|Dr\.|Prof\.)"
    r"|(?<!\w)\d+(?:[.,]\d+)*(?!\w)"
    r"|\w+"
    r"|[^\w\s]",
    re.IGNORECASE,
)


def tokenize_sentence(sentence):
    """
    Tokenize a sentence into words and punctuation marks.

    - Each punctuation mark becomes its own token, except when it belongs to
      a recognised abbreviation (``e.g.``, ``i.e.``, ``Mr.``, ``Mrs.``,
      ``Dr.``, ``Prof.``; case-insensitive) or to a number (``3.5``,
      ``1,000``), which are kept as single tokens.
    - Whitespace separates tokens and is never returned.
    - Works on Hebrew and English text alike.

    Args:
        sentence: The sentence text to tokenize.

    Returns:
        A list of token strings in their original order; an empty list for
        empty or whitespace-only input.
    """
    return _TOKEN_PATTERN.findall(sentence)

# Folder holding the protocol .docx files to process.
folder_path = "C:/Users/Alpha/Downloads/knesset_protocols/protocol_for_hw1/"
protocol_files = [f for f in os.listdir(folder_path) if f.endswith(".docx")]

protocol_data = []  # one row per protocol file
speaker_data = []   # one row per kept sentence

# Patterns are compiled once here instead of on every file / paragraph.
_KNESSET_NUMBER_RE = re.compile(r'(\d+)_pt')
_PROTOCOL_NUMBER_RE = re.compile(r"פרוטוקול מס'? (\d+)")
_SPEAKER_RE = re.compile(r"^([\u0590-\u05FF\w\s\(\)]+):")
_PARENTHETICAL_RE = re.compile(r"\s*\(.*?\)")
_TITLE_RE = re.compile(r"^(Chairman|Dr\.|Mr\.|Ms\.|Mrs\.)\s*")


def _append_sentence_rows(rows, text, file_name, knesset_number, protocol_type, speaker_name):
    """Append to *rows* one record per valid sentence of *text* that has at least 4 tokens."""
    for sentence in divide_into_sentences(text):
        if not is_valid_sentence(sentence):
            continue
        tokens = tokenize_sentence(sentence)
        if len(tokens) < 4:
            continue
        rows.append({
            "File Name": file_name,
            "Knesset Number": knesset_number,
            "Protocol Type": protocol_type,
            "Speaker Name": speaker_name,
            "Sentence": sentence,
            "Tokens": " ".join(tokens)
        })


# Header only; the individual file names are not printed beneath it.
print("found files:")
for file in protocol_files:
    # The Knesset number is the digit run immediately before "_pt" in the name.
    match = _KNESSET_NUMBER_RE.search(file)
    knesset_number = int(match.group(1)) if match else -1

    if "ptm" in file:
        protocol_type = "plenary"
    elif "ptv" in file:
        protocol_type = "committee"
    else:
        protocol_type = "undefined"

    # -1 means "unknown". It is assigned before the try block so the protocol
    # row carries the same sentinel even when the document cannot be opened.
    protocol_number = -1

    try:
        doc_path = os.path.join(folder_path, file)
        doc = Document(doc_path)

        # The protocol number is searched for in the first 10 paragraphs only.
        for paragraph in doc.paragraphs[:10]:
            match = _PROTOCOL_NUMBER_RE.search(paragraph.text)
            if match:
                protocol_number = int(match.group(1))
                break

        last_speaker = None
        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()

            if not text:
                continue

            # A paragraph that opens with "<name>:" starts a new speaker turn.
            speaker_match = _SPEAKER_RE.match(text)
            if speaker_match:
                raw_name = speaker_match.group(1)
                # Drop parenthesised parts, then a leading title, from the name.
                name = _PARENTHETICAL_RE.sub("", raw_name).strip()
                name = _TITLE_RE.sub("", name)

                spoken_text = text[len(speaker_match.group(0)):].strip()
                last_speaker = name

                _append_sentence_rows(
                    speaker_data, spoken_text, file,
                    knesset_number, protocol_type, name,
                )
            elif last_speaker:
                # No speaker prefix: attribute the text to the latest speaker.
                _append_sentence_rows(
                    speaker_data, text, file,
                    knesset_number, protocol_type, last_speaker,
                )

    except Exception as e:
        # Per-file boundary: report the failure and continue with the next
        # file so one unreadable document does not stop the whole batch.
        print(f"Error processing file {file}: {e}")

    protocol_data.append({
        "File name": file,
        "Knesset Number": knesset_number,
        "Protocol Type": protocol_type,
        "Protocol Number": protocol_number
    })

protocol_df = pd.DataFrame(protocol_data)
protocol_df.to_csv("C:/Users/Alpha/Downloads/knesset_protocols/protocol_data.csv", index=False, encoding='utf-8-sig')

speaker_sentence_df = pd.DataFrame(speaker_data)
speaker_sentence_df.to_csv("C:/Users/Alpha/Downloads/knesset_protocols/speaker_sentence_data.csv", index=False, encoding='utf-8-sig')

print("Protocol Data:")
print(protocol_df)

print("\nSpeaker Sentence Data:")
print(speaker_sentence_df)


found files:
Protocol Data:
                      File name  Knesset Number Protocol Type  Protocol Number
0            13_ptm_532058.docx              13       plenary               -1
1            13_ptm_532066.docx              13       plenary               -1
2            13_ptm_532240.docx              13       plenary               -1
3            13_ptm_532389.docx              13       plenary               -1
4            14_ptm_532484.docx              14       plenary               -1
..                          ...             ...           ...              ...
95           23_ptv_600338.docx              23     committee              198
96          25_ptv_1219728.docx              25     committee                1
97          25_ptv_1457545.docx              25     committee               10
98          25_ptv_3841247.docx              25     committee              110
99  for_test_23_ptv_585004.docx              23     committee               73

[100 rows x 4 columns]
