In [None]:
import pandas as pd
import docx as dx
import matplotlib.pyplot as plt
from docx import Document
import os
import re

# Directory holding the Knesset protocol .docx files to process.
folder_path = "C:/Users/Alpha/Downloads/knesset_protocols/protocol_for_hw1/"

# Sentinel stored whenever a Knesset/protocol number cannot be determined
# (pattern not found, unreadable document, or empty document).
UNKNOWN_NUMBER = -1

# Patterns are compiled once here instead of being re-specified per paragraph.
KNESSET_NUMBER_RE = re.compile(r'(\d+)_pt')
PROTOCOL_NUMBER_RE = re.compile(r"פרוטוקול מס'? (\d+)")
SPEAKER_RE = re.compile(r"^([\u0590-\u05FF\w\s\(\)]+):")
PARENTHETICAL_RE = re.compile(r"\s*\(.*?\)")
TITLE_PREFIX_RE = re.compile(r"^(Chairman|Dr\.|Mr\.|Ms\.|Mrs\.)\s*")

# Column order of the two output tables; passing these to DataFrame keeps the
# CSV header present even when no rows were collected.
PROTOCOL_COLUMNS = ["File name", "Knesset Number", "Protocol Type", "Protocol Number"]
SPEAKER_COLUMNS = ["File Name", "Speaker Name", "Text"]


def parse_file_name(file):
    """Derive protocol metadata from a file name.

    Args:
        file: The protocol file name, e.g. ``"13_ptm_532058.docx"``.

    Returns:
        A ``(knesset_number, protocol_type)`` tuple. ``knesset_number`` is the
        integer preceding ``_pt`` or ``UNKNOWN_NUMBER`` when absent;
        ``protocol_type`` is ``"plenary"`` if the name contains ``"ptm"``,
        ``"committee"`` if it contains ``"ptv"``, otherwise ``"undefined"``.
    """
    match = KNESSET_NUMBER_RE.search(file)
    knesset_number = int(match.group(1)) if match else UNKNOWN_NUMBER

    if "ptm" in file:
        protocol_type = "plenary"
    elif "ptv" in file:
        protocol_type = "committee"
    else:
        protocol_type = "undefined"
    return knesset_number, protocol_type


def find_protocol_number(paragraph_texts, limit=10):
    """Return the protocol number found in the leading paragraphs.

    Args:
        paragraph_texts: List of paragraph strings in document order.
        limit: How many leading paragraphs to inspect.

    Returns:
        The first number matched by ``PROTOCOL_NUMBER_RE`` within the first
        ``limit`` paragraphs, or ``UNKNOWN_NUMBER`` if none matches (including
        when the list is empty).
    """
    for text in paragraph_texts[:limit]:
        match = PROTOCOL_NUMBER_RE.search(text)
        if match:
            return int(match.group(1))
    return UNKNOWN_NUMBER


def collect_speaker_rows(file, paragraph_texts):
    """Group a document's paragraphs into one row per speaker turn.

    A paragraph matching ``SPEAKER_RE`` (text followed by a colon at the start
    of the line) opens a new row; the name has parenthesised parts and a
    leading English title removed. Any other non-empty paragraph is appended,
    space-separated, to the most recent row.

    Args:
        file: File name recorded in every row.
        paragraph_texts: List of paragraph strings in document order.

    Returns:
        A list of dicts with keys ``"File Name"``, ``"Speaker Name"`` and
        ``"Text"``.
    """
    rows = []
    last_speaker = None
    for raw_text in paragraph_texts:
        text = raw_text.strip()
        if not text:
            continue

        speaker_match = SPEAKER_RE.match(text)
        if speaker_match:
            name = PARENTHETICAL_RE.sub("", speaker_match.group(1)).strip()
            name = TITLE_PREFIX_RE.sub("", name)
            last_speaker = name
            rows.append({
                "File Name": file,
                "Speaker Name": name,
                "Text": text[len(speaker_match.group(0)):].strip(),
            })
        elif last_speaker:
            # Text before the first speaker line, or following a speaker whose
            # cleaned name came out empty, is intentionally not attached.
            rows[-1]["Text"] += " " + text
    return rows


protocol_data = []
speaker_data = []

# The guard is true when this cell/script is executed directly, so the
# pipeline below runs exactly as before; it only keeps the file-system work
# from firing if the helpers above are imported from elsewhere.
if __name__ == "__main__":
    protocol_files = [f for f in os.listdir(folder_path) if f.endswith(".docx")]

    print("found files:")
    for file in protocol_files:
        knesset_number, protocol_type = parse_file_name(file)

        # Default to the sentinel so unreadable or empty documents are
        # recorded the same way as documents without a protocol header.
        protocol_number = UNKNOWN_NUMBER
        try:
            doc = Document(os.path.join(folder_path, file))
            # Read the paragraphs once; both passes below reuse this list.
            paragraph_texts = [paragraph.text for paragraph in doc.paragraphs]
        except Exception as e:
            # Best effort: a broken file is reported and still listed in
            # protocol_data, but contributes no speaker rows.
            print(f"Error processing file {file}: {e}")
        else:
            protocol_number = find_protocol_number(paragraph_texts)
            speaker_data.extend(collect_speaker_rows(file, paragraph_texts))

        protocol_data.append({
            "File name": file,
            "Knesset Number": knesset_number,
            "Protocol Type": protocol_type,
            "Protocol Number": protocol_number
        })

    protocol_df = pd.DataFrame(protocol_data, columns=PROTOCOL_COLUMNS)
    protocol_df.to_csv("C:/Users/Alpha/Downloads/knesset_protocols/protocol_data.csv", index=False, encoding='utf-8-sig')

    speaker_df = pd.DataFrame(speaker_data, columns=SPEAKER_COLUMNS)
    speaker_df.to_csv("C:/Users/Alpha/Downloads/knesset_protocols/speaker_data.csv", index=False, encoding='utf-8-sig')

    print(protocol_df)
    print(speaker_df)

found files:
                      File name  Knesset Number Protocol Type  Protocol Number
0            13_ptm_532058.docx              13       plenary               -1
1            13_ptm_532066.docx              13       plenary               -1
2            13_ptm_532240.docx              13       plenary               -1
3            13_ptm_532389.docx              13       plenary               -1
4            14_ptm_532484.docx              14       plenary               -1
..                          ...             ...           ...              ...
95           23_ptv_600338.docx              23     committee              198
96          25_ptv_1219728.docx              25     committee                1
97          25_ptv_1457545.docx              25     committee               10
98          25_ptv_3841247.docx              25     committee              110
99  for_test_23_ptv_585004.docx              23     committee               73

[100 rows x 4 columns]
               