In [17]:
import os
import requests
import docx
try:
    import win32com.client as win32
    from win32com.client import constants
except ImportError:
    pass
from io import BytesIO

### sometimes cache issues occur - delete it from C:\Users\USER_NAME\AppData\Local\Temp\gen_py
def get_doc_from_url(url):
    try:
        content = requests.get(url).content
        
        if content.startswith(b"PK"):
            # assume it's a docx
            buffer = BytesIO(content)
        else:
            # assume it's a doc
            filepath = os.path.join(os.getcwd(), 'tmp.doc')
            
            with open(filepath, 'wb') as file:
                file.write(content)

            word = win32.gencache.EnsureDispatch('Word.Application')
            doc = word.Documents.Open(filepath)
            doc.Activate()
            word.ActiveDocument.SaveAs(filepath, FileFormat=constants.wdFormatXMLDocument)
            doc.Close(False)
            
            buffer = BytesIO(open(filepath, "rb").read())
            os.remove(filepath)
            
        buffer.seek(0)
        doc = docx.Document(buffer)
        return doc
    except Exception as e:
        print(f"{e} - Cannot get content properly from", url)

In [3]:
import docx
import pandas as pd
import swifter
import os
import re

### Algorithm to extract protocol data ###

relevant_tags = ['דובר-המשך', 'קריאה', 'קריאות', 'יור', 'דובר', 'אורח', 'קריאה', 'דובר_המשך']

def is_speaker(p):
    if is_speaker_strong(p):
        return True
    return p.text.endswith(':') and p.runs[0].underline

def is_speaker_strong(p):
    return p.text and len(p.text.strip()) > 2 and p.style.name in relevant_tags

def process_protocols(df):
    dfs = []
    non_existent = 0
    j = 0
    for i, row in df.iterrows():
        j += 1
        print(f'processing row {j} - {round(j /df.shape[0] * 100, 2)}%, S: {non_existent}                    ', end='\r')
        
        url = row['FilePath']
        document = get_doc_from_url(url)

        running_index = 0
        latest_speaker = ''
        should_record = False
        records = []
        try:
            for p in document.paragraphs:
                if (not should_record and is_speaker_strong(p)) or (should_record and is_speaker(p)):
                    should_record = True
                    latest_speaker = p.text
                    continue

                if should_record and len(p.text) > 1 and '<<' not in p.text:
                    running_index += 1
                    records.append({'Index': running_index, 'Speaker': latest_speaker, 'RawText': p.text, '$Type': "Committee"})
            file_df = pd.DataFrame.from_records(records)
            file_df['DocumentCommitteeSessionID'] = row['DocumentCommitteeSessionID']
            file_df['StartDate'] = row['StartDate']

            if len(file_df):
                dfs.append(file_df)       
        except Exception as e:
            print(f'Error: {e}', url)
            import traceback
            traceback.print_exc()

    df = pd.concat(dfs, ignore_index=True)    
    #df.to_csv('CommitteeQuotesSince2015.csv', index=False, encoding='utf-8-sig')
    return df

In [5]:
import sqlalchemy
import urllib
#import pyodbc
from sqlalchemy import event

# Set DB connection
params = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=tcp:<SERVER>;PORT=1433;DATABASE=<DATABASE>;UID=<USER_ID>;PWD=<PASSOWRD>'
db_params = urllib.parse.quote_plus(params)
engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect={}".format(db_params))      
df = pd.read_sql(
"""
SELECT s.StartDate, d.DocumentCommitteeSessionID, d.FilePath 
FROM KNS_CommitteeSession s
JOIN KNS_DocumentCommitteeSession d on d.CommitteeSessionID = s.CommitteeSessionID 
WHERE d.ApplicationID = 1
AND d.GroupTypeID = 23
AND s.StartDate > DATEADD(MONTH, -1, GETDATE())
ORDER BY s.StartDate ASC
""", engine)

Unnamed: 0,StartDate,DocumentCommitteeSessionID,FilePath
0,2021-03-14 14:30:00,524895,https://fs.knesset.gov.il/23/Committees/23_ptv...
1,2021-03-15 09:30:00,525522,https://fs.knesset.gov.il/23/Committees/23_ptv...
2,2021-03-15 11:00:00,524848,https://fs.knesset.gov.il/23/Committees/23_ptv...
3,2021-03-15 11:30:00,525241,https://fs.knesset.gov.il/23/Committees/23_ptv...
4,2021-03-15 11:30:00,525246,https://fs.knesset.gov.il/23/Committees/23_ptv...
...,...,...,...
1501,2022-02-27 12:00:00,548237,https://fs.knesset.gov.il/24/Committees/24_ptv...
1502,2022-02-28 08:30:00,548238,https://fs.knesset.gov.il/24/Committees/24_ptv...
1503,2022-02-28 10:00:00,548239,https://fs.knesset.gov.il/24/Committees/24_ptv...
1504,2022-03-01 11:30:00,548537,https://fs.knesset.gov.il/24/Committees/24_ptv...


In [16]:
quotes = process_protocols(df)
display(quotes)

local variable 'filepath' referenced before assignment - Cannot get content properly from https://fs.knesset.gov.il/24/Committees/24_ptv_619729.doc
Error: 'NoneType' object has no attribute 'paragraphs' https://fs.knesset.gov.il/24/Committees/24_ptv_619729.doc
processing row 1419 - 94.22%, S: 0                    

Traceback (most recent call last):
  File "/var/folders/rb/jgsgx6y50419kr6gwygl985c0000gn/T/ipykernel_37651/1599715337.py", line 35, in process_protocols
    for p in document.paragraphs:
AttributeError: 'NoneType' object has no attribute 'paragraphs'


processing row 1506 - 100.0%, S: 0                    

Unnamed: 0,Index,Speaker,RawText,$Type,DocumentCommitteeSessionID,StartDate
0,1,"<< יור >> היו""ר חיים כץ: << יור >>","צוהריים טובים. היום יום ראשון בשבוע, 14 במרץ 2...",Committee,524895,2021-03-14 14:30:00
1,2,"<< יור >> היו""ר חיים כץ: << יור >>",בדיון האחרון העליתי את נושא ניצולי שואה. אני א...,Committee,524895,2021-03-14 14:30:00
2,3,"<< יור >> היו""ר חיים כץ: << יור >>",הפריע לי מאוד – דיברנו על זה שאדם עובר בשקל אח...,Committee,524895,2021-03-14 14:30:00
3,4,"<< יור >> היו""ר חיים כץ: << יור >>",המשכתי לפרק את זה. כמה עוברים ב-10 שקלים? מצאת...,Committee,524895,2021-03-14 14:30:00
4,5,"<< יור >> היו""ר חיים כץ: << יור >>",נכנסנו עוד לעומק והסתכלנו על הגילים של אלה שעו...,Committee,524895,2021-03-14 14:30:00
...,...,...,...,...,...,...
548845,4,"<< יור >> היו""ר מיכאל מרדכי ביטון: << יור >>",אין לנו. למה הוא לא פה עכשיו?,Committee,548528,2022-03-07 13:00:00
548846,5,<< דובר_המשך >> אפרת פרוקצ'יה: << דובר_המשך >...,הוא נמצא בקבינט כרגע.,Committee,548528,2022-03-07 13:00:00
548847,6,"<< יור >> היו""ר מיכאל מרדכי ביטון: << יור >>","אני אגיד לך משהו. אביעד פרידמן, המנכ""ל, פנה אל...",Committee,548528,2022-03-07 13:00:00
548848,7,<< אורח >> אפרת פרוקצ'יה: << אורח >>,אני רק אגיד - - -,Committee,548528,2022-03-07 13:00:00


In [18]:
import functools
import swifter 
import re

re_remove = re.compile('<<.+?>>|\(.+?\)|היו\"ר|[^\sא-ת]')
re_space = re.compile('\s+')
re_tags = re.compile('<<.+?>>')

def parse_person(p):
    first_name = p.CleanedFirstName
    last_name = p.CleanedLastName
    s = set()
    s.update(first_name.split())
    s.update(last_name.split())
    
    return {
        'first': first_name,
        'first_split': first_name.split(),
        'last': last_name,
        'last_split': last_name.split(),
        'full_name': f'{first_name} {last_name}',
        'parts': s,
    }

@functools.lru_cache(64000)
def name_to_id(name):
    #if name in parsed_name_to_id_cache:
        #return parsed_name_to_id_cache[name]
    
    name_split = name.split()
    
    if len(name_split) < 2 or len(name_split) > 10:
        return None
    
    for id_, parsed in parsed_person.items():
        # exact match (order doesnt matter)?
        if all(map(lambda n: n in parsed['parts'], name_split)):
            return id_
        
    for id_, parsed in parsed_person.items():
        # partial match (last name equals, first name contains at least one piece)
        if parsed['last'] in name_split and any(map(lambda n: n in name_split, parsed['first_split'])):
            return id_
    
    for id_, parsed in parsed_person.items():
        # full name substring
        if parsed['full_name'] in name:
            return id_
    """
    for id_, parsed in parsed_person.items():
        # split name substring
        if parsed['first'] in name and parsed['last'] in name:
            return id_
            """

def clean_speaker(txt):
    try:
        txt = txt.replace('יי', 'י') #in notebook the hebrew looks RTL in the replacement
        txt = txt.replace('וו', 'ו') #in notebook the hebrew looks RTL in the replacement
        txt = txt.replace('אא', 'א') #in notebook the hebrew looks RTL in the replacement
        txt = re_remove.sub('', txt)
        txt = re_space.sub(' ', txt)
        txt = re_tags.sub('', txt)
        txt = txt.strip()
    except:
        print(txt)
        raise
    return txt
    
speakers = quotes["Speaker"].swifter.apply(clean_speaker).unique()

# Load person table
person = pd.read_sql(
"""
SELECT p.PersonID, p.FirstName, p.LastName FROM KNS_Person p 
JOIN KNS_PersonToPosition kptp on kptp.PersonID = p.PersonID
WHERE StartDate > '2000-01-01'
GROUP BY p.PersonID, p.FirstName, p.LastName 
ORDER BY MAX(kptp.StartDate) DESC
""", engine)

person['CleanedFirstName'] = person['FirstName'].swifter.apply(clean_speaker)
person['CleanedLastName'] = person['LastName'].swifter.apply(clean_speaker)
    
# Map extracted speaker name to person ID
parsed_person = {}
for _, p in person.iterrows():
    parsed_person[p.PersonID] = parse_person(p)

name_mapping = {}    
total = len(speakers)
print('Speakers:', total)

for i, name in enumerate(speakers):
    try:
        name_mapping[name] = name_to_id(name)
    except:
        name_mapping[name] = None
    print(f'{i}, {round((i+1) / total * 100, 2)}%      ', end='\r')

# Apply name mapping
quotes['PersonID'] = quotes['Speaker'].swifter.apply(clean_speaker).apply(lambda x: name_mapping[x])

display(quotes)

Pandas Apply:   0%|          | 0/548850 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/553 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/553 [00:00<?, ?it/s]

Speakers: 6385
1231, 19.3%       6384, 100.0%      

Pandas Apply:   0%|          | 0/548850 [00:00<?, ?it/s]

Unnamed: 0,Index,Speaker,RawText,$Type,DocumentCommitteeSessionID,StartDate,PersonID
0,1,"<< יור >> היו""ר חיים כץ: << יור >>","צוהריים טובים. היום יום ראשון בשבוע, 14 במרץ 2...",Committee,524895,2021-03-14 14:30:00,556.0
1,2,"<< יור >> היו""ר חיים כץ: << יור >>",בדיון האחרון העליתי את נושא ניצולי שואה. אני א...,Committee,524895,2021-03-14 14:30:00,556.0
2,3,"<< יור >> היו""ר חיים כץ: << יור >>",הפריע לי מאוד – דיברנו על זה שאדם עובר בשקל אח...,Committee,524895,2021-03-14 14:30:00,556.0
3,4,"<< יור >> היו""ר חיים כץ: << יור >>",המשכתי לפרק את זה. כמה עוברים ב-10 שקלים? מצאת...,Committee,524895,2021-03-14 14:30:00,556.0
4,5,"<< יור >> היו""ר חיים כץ: << יור >>",נכנסנו עוד לעומק והסתכלנו על הגילים של אלה שעו...,Committee,524895,2021-03-14 14:30:00,556.0
...,...,...,...,...,...,...,...
548845,4,"<< יור >> היו""ר מיכאל מרדכי ביטון: << יור >>",אין לנו. למה הוא לא פה עכשיו?,Committee,548528,2022-03-07 13:00:00,30682.0
548846,5,<< דובר_המשך >> אפרת פרוקצ'יה: << דובר_המשך >...,הוא נמצא בקבינט כרגע.,Committee,548528,2022-03-07 13:00:00,
548847,6,"<< יור >> היו""ר מיכאל מרדכי ביטון: << יור >>","אני אגיד לך משהו. אביעד פרידמן, המנכ""ל, פנה אל...",Committee,548528,2022-03-07 13:00:00,30682.0
548848,7,<< אורח >> אפרת פרוקצ'יה: << אורח >>,אני רק אגיד - - -,Committee,548528,2022-03-07 13:00:00,


In [19]:
import math
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from datetime import datetime

quotes.replace({math.nan: None}, inplace=True)

es = Elasticsearch(["ELASTICSERVER"], http_auth=('USERNAME', 'PASSWORD'))

def gen():
    j = 0
    for i, row in quotes.iterrows():
        j = j + 1
        row = dict(row)
        if row["$Type"] == "Committee":    
            row["DocumentID"] = row.pop("DocumentCommitteeSessionID")
        else:
            row["DocumentID"] = row.pop("DocumentPlenumSessionID")
        row["$Timestamp"] = datetime.now()
        row["_index"] = "quotes"
        row["_id"] = f'{row["$Type"]}:{row["DocumentID"]}:{row["Index"]}'
        yield row
        if j % 1000 == 0:
            print(f'{j}          ', end='\r')
        
for success, info in parallel_bulk(es, gen()):
    if not success:
        print('failed', info)
        
es.transport.close()

  es = Elasticsearch([""], http_auth=('elastic', ''))


548000          

UnsupportedProductError: The client noticed that the server is not Elasticsearch and we do not support this unknown product