In [34]:
import os
import requests
import docx2txt
import docx
import win32com.client as win32
from win32com.client import constants

### Sometimes win32com cache issues occur; if so, delete the cached files under C:\Users\USER_NAME\AppData\Local\Temp\gen_py
def get_doc_from_url(url, timeout=60):
    """Download a Word file and return it as a python-docx ``Document``.

    The downloaded bytes are written to ``tmp.doc`` in the current working
    directory, opened through Word automation, re-saved in place using
    ``wdFormatXMLDocument`` and then loaded with ``docx.Document``.

    Args:
        url: Address of the document to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The parsed ``docx.Document``, or ``None`` when any step fails (a
        message naming the URL and the error is printed in that case).
    """
    filepath = os.path.join(os.getcwd(), 'tmp.doc')
    try:
        response = requests.get(url, timeout=timeout)
        # Fail early on HTTP errors instead of handing an error page to Word.
        response.raise_for_status()

        with open(filepath, 'wb') as file:
            file.write(response.content)

        word = win32.gencache.EnsureDispatch('Word.Application')
        word_doc = word.Documents.Open(filepath)
        try:
            word_doc.Activate()
            word.ActiveDocument.SaveAs(filepath, FileFormat=constants.wdFormatXMLDocument)
        finally:
            # Always release the document, even if the conversion failed.
            word_doc.Close(False)

        return docx.Document(filepath)
    except Exception as exc:
        print("Cannot get content properly from", url, "-", repr(exc))
        return None
    finally:
        # Best-effort cleanup so a failed run does not leave the temp file behind.
        try:
            if os.path.exists(filepath):
                os.remove(filepath)
        except OSError:
            pass

In [35]:
import docx
import docx2txt
import pandas as pd
import swifter
import os
import re

### Algorithm to extract protocol data ###

# Paragraph style names that is_speaker_strong() accepts as marking a speaker line.
# NOTE(review): one entry appears twice in the list; harmless for the `in` check.
relevant_tags = ['דובר-המשך', 'קריאה', 'קריאות', 'יור', 'דובר', 'אורח', 'קריאה', 'דובר_המשך']

def is_speaker(p):
    """Return True when paragraph ``p`` looks like a speaker heading.

    A paragraph qualifies if ``is_speaker_strong`` accepts it, or if its text
    ends with ':' and its first run is underlined.

    Args:
        p: A paragraph object exposing ``text``, ``runs`` and ``style``.

    Returns:
        bool: True for a speaker heading, False otherwise.
    """
    if is_speaker_strong(p):
        return True
    if not p.text.endswith(':'):
        return False
    runs = p.runs
    # Guard the index: a paragraph may report text while having no runs
    # (NOTE(review): e.g. text held only in non-run children — confirm),
    # in which case runs[0] would raise IndexError.
    return bool(runs) and bool(runs[0].underline)

def is_speaker_strong(p):
    """Tell whether ``p`` is a speaker heading judged by its style name.

    The paragraph must have text, more than two characters once stripped,
    and a style name listed in ``relevant_tags``. A falsy ``p.text`` is
    returned as-is, mirroring a short-circuiting ``and`` chain.
    """
    content = p.text
    if not content:
        # Hand back the falsy text itself rather than a literal False.
        return content
    if len(content.strip()) <= 2:
        return False
    return p.style.name in relevant_tags

def process_protocols(df):
    """Extract speaker-attributed paragraphs from every protocol listed in ``df``.

    For each row the document at ``row['FilePath']`` is fetched with
    ``get_doc_from_url``. Recording starts at the first paragraph accepted by
    ``is_speaker_strong``; from then on every paragraph accepted by
    ``is_speaker`` becomes the current speaker, and every other paragraph
    longer than one character and not containing '<<' is stored as a record.

    Args:
        df: DataFrame with the columns ``FilePath``,
            ``DocumentCommitteeSessionID`` and ``StartDate``.

    Returns:
        DataFrame with the columns ``Index``, ``Speaker``, ``RawText``,
        ``$Type``, ``DocumentCommitteeSessionID`` and ``StartDate``. It is
        empty (but has those columns) when nothing could be extracted.
    """
    import traceback

    result_columns = ['Index', 'Speaker', 'RawText', '$Type',
                      'DocumentCommitteeSessionID', 'StartDate']
    dfs = []
    non_existent = 0  # documents that could not be fetched, shown as "S:" in the progress line
    total = df.shape[0]
    for j, (_, row) in enumerate(df.iterrows(), start=1):
        print(f'processing row {j} - {round(j / total * 100, 2)}%, S: {non_existent}                    ', end='\r')

        url = row['FilePath']
        document = get_doc_from_url(url)
        if document is None:
            # get_doc_from_url already reported the failure; count it and move on.
            non_existent += 1
            continue

        running_index = 0
        latest_speaker = ''
        should_record = False
        records = []
        try:
            for p in document.paragraphs:
                # Before the first heading only a style-based match counts;
                # afterwards the looser underline/colon rule is accepted too.
                if (not should_record and is_speaker_strong(p)) or (should_record and is_speaker(p)):
                    should_record = True
                    latest_speaker = p.text
                    continue

                if should_record and len(p.text) > 1 and '<<' not in p.text:
                    running_index += 1
                    records.append({'Index': running_index, 'Speaker': latest_speaker, 'RawText': p.text, '$Type': "Committee"})

            if records:
                file_df = pd.DataFrame.from_records(records)
                file_df['DocumentCommitteeSessionID'] = row['DocumentCommitteeSessionID']
                file_df['StartDate'] = row['StartDate']
                dfs.append(file_df)
        except Exception:
            # Report which document failed and keep processing the remaining rows.
            print('Error: ', url)
            traceback.print_exc()

    if not dfs:
        # pd.concat raises on an empty list; return an empty frame with the expected columns.
        return pd.DataFrame(columns=result_columns)

    #df.to_csv('CommitteeQuotesSince2015.csv', index=False, encoding='utf-8-sig')
    return pd.concat(dfs, ignore_index=True)

In [36]:
import sqlalchemy
import urllib
import pyodbc
from sqlalchemy import event

# Set DB connection
# `import urllib` alone does not guarantee that the `parse` submodule is loaded,
# so import it explicitly before calling urllib.parse.quote_plus.
import urllib.parse

# NOTE(review): the <...> tokens look like placeholders to fill in before running — confirm.
params = 'DRIVER={ODBC Driver 17 for SQL Server};SERVER=tcp:<SERVER>;PORT=1433;DATABASE=<DATABASE>;UID=<USER_ID>;PWD=<PASSOWRD>'
db_params = urllib.parse.quote_plus(params)
engine = sqlalchemy.create_engine("mssql+pyodbc:///?odbc_connect={}".format(db_params))

# Session documents whose session started within the last month, oldest first.
df = pd.read_sql(
"""
SELECT s.StartDate, d.DocumentCommitteeSessionID, d.FilePath
FROM KNS_CommitteeSession s
JOIN KNS_DocumentCommitteeSession d on d.CommitteeSessionID = s.CommitteeSessionID
WHERE d.ApplicationID = 1
AND d.GroupTypeID = 23
AND s.StartDate > DATEADD(MONTH, -1, GETDATE())
ORDER BY s.StartDate ASC
""", engine)

In [37]:
# Run the protocol extraction over `df` and show the resulting frame.
quotes = process_protocols(df)
display(quotes)

processing row 89 - 100.0%, S: 0                    

Unnamed: 0,Index,Speaker,RawText,$Type,DocumentCommitteeSessionID,StartDate
0,1,"<< יור >> היו""ר צבי האוזר: << יור >>",בוקר טוב לכולם. אני מתכבד לפתוח את ישיבת ועדת ...,Committee,521680,2021-01-04 10:00:00
1,2,<< דובר >> קטי קטרין שטרית (הליכוד): << דובר >>,נכון.,Committee,521680,2021-01-04 10:00:00
2,3,"<< יור >> היו""ר צבי האוזר: << יור >>","אז משרד הביטחון הציג לו""ז לאותו תהליך מתגלגל ו...",Committee,521680,2021-01-04 10:00:00
3,4,<< דובר >> שמואל לטקו: << דובר >>,"קודם כול, למציעים אולי.",Committee,521680,2021-01-04 10:00:00
4,5,"<< יור >> היו""ר צבי האוזר: << יור >>",כן. גם למציעי הדיון. חבר הכנסת פינדרוס.,Committee,521680,2021-01-04 10:00:00
...,...,...,...,...,...,...
27653,612,"<< יור >> היו""ר חיים כץ: << יור >>",בעד – 5,Committee,524300,2021-02-23 10:00:00
27654,613,"<< יור >> היו""ר חיים כץ: << יור >>",נגד – אין,Committee,524300,2021-02-23 10:00:00
27655,614,"<< יור >> היו""ר חיים כץ: << יור >>",נמנעים – אין,Committee,524300,2021-02-23 10:00:00
27656,615,"<< יור >> היו""ר חיים כץ: << יור >>",אושרה.,Committee,524300,2021-02-23 10:00:00


In [38]:
import functools
import swifter 
import re

# Raw strings keep the regex escapes (\( and \s) from being read as invalid
# Python string escapes, which trigger a warning on current interpreters.
# re_remove drops: << ... >> markers, parenthesised text, one literal Hebrew
# abbreviation, and any character that is neither whitespace nor in the א-ת range.
re_remove = re.compile(r'<<.+?>>|\(.+?\)|היו"ר|[^\sא-ת]')
# re_space matches runs of whitespace.
re_space = re.compile(r'\s+')
# re_tags matches << ... >> markers only.
re_tags = re.compile(r'<<.+?>>')

def parse_person(p):
    """Turn one person row into the lookup record used for name matching.

    Reads ``p.CleanedFirstName`` and ``p.CleanedLastName`` and returns a dict
    holding both names, their whitespace-split tokens, the combined
    "first last" string, and the set of all tokens from both names.
    """
    given, family = p.CleanedFirstName, p.CleanedLastName
    given_tokens = given.split()
    family_tokens = family.split()

    return dict(
        first=given,
        first_split=given_tokens,
        last=family,
        last_split=family_tokens,
        full_name=f'{given} {family}',
        parts=set(given_tokens) | set(family_tokens),
    )

@functools.lru_cache(64000)
def name_to_id(name):
    """Resolve a cleaned speaker name to a key of ``parsed_person``.

    Names with fewer than 2 or more than 10 whitespace-separated tokens give
    ``None``. Otherwise three passes run over ``parsed_person`` in order, and
    the first entry satisfying the current pass wins:

    1. every token of ``name`` appears among the entry's name parts;
    2. the entry's last name is one of the tokens and at least one piece of
       its first name is too;
    3. the entry's full name occurs as a substring of ``name``.

    Returns ``None`` when no pass finds a match.
    """
    tokens = name.split()
    if not 2 <= len(tokens) <= 10:
        return None

    def all_tokens_known(entry):
        # Order-insensitive: each token must be one of the entry's parts.
        return all(token in entry['parts'] for token in tokens)

    def last_plus_some_first(entry):
        return entry['last'] in tokens and any(piece in tokens for piece in entry['first_split'])

    def full_name_contained(entry):
        return entry['full_name'] in name

    # Each rule gets a complete pass before a looser rule is tried.
    for rule in (all_tokens_known, last_plus_some_first, full_name_contained):
        for person_id, entry in parsed_person.items():
            if rule(entry):
                return person_id

    return None

def clean_speaker(txt):
    """Normalise a speaker label or name for matching.

    Collapses three doubled Hebrew letters to single ones, strips everything
    matched by ``re_remove``, squeezes whitespace runs to one space, removes
    anything matched by ``re_tags`` and trims the ends. If a step raises, the
    current value of ``txt`` is printed and the exception is re-raised.
    """
    try:
        # Doubled-letter spellings are folded so variants compare equal.
        for doubled, single in (('יי', 'י'), ('וו', 'ו'), ('אא', 'א')):
            txt = txt.replace(doubled, single)
        txt = re_remove.sub('', txt)
        txt = re_space.sub(' ', txt)
        txt = re_tags.sub('', txt).strip()
    except:
        print(txt)
        raise
    return txt
    
# Clean every speaker label once; the cleaned series is reused for the final mapping below
# instead of running clean_speaker over the whole frame a second time.
cleaned_speakers = quotes["Speaker"].swifter.apply(clean_speaker)
speakers = cleaned_speakers.unique()

# Load person table
person = pd.read_sql(
"""
SELECT p.PersonID, p.FirstName, p.LastName FROM KNS_Person p
JOIN KNS_PersonToPosition kptp on kptp.PersonID = p.PersonID
WHERE StartDate > '2000-01-01'
GROUP BY p.PersonID, p.FirstName, p.LastName
ORDER BY MAX(kptp.StartDate) DESC
""", engine)

person['CleanedFirstName'] = person['FirstName'].swifter.apply(clean_speaker)
person['CleanedLastName'] = person['LastName'].swifter.apply(clean_speaker)

# Map extracted speaker name to person ID
parsed_person = {}
for _, p in person.iterrows():
    parsed_person[p.PersonID] = parse_person(p)

name_mapping = {}
total = len(speakers)
print('Speakers:', total)

for i, name in enumerate(speakers):
    try:
        name_mapping[name] = name_to_id(name)
    except Exception as exc:
        # Keep going (unmatched speakers map to None) but say what went wrong.
        print(f'Could not map {name!r}: {exc!r}')
        name_mapping[name] = None
    print(f'{i}, {round((i+1) / total * 100, 2)}%      ', end='\r')

# Apply name mapping
quotes['PersonID'] = cleaned_speakers.apply(lambda x: name_mapping[x])

display(quotes)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=27658.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=512.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=512.0, style=ProgressStyle(description…


Speakers: 616
615, 100.0%      

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=27658.0, style=ProgressStyle(descripti…




Unnamed: 0,Index,Speaker,RawText,$Type,DocumentCommitteeSessionID,StartDate,PersonID
0,1,"<< יור >> היו""ר צבי האוזר: << יור >>",בוקר טוב לכולם. אני מתכבד לפתוח את ישיבת ועדת ...,Committee,521680,2021-01-04 10:00:00,30684.0
1,2,<< דובר >> קטי קטרין שטרית (הליכוד): << דובר >>,נכון.,Committee,521680,2021-01-04 10:00:00,30706.0
2,3,"<< יור >> היו""ר צבי האוזר: << יור >>","אז משרד הביטחון הציג לו""ז לאותו תהליך מתגלגל ו...",Committee,521680,2021-01-04 10:00:00,30684.0
3,4,<< דובר >> שמואל לטקו: << דובר >>,"קודם כול, למציעים אולי.",Committee,521680,2021-01-04 10:00:00,1148.0
4,5,"<< יור >> היו""ר צבי האוזר: << יור >>",כן. גם למציעי הדיון. חבר הכנסת פינדרוס.,Committee,521680,2021-01-04 10:00:00,30684.0
...,...,...,...,...,...,...,...
27653,612,"<< יור >> היו""ר חיים כץ: << יור >>",בעד – 5,Committee,524300,2021-02-23 10:00:00,556.0
27654,613,"<< יור >> היו""ר חיים כץ: << יור >>",נגד – אין,Committee,524300,2021-02-23 10:00:00,556.0
27655,614,"<< יור >> היו""ר חיים כץ: << יור >>",נמנעים – אין,Committee,524300,2021-02-23 10:00:00,556.0
27656,615,"<< יור >> היו""ר חיים כץ: << יור >>",אושרה.,Committee,524300,2021-02-23 10:00:00,556.0


In [39]:
import math
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, parallel_bulk
from datetime import datetime

# Swap NaN for None throughout `quotes`, mutating the frame in place.
# NOTE(review): presumably so missing values are sent as null rather than NaN — confirm.
quotes.replace({math.nan: None}, inplace=True)

# NOTE(review): "ELASTIC_SERVER", 'USERNAME' and 'PASSWORD' look like placeholders to fill in — confirm.
es = Elasticsearch(["ELASTIC_SERVER"], http_auth=('USERNAME', 'PASSWORD'))

def gen():
    """Yield one bulk action dict per row of ``quotes``.

    Each row becomes a plain dict in which the session-id column (chosen by
    the row's ``$Type``) is renamed to ``DocumentID``; ``$Timestamp``,
    ``_index`` and ``_id`` are then added. A running count is printed after
    every 1000th yielded row.
    """
    for count, (_, series) in enumerate(quotes.iterrows(), start=1):
        action = dict(series)
        if action["$Type"] == "Committee":
            id_column = "DocumentCommitteeSessionID"
        else:
            id_column = "DocumentPlenumSessionID"
        action["DocumentID"] = action.pop(id_column)
        action["$Timestamp"] = datetime.now()
        action["_index"] = "quotes"
        # The id combines type, document and paragraph index.
        action["_id"] = f'{action["$Type"]}:{action["DocumentID"]}:{action["Index"]}'
        yield action
        if count % 1000 == 0:
            print(f'{count}          ', end='\r')
        
# Feed the actions produced by gen() through parallel_bulk and print every item reported as failed.
for success, info in parallel_bulk(es, gen()):
    if not success:
        print('failed', info)
        
# Close the client's transport once the bulk loop has finished.
es.transport.close()

27000          