In [143]:
import requests
from pyquery import PyQuery as pq
from dataflows import Flow, printer, update_resource, dump_to_path, add_field, delete_fields, set_type

def get_doc():
    """Download the Ministry of Health spokesman-messages page and parse it.

    Returns:
        A PyQuery document built from the text of the HTTP response.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never parsed as if it were the real document.
        requests.Timeout: if the server does not respond within the timeout.
    """
    res = requests.get(
        'https://govextra.gov.il/ministry-of-health/corona/corona-virus/spokesman-messages-corona/',
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=60,
    )
    res.raise_for_status()
    return pq(res.text)

def get_cards():
    """Yield one dict per ``.card`` element of the downloaded page.

    Each dict has three keys:
        is_patient: True for the first card and for every card that shares
            the first card's parent element, False otherwise.
        header: text of the card's ``.card-header`` element.
        body: serialized markup of the card's ``.card-body`` element.
    """
    first_parent = None
    for index, element in enumerate(get_doc().find('.card')):
        if index == 0:
            # The first card fixes which container is treated as the patients list.
            first_parent = element.getparent()
        wrapped = pq(element)
        header_node = wrapped.find('.card-header')
        body_node = wrapped.find('.card-body')
        yield {
            'is_patient': index == 0 or element.getparent() == first_parent,
            'header': pq(header_node).text(),
            'body': str(pq(body_node)),
        }

# Pipeline 1: scrape all cards, declare the column types and dump them to
# data/MOHPatients/all_cards.
Flow(
    # Source rows: one dict per card (keys: is_patient, header, body).
    get_cards(),
    set_type('is_patient', type='boolean'),
    set_type('header', type='string'),
    set_type('body', type='string'),
    # Rename resource 'res_1' (presumably the default name given to the
    # generator source - TODO confirm) and set its output file name.
    update_resource('res_1', name='all_cards', path='all_cards.csv'),
    # Preview the rows as an HTML table in the cell output.
    printer(tablefmt='html', num_rows=1),
    dump_to_path('data/MOHPatients/all_cards')
# Element [1] of the process() result is displayed as the cell output (the
# row-count / bytes / hash dict shown below the table).
).process()[1]

#,is_patient (boolean),header (string),body (string)
1,True,"חולים 332, 325, 333, 332, 343 - 19/03/2020",...
2,True,"חולים 490, 357, 373, 387, 418, 400, 356, 347, 367, 338, 378 - 19/03/2020, 19:00",...
...,,,
172,False,אזהרת מסע כוללת לאיטליה 27.02.20 - 09:50,...


{'count_of_rows': 172,
 'bytes': 556979,
 'hash': '05ba6b7919ca2c1306d22d32e543d0e7',
 'dataset_name': None}

In [145]:
import re, datetime
from dataflows import filter_rows

# Translation table that deletes the Hebrew letters listed here (regular and
# final forms) so that only digits, commas and whitespace remain.
_HEBREW_LETTERS = str.maketrans('', '', 'קראטוןםפשדגכעיחלךףזסבהנמצתץ')


def try_nums(numstrings):
    """Parse a comma-separated string of integers, ignoring Hebrew letters.

    Args:
        numstrings: text such as ``'332, 325, 333'`` or ``'114'``; Hebrew
            letters anywhere in the text are removed before parsing.

    Returns:
        A non-empty list of ints when every comma-separated piece is a valid
        integer, otherwise ``False`` (callers test the result against False).
    """
    cleaned = numstrings.translate(_HEBREW_LETTERS)
    try:
        # split(',') always yields at least one piece, and an empty piece makes
        # int() raise, so a successful parse is never an empty list.
        return [int(piece.strip()) for piece in cleaned.split(',')]
    except ValueError:
        return False

def parse_patients(row):
    """Extract patient numbers and the publication date from a card header.

    The row is modified in place: ``row['nums']`` receives the list of patient
    numbers and ``row['date']`` a ``datetime`` (or None when the header holds
    no ``d/m/Y`` date). The parameter must stay named ``row``; dataflows
    presumably dispatches per-row processors by that name - TODO confirm.

    Args:
        row: mapping with a ``'header'`` string such as
            ``'חולים 332, 325 - 19/03/2020, 19:00'``.

    Raises:
        Exception: ``'invalid header: ...'`` when the header lacks the patient
            marker or no patient number can be parsed from it.
        ValueError: from ``strptime`` when a matched date/time is not valid.
    """
    header = row['header']
    parts = header.split('חול')
    if len(parts) < 2:
        # No patient marker at all: report it like any other bad header
        # instead of failing with an unexplained IndexError.
        raise Exception('invalid header: ' + header)
    numstrings = parts[1].split('-')[0].replace('ה ', '').replace('ים ', '')

    # Try the whole number list first, then progressively smaller prefixes.
    nums = try_nums(numstrings)
    if nums is False:
        nums = try_nums(numstrings.split(',')[0])
    if nums is False:
        nums = try_nums(numstrings.split(' ')[0])
    if nums is False:
        raise Exception('invalid header: ' + header)

    # Expected shape of the trailing timestamp: 15/03/2020, 10:30
    date_match = re.search('([0-9]+/[0-9]+/[0-9]+)', header)
    time_match = re.search('([0-9]+:[0-9]+)', header)
    datestring = date_match[0] if date_match is not None else None
    timestring = time_match[0] if time_match is not None else None

    if datestring and timestring:
        dt = datetime.datetime.strptime('%s %s' % (datestring, timestring), '%d/%m/%Y %H:%M')
    elif datestring:
        dt = datetime.datetime.strptime(datestring, '%d/%m/%Y')
    else:
        # A time without a date is not enough to build a datetime.
        dt = None
    row['date'] = dt
    row['nums'] = nums
    

# `load` is not imported by this cell or any earlier one (only a later cell
# imports it), so import it here to keep the notebook runnable top to bottom
# on a fresh kernel.
from dataflows import load

# Pipeline 2: keep only the patient cards, parse patient numbers and the date
# out of each header, and dump the result to data/MOHPatients/patient_nums.
Flow(
    load('data/MOHPatients/all_cards/datapackage.json'),
    filter_rows(lambda row: row['is_patient']),
    # Declare the two columns that parse_patients fills in.
    add_field('date', type='datetime'),
    add_field('nums', type='array'),
    parse_patients,
    delete_fields(['is_patient']),
    update_resource('all_cards', name='patient_nums', path='patient_nums.csv'),
    printer(tablefmt='html', num_rows=1),
    dump_to_path('data/MOHPatients/patient_nums')
).process()[1]

#,header (string),body (string),date (datetime),nums (array)
1,"חולים 332, 325, 333, 332, 343 - 19/03/2020",...,2020-03-19 00:00:00,"[332, 325, 333, 332, 343]"
2,"חולים 490, 357, 373, 387, 418, 400, 356, 347, 367, 338, 378 - 19/03/2020, 19:00",...,2020-03-19 19:00:00,"[490, 357, 373, 387, 418, 400, 356, 347, 367, 338, 378]"
...,,,,
144,"חולה 114 - 13/03/2020, 10:30",...,2020-03-13 10:30:00,[114]


{'count_of_rows': 144,
 'bytes': 504664,
 'hash': '9a94668ba99e0b7b53dfc673e3b3de2a',
 'dataset_name': None}

In [175]:
from dataflows import load, delete_fields

def parse_headers(rows):
    """Debugging helper: split the first row's card body into per-patient sections.

    For the first row only (the loop ends with ``break``) it prints the row's
    header, date and nums, splits the children of the ``.card-body`` element
    into (header, content) sections, groups those sections by patient number
    and prints the grouping.

    Raises:
        Exception: when a section header starting with 'חול' contains a number
            that is not listed in ``row['nums']``.

    NOTE(review): the function never yields and returns None, and it does not
    set the 'main_header' / 'content' fields - it looks unfinished; confirm
    before using it as a rows processor.
    """
    for row in rows:
        print(row['header'])
        print(row['date'])
        print(row['nums'])
        print('^^^')
        # State for the section currently being collected.
        header = None
        content = []
        child_rows = []
        for child in pq(pq(row['body'])('.card-body')).children():
            child_text = pq(child).text().strip()
            # Skip children that carry no text.
            if child_text.strip() == '': continue
            # A child whose inner HTML contains a <strong> tag starts a new section.
            if '<strong' in pq(child).html().lower():
                if header is not None:
                    child_rows.append({'header': header, 'content': "\n".join(content)})
                header = child_text
                # Content collected before the first section header is dropped here.
                content = []
            else:
                content.append(pq(child).text())
        # Flush the last open section.
        if header is not None:
            child_rows.append({'header': header, 'content': "\n".join(content)})
        # Maps patient number -> list of sections; key -1 collects sections
        # seen before any patient header.
        patient_nums = {}
        cur_patient_num = None
        for child_row in child_rows:
            if child_row['header'].strip().startswith('חול'):
                # The section is attached to every number in its header;
                # cur_patient_num ends up as the last of them.
                for patient_num in re.findall(r'[0-9]+', child_row['header']):
                    if int(patient_num) in row['nums']:
                        cur_patient_num = int(patient_num)
                        patient_nums.setdefault(cur_patient_num, []).append(child_row)
                    else:
                        raise Exception('Invalid patient num: ' + patient_num)                    
            # NOTE(review): truthiness test - a patient number of 0 would be
            # treated as "no current patient".
            elif cur_patient_num:
                patient_nums[cur_patient_num].append(child_row)
            else:
                patient_nums.setdefault(-1, []).append(child_row)
                cur_patient_num = None
        # Dump the grouping for inspection.
        for num, crows in patient_nums.items():
            print(num)
            for crow in crows:
                print(crow)
            print('----')
        # Only the first row is processed.
        break
        
        
       
# Pipeline 3 (work in progress): the Flow is only constructed here; the
# .process() call is commented out, so nothing is loaded or executed.
Flow(
    load('data/MOHPatients/patient_nums/datapackage.json'),
    add_field('main_header', type='string'),
    add_field('content', type='string'),
    parse_headers,
    delete_fields(['body']),
    printer(tablefmt='html')
)  # .process()[1]

<dataflows.base.flow.Flow at 0x7fc9f6f2cdd8>