In [165]:
import requests, datetime, time
from pyquery import PyQuery as pq
from dataflows import Flow, printer, dump_to_path, sort_rows


def get_messages(before_id=None, timeout=30):
    """Yield the messages found on one page of the MOHreport Telegram web preview.

    Args:
        before_id: when truthy, appended to the URL as ``?before=<id>`` to
            request an older page (presumably messages with ids below it --
            confirm against the Telegram preview behaviour).
        timeout: seconds to wait for the HTTP response before giving up.

    Yields:
        dict with keys ``id`` (int), ``date`` (datetime parsed together with
        its UTC offset), ``text`` (inner HTML of the text parts, with
        ``<br/>`` turned into newlines) and ``images`` (comma separated
        image URLs, empty string when there are none).

    Raises:
        requests.HTTPError: when the page is not served successfully.
        AssertionError: when a message does not contain exactly one date
            element or exactly one bubble element.
    """
    url = 'https://t.me/s/MOHreport'
    if before_id:
        url += '?before=' + str(before_id)
    print('loading ' + url)
    response = requests.get(url, timeout=timeout)
    # An error page contains no '[data-post]' elements, so without this check
    # a failed request would be indistinguishable from an empty page.
    response.raise_for_status()
    for message in pq(response.text)('[data-post]'):
        message_id = int(message.attrib['data-post'].replace('MOHreport/', ''))
        date_elts = message.find_class('tgme_widget_message_date')
        assert len(date_elts) == 1
        date_elt = date_elts[0]
        message_datetime = next(date_elt.iterchildren()).attrib['datetime']
        # Drop the last ':' (the one inside the UTC offset, e.g. '+00:00')
        # so the value can be parsed with '%z'.
        head, _, tail = message_datetime.rpartition(':')
        message_datetime = datetime.datetime.strptime(head + tail, '%Y-%m-%dT%H:%M:%S%z')
        content_elts = message.find_class('tgme_widget_message_bubble')
        assert len(content_elts) == 1
        content_elt = content_elts[0]
        message_htmls = []
        image_urls = []
        for child in content_elt.iterchildren():
            child_classes = list(child.classes)
            if 'tgme_widget_message_text' in child_classes:
                message_htmls.append(pq(child).html())
            elif 'tgme_widget_message_photo_wrap' in child_classes:
                # The photo URL is only available inside the inline CSS:
                # background-image:url('...')
                image_urls.append(child.attrib['style'].split("url('")[1].split("'")[0])
        message_html = "<br/><br/>".join(message_htmls)
        message_text = message_html.replace('<br/>', "\n")
        image_urls = ",".join(image_urls)
        yield {'id': message_id, 'date': message_datetime, 'text': message_text, 'images': image_urls}

        
def get_all_messages(min_message_id=2525, delay=.1, fetch_page=None):
    """Yield messages page by page, walking backwards through the channel.

    Each page is requested with the lowest message id seen so far, so every
    request asks for messages older than the ones already yielded.

    Args:
        min_message_id: stop once the lowest id seen is less than or equal
            to this value.
        delay: seconds to sleep between page requests.
        fetch_page: callable taking the ``before_id`` and returning an
            iterable of message dicts (each with an ``id`` key). Defaults to
            ``get_messages``.

    Yields:
        The message dicts produced by ``fetch_page``, in the order received.
    """
    if fetch_page is None:
        fetch_page = get_messages
    last_message_id = None
    num_messages = 0
    while True:
        if last_message_id and last_message_id <= min_message_id:
            break
        previous_last_id = last_message_id
        for message in fetch_page(last_message_id):
            if not last_message_id or message['id'] < last_message_id:
                last_message_id = message['id']
            yield message
            num_messages += 1
            # Checked per message: page sizes vary, so a check at page
            # boundaries would almost never land on a multiple of 500.
            if num_messages % 500 == 0:
                print('Loaded ' + str(num_messages) + ' messages..')
        if last_message_id == previous_last_id:
            # The page brought no older message (empty page or start of the
            # channel); requesting the same URL again would loop forever.
            break
        print('sleeping ' + str(delay) + ' seconds..')
        time.sleep(delay)


# Scraping pipeline: stream every message from get_all_messages(), sort the
# rows by their 'date' field in reverse order, print a preview table and
# write the result under data/MOHReport (a later cell reads
# data/MOHReport/datapackage.json from there).
# The value returned by .process() is the last expression of the cell, so it
# is displayed as the cell output.
Flow(
    get_all_messages(),
    sort_rows('{date}', reverse=True),
    printer(tablefmt='html', num_rows=1),
    dump_to_path('data/MOHReport')
).process()

loading https://t.me/s/MOHreport
loading https://t.me/s/MOHreport?before=3085
loading https://t.me/s/MOHreport?before=3065
loading https://t.me/s/MOHreport?before=3045
loading https://t.me/s/MOHreport?before=3025


loading https://t.me/s/MOHreport?before=3004
loading https://t.me/s/MOHreport?before=2984
loading https://t.me/s/MOHreport?before=2964
loading https://t.me/s/MOHreport?before=2943
loading https://t.me/s/MOHreport?before=2922
loading https://t.me/s/MOHreport?before=2902
loading https://t.me/s/MOHreport?before=2882
loading https://t.me/s/MOHreport?before=2862
loading https://t.me/s/MOHreport?before=2841
loading https://t.me/s/MOHreport?before=2821
loading https://t.me/s/MOHreport?before=2800
loading https://t.me/s/MOHreport?before=2779
loading https://t.me/s/MOHreport?before=2759
loading https://t.me/s/MOHreport?before=2738
loading https://t.me/s/MOHreport?before=2717
loading https://t.me/s/MOHreport?before=2697
loading https://t.me/s/MOHreport?before=2677
loading https://t.me/s/MOHreport?before=2657
loading https://t.me/s/MOHreport?before=2635
loading https://t.me/s/MOHreport?before=2615
loading https://t.me/s/MOHreport?before=2593
loading https://t.me/s/MOHreport?before=2569
loading ht

#,id (integer),date (datetime),text (string),images (string),Unnamed: 5,Unnamed: 6
1,3104.0,2020-03-15 18:08:55,חולה 187- עדכון מקום שהייה (8.3) 3103,2020-03-15 17:45:47,"הודעה לציבור- חולה קורונה- חולה מספר 205 >>>>>> יום א', 15.3, 19:00 החולה בשנות ה- 60 לחייה, ממרכז ...",
...,,,,,,
520,2511.0,2020-01-30 12:10:52,,,,


(<datapackage.package.Package at 0x7f768439f048>,
 {'count_of_rows': 520,
  'bytes': 562595,
  'hash': '62ae5e2748ed5f7801e97bcd8e7c5c6e',
  'dataset_name': None})

In [106]:
import os
import getpass

# Base URL of the CKAN instance that receives the scraped messages.
CKAN_URL = 'https://www.odata.org.il'
# Use the key from the environment when it is set; otherwise prompt for it
# without echoing. Either way CKAN_API_KEY ends up defined, so the upload
# cell does not hit a NameError when the environment variable exists.
CKAN_API_KEY = os.environ.get('CKAN_API_KEY') or getpass.getpass('CKAN_API_KEY')

CKAN_API_KEY ····································


In [166]:
from dataflows import load
import json

# Read back the datapackage written by the scraping cell.
# NOTE(review): results()[0][0] is presumably the list of rows of the first
# resource (the code below iterates it as dict rows) -- confirm against the
# dataflows documentation.
data = Flow(
    load('data/MOHReport/datapackage.json')
).results()[0][0]

def format_row(row):
    """Prepare one row for upload, modifying it in place.

    The ``date`` value is replaced by its ``%Y-%m-%dT%H:%M:%S`` string form
    and a falsy ``images`` value is replaced by an empty string.

    Returns:
        The same ``row`` object that was passed in.
    """
    timestamp = row['date']
    row['date'] = timestamp.strftime('%Y-%m-%dT%H:%M:%S')
    row['images'] = row['images'] or ''
    return row

# Datastore resource that receives the messages.
RESOURCE_ID = 'ce4c9482-cd3a-485b-af56-d3d7118a7552'


def ckan_datastore_action(action, payload, print_body=False):
    """POST ``payload`` as JSON to a CKAN action endpoint and check the status.

    Args:
        action: action name appended to ``CKAN_URL + '/api/3/action/'``.
        payload: JSON-serialisable request body.
        print_body: also print the response text on success.

    Returns:
        The ``requests`` response object.

    Raises:
        AssertionError: when the status code is not 200; the response text is
            attached so the failure can be diagnosed.
    """
    res = requests.post(
        CKAN_URL + '/api/3/action/' + action,
        json=payload,
        headers={'Authorization': CKAN_API_KEY},
    )
    print(res.status_code)
    if print_body:
        print(res.text)
    assert res.status_code == 200, res.text
    return res


records = [format_row(row) for row in data]

print(records[0])

# Create the datastore table for the resource with 'id' as primary key, so
# the upsert below can match existing rows by 'id'.
res = ckan_datastore_action('datastore_create', {
    'resource_id': RESOURCE_ID,
    'force': True,
    'primary_key': ['id'],
}, print_body=True)

# Send all records with method 'upsert'; the response body is not printed.
res = ckan_datastore_action('datastore_upsert', {
    'resource_id': RESOURCE_ID,
    'records': records,
    'method': 'upsert',
    'force': True
})

{'id': 3104, 'date': '2020-03-15T18:08:55', 'text': 'חולה 187- עדכון מקום שהייה (8.3)\n<a href="https://t.me/MOHreport/3060" target="_blank" rel="noopener">https://t.me/MOHreport/3060</a>', 'images': ''}
200
{"help": "https://www.odata.org.il/api/3/action/help_show?name=datastore_create", "success": true, "result": {"method": "insert", "primary_key": ["id"], "resource_id": "ce4c9482-cd3a-485b-af56-d3d7118a7552"}}
200
