In [None]:
import datetime
import itertools
import dask.bag as db
import pandas as pd
import json
import s3fs
import requests
from bs4 import BeautifulSoup

In [None]:
# Create a dask.distributed Client with default arguments; the bag
# computation further down runs through it.
# NOTE(review): `progress` is imported but not used in the cells shown.
from dask.distributed import Client,progress
client = Client()

In [None]:
# Bare expression: shows the client's repr as this cell's output.
client

In [None]:
# S3 bucket that holds the hotel attribute file read below and receives the
# scraped output written at the end of the notebook.
bucketname = 'borza-hotelcom-data'

In [None]:
def roomparse(room):
    """Flatten one room listing element into a flat dict of features.

    Parameters
    ----------
    room
        An element exposing ``find``, ``find_all`` and ``[]`` attribute
        access. In this notebook it is one ``li.room`` element produced by
        BeautifulSoup in ``process_date``.

    Returns
    -------
    dict
        Keys are filled in this order; a later key overwrites an earlier one
        of the same name:

        * ``'details_missing': 1`` when the ``div.room-details`` list cannot
          be located;
        * one ``<li text>: 1`` entry per item of that list;
        * one ``<name>: <value>`` entry per ``<input>`` of the room's form
          that carries both attributes;
        * one ``<text>: 1`` entry per ``span.feature-title``;
        * ``room_name``, ``occupancy``, ``cancellation``, ``price``,
          ``price-2`` and ``data-index`` for whichever of them can be
          extracted.

    Notes
    -----
    Extraction is best-effort: a missing piece of markup is skipped rather
    than raised. Only lookup-style errors are treated as "missing", so
    ``KeyboardInterrupt``/``SystemExit`` and unrelated failures propagate
    instead of being silently swallowed.
    """
    # What absent markup looks like here: ``None.attr`` (AttributeError), a
    # missing tag attribute (KeyError), an empty ``split()`` result
    # (IndexError) or subscripting a non-subscriptable value (TypeError).
    missing = (AttributeError, KeyError, IndexError, TypeError)

    roomdict = {}

    # Bullet list of room details; flag its absence explicitly.
    try:
        detli = room.find('div',{'class':'room-details'}).find_all('li')
    except missing:
        detli = []
        roomdict['details_missing'] = 1
    for li in detli:
        roomdict[li.text] = 1

    # Form inputs carry name/value pairs for the room.
    try:
        inputli = room.find('form').find_all('input')
    except missing:
        inputli = []
    for inp in inputli:
        try:
            roomdict[inp['name']] = inp['value']
        except missing:
            # Input without a name or without a value: nothing to record.
            continue

    for feature in room.find_all('span',{'class':'feature-title'}):
        roomdict[feature.text] = 1

    # (extractor, output key) pairs; each one is attempted independently so
    # one missing element does not prevent the others from being recorded.
    extractors = [
        (lambda x: x.find('h3').text, 'room_name'),
        (lambda x: x.find('span',{'class':'occupancy-rate'}).text, 'occupancy'),
        (lambda x: x.find('div',{'class':'cancellation'}).text.split()[0], 'cancellation'),
        (lambda x: x.find('strong',{'class':'current-price'}).text, 'price'),
        (lambda x: x.find('div',{'class':'price'}).text, 'price-2'),
        (lambda x: x['data-index'], 'data-index'),
    ]
    for fun, name in extractors:
        try:
            roomdict[name] = fun(room)
        except missing:
            continue
    return roomdict

def process_date(args, timeout=30):
    """Fetch one hotel's page for a one-night stay and parse its room offers.

    Parameters
    ----------
    args : sequence
        ``(hotelid, date)``. ``date`` must support ``strftime`` and addition
        of a ``datetime.timedelta``; check-out is the day after ``date``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the calling worker indefinitely. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        One dict per ``li.room`` element found in the response: the output of
        ``roomparse`` merged with request-level metadata (``hotelid``,
        ``date``, ``rooms``, ``response_code``, ``url``,
        ``downloaded_date``). On a key clash the metadata wins. The list is
        empty when the page contains no room elements.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    hotelid, date = args[0], args[1]
    chin = date.strftime('%Y-%m-%d')
    chout = (date + datetime.timedelta(days = 1)).strftime('%Y-%m-%d')
    params = {'q-check-out':chout,
              'q-check-in':chin,
              'q-room-0-adults':2,
              'q-rooms':1,
              'q-room-0-children':0,
              'locale':'en_IE'}
    url = 'https://www.hotels.com/ho' + str(hotelid)

    response = requests.get(url, params=params, timeout=timeout)
    rooms = BeautifulSoup(response.content,'html5lib').find_all('li',{'class':'room'})

    # Metadata repeated on every room row so each row is self-describing.
    hotelinfo = {'hotelid':hotelid,
                'date':chin,
                'rooms':len(rooms),
                'response_code':response.status_code,
                'url':response.url,
                'downloaded_date':datetime.date.today().isoformat()}

    # hotelinfo is unpacked last so its keys take precedence over parsed ones.
    return [{**roomparse(room), **hotelinfo} for room in rooms]

In [None]:
def make_dateset(dlist):
    """Concatenate several date ranges into a single Series.

    Each item of ``dlist`` is a dict of keyword arguments for
    ``pd.date_range``. The resulting ranges are stacked in the given order;
    every piece keeps its own 0-based index, so index labels repeat.
    """
    pieces = []
    for range_kwargs in dlist:
        pieces.append(pd.Series(pd.date_range(**range_kwargs)))
    return pd.concat(pieces)

In [None]:
# S3 filesystem handle created with default arguments; used below for both
# reading the hotel attributes and writing the results.
fs = s3fs.S3FileSystem()

In [None]:
# Load the hotel attribute records from the bucket. Later cells iterate over
# `hatts` and read 'hotelid' and (optionally) 'dest-id' from each record.
with fs.open('%s/atts-v0.json' % bucketname, 'r') as f:
    hatts = json.load(f)

In [None]:
# Number of hotel records per destination id (records lacking 'dest-id'
# contribute None). value_counts orders the result by descending count by
# default, which is what makes `.head(n)` below mean "the n most common".
top_cities = pd.Series(
    [hotel.get('dest-id') for hotel in hatts]
).value_counts()

In [None]:
# Check-in date windows to scrape, written as (start, end) pairs and handed
# to pd.date_range through make_dateset.
date_list = make_dateset([
    {'start': window_start, 'end': window_end}
    for window_start, window_end in (
        ('2019-06-20', '2019-06-27'),
        ('2019-07-10', '2019-07-17'),
        ('2019-08-15', '2019-08-22'),
        ('2019-09-08', '2019-09-15'),
        ('2019-11-20', '2019-11-27'),
        ('2019-12-20', '2020-01-03'),
    )
])

In [None]:
# Destination ids of the 20 most common destinations. Computed once here:
# previously `top_cities.head(20).index` was rebuilt for every hotel inside
# the comprehension although it never changes.
top_city_ids = top_cities.head(20).index

# Ids of all hotels located in one of those destinations.
hotel_list = pd.Series([h['hotelid'] for h in hatts if h.get('dest-id',None) in top_city_ids])

In [None]:
# Random subsets to keep the scrape small: 5% of the dates, 1% of the hotels.
# NOTE(review): no random_state is passed, so the selection is presumably
# different on every run — confirm that is intended, or pin a seed.
date_sample = date_list.sample(frac=0.05)
hotel_sample = hotel_list.sample(frac=0.01)

In [None]:
# Every (hotelid, date) combination of the two samples. itertools.product
# returns a one-shot iterator, so it is materialised into a list: otherwise
# re-running the cell that consumes `input_args` would see it exhausted.
input_args = list(itertools.product(hotel_sample,date_sample))

In [None]:
# Wrap the (hotelid, date) pairs in a dask bag and map process_date over
# them; the results are collected by the compute() call in the next cell.
# NOTE(review): persist() is applied to the input bag, before the map, so it
# covers only the argument pairs, not the scraped results — confirm intent.
input_bag = db.from_sequence(input_args).persist().map(process_date)

In [None]:
# Run the bag and collect the results. Each element comes from one
# process_date call, which returns a list of room dicts.
out = input_bag.compute()

In [None]:
# Write the collected results to the bucket as JSON; the object name embeds
# today's date in ISO format.
with fs.open('%s/dates-v0-%s.json' % (bucketname,datetime.date.today().isoformat()), 'w') as f:
    json.dump(out,f)