In [1]:
import numpy as np
import pandas as pd
from requests import Session

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '\
                         'AppleWebKit/537.36 (KHTML, like Gecko) '\
                         'Chrome/75.0.3770.80 Safari/537.36',
          'Content-Type': 'application/json;charset=UTF-8'}

# Add headers
httpx = Session()
httpx.headers.update(headers)

In [3]:
URL = 'https://home.dk/umbraco/backoffice/home-api/Search'
params = dict(CurrentPageNumber=0, 
              SearchResultsPerPage=200)

In [4]:
r = httpx.get(URL, params=params)

In [5]:
data = r.json()

In [6]:
df = pd.DataFrame(data['searchResults'])
data['searchResultsPerPage'], data['totalSearchResults']

(200, 52338)

In [7]:
loops = data['totalSearchResults']/data['searchResultsPerPage']
loops, np.ceil(loops).astype(int)

(261.69, 262)

In [8]:
df.head()

Unnamed: 0,aabenthusNicename,aabenthusShowRegistration,adresse,andenmaegler,billedeUrl,boligKanLejes,boligOrGrundAreal,boligurl,city,ejendomstypePrimaerNicename,...,lng,openHouseEndDate,openHouseStartDate,overskrift2,pictures,postal,price,sagsnummer,showNewPrice,solgtBolig
0,,False,Sæbyvej 30,False,https://home.mindworking.eu/resources/shops/82...,0,148.0,https://home.dk/boligkatalog/broenderslev/9340...,Asaa,Villa,...,10.409491,,,Meget pæn tilstandsrapport!,"[{'PicId': 3357022, 'CaseId': 10535472, 'CaseN...",9340,648.000,8240000127,False,False
1,,False,Blåbærvej 8,False,https://home.mindworking.eu/resources/shops/62...,0,97.0,https://home.dk/boligkatalog/vejen/6600/huse-v...,Vejen,Villa,...,9.149376,,,Dejlig sydvendt have med hyggelig terrasse med...,"[{'PicId': 3025962, 'CaseId': 10402314, 'CaseN...",6600,845.000,6220000154,False,False
2,,False,"Kirstensvej 14, Lyngså",False,https://home.mindworking.eu/resources/shops/81...,0,75.0,https://home.dk/boligkatalog/frederikshavn/930...,Sæby,Fritidshus,...,10.539909,,,Sdr. Klit - Lyngså,"[{'PicId': 3348839, 'CaseId': 10536965, 'CaseN...",9300,845.000,8100000262,False,False
3,,False,Rørvigvej 115,False,https://home.mindworking.eu/resources/shops/21...,0,91.0,https://home.dk/boligkatalog/odsherred/4500/hu...,Nykøbing Sj,Villa,...,11.676701,,,Hyggeligt og velbygget hus på stor grund,"[{'PicId': 1788919, 'CaseId': 10003326, 'CaseN...",4500,995.000,219V00482,False,False
4,,False,"Enighedsvej 1, 2.",False,https://home.mindworking.eu/resources/shops/13...,0,70.0,https://home.dk/boligkatalog/helsingoer/3000/e...,Helsingør,Lejlighed,...,12.601889,,,Skøn villalejlighed kun få minutter fra Helsin...,"[{'PicId': 3350182, 'CaseId': 10535792, 'CaseN...",3000,1.350.000,1380000352,False,False


In [9]:
# Gearing up

In [10]:
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

In [11]:
class Home(ABC):

    def __init__(self, url, headers=None):
        
        session = Session()

        self.BASE_URL = url
        

        if headers is None:
            headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'),
                    'Content-Type': 'application/json'}

        session.headers.update(headers)

        self.session = session
        self.meta_data = None
        self.store =  pd.DataFrame()

    
    def __repr__(self):
        return f'{self.__class__.__name__}(API={repr(self.BASE_URL)})'

    
    @abstractmethod
    def get_page(self, *args, **kwargs):
        pass
    
    
    def get_pages(self, *args, **kwargs):
        pass

In [12]:
class HomeRecent(Home):
    
    '''
    expects Base URL
    e.g. 
        url = 'https://home.dk/umbraco/backoffice/home-api/Search'      
    '''

    def get_page(self, page=0, pagesize=100 ,verbose=False):
        '''Gather Data From Home API
            page:int page number. default value 0
            pagesize:int number of boligs in a page. default value 100
            verbose:bool print mining progress. default value False
        '''
        
        params = {'CurrentPageNumber':page,
                 'SearchResultsPerPage':pagesize,
                 }
        

        r = self.session.get(self.BASE_URL, params=params)

        if r.ok:
            data = r.json()
            
            self.store = self.store.append(
                    pd.DataFrame(data.get('searchResults')), ignore_index=True)
            self.max_pages = loops = np.ceil(
                                data['totalSearchResults']/data['searchResultsPerPage']
                            ).astype(int)

        else:
            self.store
            
        if verbose:
            print(f'[+] Gathering data from page {page:}.{" ":>5}Found {len(self.store):>5} estates'
                 f'{" ":>3}Time {datetime.now().strftime("%d-%m-%Y %H:%M:%S")}')

        return self

    
    def get_pages(self, start_page=0, end_page=None, pagesize=100, workers=4, verbose=False):
        '''
         Parallel Gathering Data From Home
            start_page:int page number to start. default value 0
            end_page:int page number to stop. default value None
            pagesize:int number of boligs per page. default valeu 100
            verbose:bool print mining progress. default value False
        '''
        
        # Make the first call to get total number of pages for split call pagesize split
        
        self.get_page(page=start_page, pagesize=pagesize, verbose=verbose)
        
        if end_page is None:
            total_pages = self.max_pages
        else:
            total_pages = start_page + end_page + 1
        
        # since we got the first page, we can get the rest
        
        if start_page <= total_pages:
            start_page += 1

            func = lambda pages: [self.get_page(page, pagesize, verbose=verbose) for page in pages]
            pages_split = np.array_split(np.arange(start_page,total_pages+1), workers)
        
            with ThreadPoolExecutor(max_workers=workers) as executor:
                _ = [executor.submit(func,split) for split in pages_split]
        
        return self

In [13]:
# Play time

In [14]:
homes = HomeRecent(url='https://home.dk/umbraco/backoffice/home-api/Search')
homes

HomeRecent(API='https://home.dk/umbraco/backoffice/home-api/Search')

In [15]:
 # one call at a time
print('[+] Start single thread calls\n')
_ = [homes.get_page(page=page, pagesize=100, verbose=True) for page in range(0,10)]

[+] Start single thread calls

[+] Gathering data from page 0.     Found    99 estates   Time 11-05-2020 15:25:01
[+] Gathering data from page 1.     Found   198 estates   Time 11-05-2020 15:25:01
[+] Gathering data from page 2.     Found   294 estates   Time 11-05-2020 15:25:02
[+] Gathering data from page 3.     Found   392 estates   Time 11-05-2020 15:25:02
[+] Gathering data from page 4.     Found   486 estates   Time 11-05-2020 15:25:03
[+] Gathering data from page 5.     Found   585 estates   Time 11-05-2020 15:25:03
[+] Gathering data from page 6.     Found   681 estates   Time 11-05-2020 15:25:04
[+] Gathering data from page 7.     Found   777 estates   Time 11-05-2020 15:25:04
[+] Gathering data from page 8.     Found   875 estates   Time 11-05-2020 15:25:05
[+] Gathering data from page 9.     Found   970 estates   Time 11-05-2020 15:25:05


In [16]:
df = homes.store
print(f'Data Stored {df.shape[0]} rows\n')

Data Stored 970 rows



In [17]:
df.head(2)

Unnamed: 0,aabenthusNicename,aabenthusShowRegistration,adresse,andenmaegler,billedeUrl,boligKanLejes,boligOrGrundAreal,boligurl,city,ejendomstypePrimaerNicename,...,lng,openHouseEndDate,openHouseStartDate,overskrift2,pictures,postal,price,sagsnummer,showNewPrice,solgtBolig
0,,False,Sæbyvej 30,False,https://home.mindworking.eu/resources/shops/82...,0,148.0,https://home.dk/boligkatalog/broenderslev/9340...,Asaa,Villa,...,10.409491,,,Meget pæn tilstandsrapport!,"[{'PicId': 3357022, 'CaseId': 10535472, 'CaseN...",9340,648.0,8240000127,False,False
1,,False,Blåbærvej 8,False,https://home.mindworking.eu/resources/shops/62...,0,97.0,https://home.dk/boligkatalog/vejen/6600/huse-v...,Vejen,Villa,...,9.149376,,,Dejlig sydvendt have med hyggelig terrasse med...,"[{'PicId': 3025962, 'CaseId': 10402314, 'CaseN...",6600,845.0,6220000154,False,False


In [18]:
df.duplicated(['boligurl']).sum()

0

In [19]:
homes.max_pages

524

In [20]:
# multipe calls at once
workers = 5
print(f'[+] Start {workers} threads calls\n')
homes.get_pages(start_page=10, end_page=25, pagesize=100, workers=workers, verbose=True)

[+] Start 5 threads calls

[+] Gathering data from page 10.     Found  1070 estates   Time 11-05-2020 15:25:06
[+] Gathering data from page 11.     Found  1168 estates   Time 11-05-2020 15:25:06
[+] Gathering data from page 12.     Found  1268 estates   Time 11-05-2020 15:25:07
[+] Gathering data from page 13.     Found  1367 estates   Time 11-05-2020 15:25:07
[+] Gathering data from page 14.     Found  1466 estates   Time 11-05-2020 15:25:08
[+] Gathering data from page 17.     Found  1563 estates   Time 11-05-2020 15:25:08
[+] Gathering data from page 18.     Found  1662 estates   Time 11-05-2020 15:25:09
[+] Gathering data from page 19.     Found  1761 estates   Time 11-05-2020 15:25:09
[+] Gathering data from page 20.     Found  1860 estates   Time 11-05-2020 15:25:10
[+] Gathering data from page 21.     Found  1959 estates   Time 11-05-2020 15:25:10
[+] Gathering data from page 15.     Found  2058 estates   Time 11-05-2020 15:25:10
[+] Gathering data from page 27.     Found  2158 

HomeRecent(API='https://home.dk/umbraco/backoffice/home-api/Search')

In [21]:
print(f'Data Stored {homes.store.shape[0]} rows\n')

Data Stored 3642 rows

