In [1]:
import numpy as np
import pandas as pd
from requests import Session

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '\
                         'AppleWebKit/537.36 (KHTML, like Gecko) '\
                         'Chrome/75.0.3770.80 Safari/537.36',
          'Content-Type': 'application/json;charset=UTF-8'}

# Add headers
httpx = Session()
httpx.headers.update(headers)

In [3]:
URL = 'https://home.dk/umbraco/backoffice/home-api/Search'
params = dict(CurrentPageNumber=0, 
              SearchResultsPerPage=200)

In [4]:
r = httpx.get(URL, params=params)

In [5]:
r

<Response [200]>

In [6]:
data = r.json()

In [7]:
df = pd.DataFrame(data['searchResults'])
data['searchResultsPerPage'], data['totalSearchResults']

(200, 52340)

In [8]:
loops = data['totalSearchResults']/data['searchResultsPerPage']
loops, np.ceil(loops).astype(int)

(261.7, 262)

In [9]:
# Gearing up

In [10]:
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

In [11]:
class Bolig(ABC):

    def __init__(self, url, headers=None):
        
        session = Session()

        self.BASE_URL = url
        

        if headers is None:
            headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'),
                    'Content-Type': 'application/json'}

        session.headers.update(headers)

        self.session = session
        self.meta_data = None
        self.store =  pd.DataFrame()

    
    def __repr__(self):
        return f'{self.__class__.__name__}(API={repr(self.BASE_URL)})'

    
    @abstractmethod
    def get_page(self, *args, **kwargs):
        pass
    
    
    @abstractmethod
    def get_pages(self, *args, **kwargs):
        pass

In [12]:
class Home(Bolig):
    
    '''
    expects Base URL
    e.g. 
        url = 'https://home.dk/umbraco/backoffice/home-api/Search'      
    '''

    def get_page(self, page=0, pagesize=100 ,verbose=False):
        '''Gather Data From Home API
            page:int page number. default value 0
            pagesize:int number of boligs in a page. default value 100
            verbose:bool print mining progress. default value False
        '''
        
        params = {'CurrentPageNumber':page,
                 'SearchResultsPerPage':pagesize,
                 }
        

        r = self.session.get(self.BASE_URL, params=params)

        if r.ok:
            data = r.json()
            
            self.store = self.store.append(
                    pd.DataFrame(data.get('searchResults')), ignore_index=True)
            self.max_pages = loops = np.ceil(
                                data['totalSearchResults']/data['searchResultsPerPage']
                            ).astype(int)

        else:
            self.store
            
        if verbose:
            print(f'[+] Gathering data from page {page:}.{" ":>5}Found {len(self.store):>5} estates'
                 f'{" ":>3}Time {datetime.now().strftime("%d-%m-%Y %H:%M:%S")}')

        return self

    
    def get_pages(self, start_page=0, end_page=None, pagesize=100, workers=4, verbose=False):
        '''
         Parallel Gathering Data From Home
            start_page:int page number to start. default value 0
            end_page:int page number to stop. default value None
            pagesize:int number of boligs per page. default valeu 100
            verbose:bool print mining progress. default value False
        '''
        
        # Make the first call to get total number of pages for split call pagesize split
        
        self.get_page(page=start_page, pagesize=pagesize, verbose=verbose)
        
        if end_page is None:
            total_pages = self.max_pages
        else:
            total_pages = start_page + end_page + 1
        
        # since we got the first page, we can get the rest
        
        if start_page <= total_pages:
            start_page += 1

            func = lambda pages: [self.get_page(page, pagesize, verbose=verbose) for page in pages]
            pages_split = np.array_split(np.arange(start_page,total_pages+1), workers)
        
            with ThreadPoolExecutor(max_workers=workers) as executor:
                _ = [executor.submit(func,split) for split in pages_split]
        
        return self

In [13]:
# Play time

In [14]:
homes = Home(url='https://home.dk/umbraco/backoffice/home-api/Search')
homes

Home(API='https://home.dk/umbraco/backoffice/home-api/Search')

In [15]:
 # one call at a time
print('[+] Start single thread calls\n')
_ = [homes.get_page(page=page, pagesize=15, verbose=True) for page in range(0,10)]

[+] Start single thread calls

[+] Gathering data from page 0.     Found    15 estates   Time 11-05-2020 16:33:17
[+] Gathering data from page 1.     Found    30 estates   Time 11-05-2020 16:33:17
[+] Gathering data from page 2.     Found    45 estates   Time 11-05-2020 16:33:17
[+] Gathering data from page 3.     Found    60 estates   Time 11-05-2020 16:33:18
[+] Gathering data from page 4.     Found    75 estates   Time 11-05-2020 16:33:18
[+] Gathering data from page 5.     Found    90 estates   Time 11-05-2020 16:33:18
[+] Gathering data from page 6.     Found   104 estates   Time 11-05-2020 16:33:18
[+] Gathering data from page 7.     Found   119 estates   Time 11-05-2020 16:33:18
[+] Gathering data from page 8.     Found   134 estates   Time 11-05-2020 16:33:19
[+] Gathering data from page 9.     Found   148 estates   Time 11-05-2020 16:33:19


In [16]:
df = homes.store
print(f'Data Stored {df.shape[0]} rows\n')

Data Stored 148 rows



In [17]:
df.head(2)

Unnamed: 0,aabenthusNicename,aabenthusShowRegistration,adresse,andenmaegler,billedeUrl,boligKanLejes,boligOrGrundAreal,boligurl,city,ejendomstypePrimaerNicename,...,lng,openHouseEndDate,openHouseStartDate,overskrift2,pictures,postal,price,sagsnummer,showNewPrice,solgtBolig
0,,False,Sæbyvej 30,False,https://home.mindworking.eu/resources/shops/82...,0,148,https://home.dk/boligkatalog/broenderslev/9340...,Asaa,Villa,...,10.409491,,,Meget pæn tilstandsrapport!,"[{'PicId': 3357022, 'CaseId': 10535472, 'CaseN...",9340,648.0,8240000127,False,False
1,,False,Blåbærvej 8,False,https://home.mindworking.eu/resources/shops/62...,0,97,https://home.dk/boligkatalog/vejen/6600/huse-v...,Vejen,Villa,...,9.149376,,,Dejlig sydvendt have med hyggelig terrasse med...,"[{'PicId': 3025962, 'CaseId': 10402314, 'CaseN...",6600,845.0,6220000154,False,False


In [18]:
df.duplicated(['boligurl']).sum()

0

In [19]:
# df = df.drop_duplicates(subset=['boligurl'])

In [20]:
homes.max_pages

3490

In [21]:
# multipe calls at once
workers = 5
print(f'[+] Start {workers} threads calls\n')
homes.get_pages(start_page=10, end_page=25, pagesize=15, workers=workers, verbose=True)

[+] Start 5 threads calls

[+] Gathering data from page 10.     Found   163 estates   Time 11-05-2020 16:33:19
[+] Gathering data from page 11.     Found   178 estates   Time 11-05-2020 16:33:19
[+] Gathering data from page 17.     Found   193 estates   Time 11-05-2020 16:33:20
[+] Gathering data from page 18.     Found   208 estates   Time 11-05-2020 16:33:21
[+] Gathering data from page 19.     Found   223 estates   Time 11-05-2020 16:33:22
[+] Gathering data from page 12.     Found   238 estates   Time 11-05-2020 16:33:22
[+] Gathering data from page 13.     Found   253 estates   Time 11-05-2020 16:33:23
[+] Gathering data from page 14.     Found   268 estates   Time 11-05-2020 16:33:24
[+] Gathering data from page 15.     Found   282 estates   Time 11-05-2020 16:33:25
[+] Gathering data from page 16.     Found   295 estates   Time 11-05-2020 16:33:26
[+] Gathering data from page 20.     Found   309 estates   Time 11-05-2020 16:33:27
[+] Gathering data from page 21.     Found   324 

Home(API='https://home.dk/umbraco/backoffice/home-api/Search')

In [22]:
print(f'Data Stored {homes.store.shape[0]} rows\n')

Data Stored 540 rows



In [23]:
# We can use the base class to get other apis

In [24]:
class BoligaRecent(Bolig):
    
    '''
    
    Request URL: https://api.boliga.dk/api/v2/search/results?pageSize=50&page=2

    expects Base URL
    e.g. 
        url = 'https://api.boliga.dk/api/v2/search/results'      
    '''

    def get_page(self, page=0, pagesize=100 ,verbose=False):
        '''Gather Data From Home API
            page:int page number. default value 0
            pagesize:int number of boligs in a page. default value 100
            verbose:bool print mining progress. default value False
        '''
        
        params = {'page':page,
                 'pageSize':pagesize,
                 }
        

        r = self.session.get(self.BASE_URL, params=params)

        if r.ok:
            data = r.json()
            
            self.store = self.store.append(
                    pd.DataFrame(data.get('results')), ignore_index=True)
            self.max_pages = data.get('totalPages')

        else:
            self.store
            
        if verbose:
            print(f'[+] Gathering data from page {page:}.{" ":>5}Found {len(self.store):>5} estates'
                 f'{" ":>3}Time {datetime.now().strftime("%d-%m-%Y %H:%M:%S")}')

        return self

    
    def get_pages(self, start_page=0, end_page=None, pagesize=100, workers=4, verbose=False):
        '''
         Parallel Gathering Data From Home
            start_page:int page number to start. default value 0
            end_page:int page number to stop. default value None
            pagesize:int number of boligs per page. default valeu 100
            verbose:bool print mining progress. default value False
        '''
        
        # Make the first call to get total number of pages for split call pagesize split
        
        self.get_page(page=start_page, pagesize=pagesize, verbose=verbose)
        
        if end_page is None:
            total_pages = self.max_pages
        else:
            total_pages = start_page + end_page + 1
        
        # since we got the first page, we can get the rest
        
        if start_page <= total_pages:
            start_page += 1

            func = lambda pages: [self.get_page(page, pagesize, verbose=verbose) for page in pages]
            pages_split = np.array_split(np.arange(start_page,total_pages+1), workers)
        
            with ThreadPoolExecutor(max_workers=workers) as executor:
                _ = [executor.submit(func,split) for split in pages_split]
        
        return self

In [25]:
boliga = BoligaRecent(url='https://api.boliga.dk/api/v2/search/results')

In [26]:
boliga.get_page(page=1, verbose=True)

[+] Gathering data from page 1.     Found   100 estates   Time 11-05-2020 16:33:42


BoligaRecent(API='https://api.boliga.dk/api/v2/search/results')

In [27]:
_ = [boliga.get_page(page=page, verbose=True) for page in range(1,5)]

[+] Gathering data from page 1.     Found   200 estates   Time 11-05-2020 16:33:43
[+] Gathering data from page 2.     Found   300 estates   Time 11-05-2020 16:33:44
[+] Gathering data from page 3.     Found   400 estates   Time 11-05-2020 16:33:45
[+] Gathering data from page 4.     Found   500 estates   Time 11-05-2020 16:33:46


[BoligaRecent(API='https://api.boliga.dk/api/v2/search/results'),
 BoligaRecent(API='https://api.boliga.dk/api/v2/search/results'),
 BoligaRecent(API='https://api.boliga.dk/api/v2/search/results'),
 BoligaRecent(API='https://api.boliga.dk/api/v2/search/results')]

In [28]:
len(boliga.store)

500

In [29]:
boliga.get_pages(start_page=6,end_page=10, pagesize=200, workers=6, verbose=True)

[+] Gathering data from page 6.     Found   700 estates   Time 11-05-2020 16:33:47
[+] Gathering data from page 9.     Found   900 estates   Time 11-05-2020 16:33:48
[+] Gathering data from page 15.     Found  1100 estates   Time 11-05-2020 16:33:49
[+] Gathering data from page 13.     Found  1100 estates   Time 11-05-2020 16:33:49
[+] Gathering data from page 11.     Found  1300 estates   Time 11-05-2020 16:33:49
[+] Gathering data from page 7.     Found  1500 estates   Time 11-05-2020 16:33:49
[+] Gathering data from page 17.     Found  1700 estates   Time 11-05-2020 16:33:49
[+] Gathering data from page 10.     Found  1900 estates   Time 11-05-2020 16:33:50
[+] Gathering data from page 16.     Found  2100 estates   Time 11-05-2020 16:33:50
[+] Gathering data from page 12.     Found  2300 estates   Time 11-05-2020 16:33:50
[+] Gathering data from page 14.     Found  2500 estates   Time 11-05-2020 16:33:50
[+] Gathering data from page 8.     Found  2700 estates   Time 11-05-2020 16:33

BoligaRecent(API='https://api.boliga.dk/api/v2/search/results')

In [30]:
len(boliga.store)

2700

In [31]:
dt = boliga.store

In [32]:
dt.dtypes

agentDisplayName            object
agentRegId                   int64
area                         int64
basementSize                 int64
buildYear                    int64
city                        object
createdDate                 object
dawaId                      object
daysForSale                  int64
domainId                     int64
downPayment                  int64
energyClass                 object
exp                          int64
floor                      float64
groupKey                    object
guid                        object
id                           int64
images                      object
inWatchlist                   bool
isActive                      bool
isForeclosure                 bool
isPremiumAgent                bool
itemType                     int64
latitude                   float64
longitude                  float64
lotSize                      int64
municipality                 int64
net                          int64
openHouse           