In [94]:
'''
Web scraping using Requests and BeautifulSoup.
    Aim: show why we should not hand-parse ("ninja") HTML.
    Design: demonstrate the common scraping pattern, which we do not have to
    follow because the site's network traffic exposes discoverable APIs
    (see bilbase_api.py),
    e.g. https://www.bilbasen.dk/api/newest?page=1&pagesize=100
'''

from collections import defaultdict
import json
import re


from bs4 import BeautifulSoup
from requests import Session
import pandas as pd


        
# Site root and the search endpoint path; requests below build URLs from these.
BASE_URL = 'https://www.biltorvet.dk'
SEARCH = 'Search/SearchResult'

# Headers attached to every request made through the shared session:
# a desktop-Chrome User-Agent string and a JSON content type.
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/39.0.2171.95 Safari/537.36',
    ]),
    'Content-Type': 'application/json',
}

# A single Session is reused by the later cells so the headers apply to each call.
session = Session()
session.headers.update(headers)

In [95]:
# Search criterion: ProductType == "1".
# NOTE(review): the meaning of product type "1" is not shown here - confirm.
product_type_criterion = {
    "type": "ProductType",
    "range": None,
    "value": "1",
}

# Result presentation: no explicit sorting, "Tile" view style.
filter_object = {
    "sortingObject": None,
    "viewStyleObject": {"viewStyle": "Tile"},
}

# Request body for the search endpoint. Key order is kept as originally written.
payload = {
    "searchCriterias": [product_type_criterion],
    "filterObject": filter_object,
    "pageNumber": 2,
    "searchOrigin": 0,
    # "viewId": "5bbcaa8c-59e9-476d-83a1-a5c21945d4a6",
    "shouldRedirect": False,
    "save": True,
}

In [96]:
# Alternative request body kept for reference only (it is NOT sent):
# payload = {"filterObject":
#                {"sortingObject":
#                     {"sort":"Asc","order":"Alphabetic","isOpen":false},
#                 "viewStyleObject":
#                     {"viewStyle":"tile"}
#                },
#            "searchOrigin":0,
#            "viewId":"5bbcaa8c-59e9-476d-83a1-a5c21945d4a6"}

# POST the search as a JSON body. `json=` lets requests serialise the payload
# itself instead of hand-encoding it with json.dumps.
# The timeout (seconds) stops the cell from hanging forever if the server
# never answers; requests applies no timeout by default.
r = session.post(f'{BASE_URL}/{SEARCH}', json=payload, timeout=30)

# Fail loudly on 4xx/5xx instead of parsing an error page further down.
r.raise_for_status()

In [97]:
# Show the textual reason phrase of the search response's HTTP status.
r.reason

'OK'

In [98]:
# Parse the raw bytes of the search response with the lxml parser.
soup = BeautifulSoup(markup=r.content, features='lxml')

In [99]:
# Collect every element that has a CSS class containing the substring "content".
content_class = re.compile('content')
data = soup.find_all(class_=content_class)

In [46]:
# Text of the first descendant of the first hit whose class contains "price".
price_tag = data[0].find(class_=re.compile('price'))
price_tag.get_text(strip=True)

'166.500 kr.'

In [45]:
# Text fragments under the first "text"-classed descendant of the first hit,
# joined with newlines and then split back into a list.
text_tag = data[0].find(class_=re.compile('text'))
text_tag.get_text(strip=True, separator='\n').split('\n')

['166.500 kr.', 'Mazda 2', '1,5 Skyactiv-G Sky 90HK 5d 6g']

In [54]:
# Text fragments under the first "details"-classed descendant of the first hit.
details_tag = data[0].find(class_=re.compile('details'))
details_tag.get_text(strip=True, separator='\n').split('\n')

['Næstved', '1.000', '2020']

In [66]:
# "alt" attribute of the first <img> inside the first hit's parent element.
hit_parent = data[0].parent
hit_parent.img['alt']

'Mazda 2 1,5 Skyactiv-G Sky 90HK 5d 6g'

In [68]:
# "title" attribute of the first <img> inside the first hit's parent element.
hit_parent = data[0].parent
hit_parent.img['title']

'Mazda 2 1,5 Skyactiv-G Sky 90HK 5d 6g'

In [71]:
# All text fragments of the first hit's parent element, one list entry each.
hit_parent = data[0].parent
hit_parent.get_text(strip=True, separator='\n').split('\n')

['166.500 kr.',
 'Mazda 2',
 '1,5 Skyactiv-G Sky 90HK 5d 6g',
 'Næstved',
 '1.000',
 '2020']

In [92]:
# Site-relative link carried by the first hit's parent element.
hit_parent = data[0].parent
hit_parent['href']

'/bil/mazda/2/1-5-skyactiv-g-sky-90hk-5d-6g/1526250'

In [104]:
# Fetch the detail page of the first search hit. That page carries an
# application/ld+json <script> block, which the next cell parses.

# The href is site-relative (it starts with "/"), so it is appended to BASE_URL.
car_url = data[0].parent['href']

# The timeout (seconds) stops the cell from hanging forever if the server
# never answers; requests applies no timeout by default.
car = session.get(f'{BASE_URL}{car_url}', timeout=30)

# Fail loudly on 4xx/5xx instead of parsing an error page.
car.raise_for_status()
print(car)
car_soup = BeautifulSoup(car.content, 'lxml')

<Response [200]>


In [128]:
# Take the first <script type="application/ld+json"> tag on the detail page
# and decode its first child node as JSON.
ld_json_scripts = car_soup.select('script[type="application/ld+json"]')
first_ld_json = ld_json_scripts[0]
json.loads(first_ld_json.contents[0])

{'@context': 'https://schema.org',
 '@type': 'Car',
 'url': 'https://www.biltorvet.dk/bil/peugeot/307/1-6-110hk-stc/1503952',
 'image': 'https://picture.biltorvet.dk/product/0af48d875a7b49e29cc9ec4b37155c46.jpg?pid=1508709&width=564&height=320&mode=crop&scale=both',
 'brand': {'@context': 'https://schema.org',
  '@type': 'Brand',
  'logo': 'https://picture.biltorvet.dk/img/maerkelogo/peugeot.png?height=80',
  'name': 'Peugeot',
  'url': 'https://www.biltorvet.dk//peugeot'},
 'offers': {'@context': 'https://schema.org',
  '@type': 'Offer',
  'priceCurrency': 'DKK',
  'price': '9900'},
 'vehicleEngine': {'@context': 'https://schema.org',
  '@type': 'EngineSpecification',
  'engineDisplacement': {'@type': 'QuantitativeValue',
   'value': '1587',
   'unitCode': 'CMQ'},
  'enginePower': {'@type': 'QuantitativeValue',
   'value': '110',
   'unitCode': 'N12'},
  'fuelType': 'Benzin'},
 'name': 'Peugeot 307 1,6 110HK Stc',
 'description': '¤ Ved kommissionssalg er prisen lav, vi har ikke samme

# How I found things

In [58]:
# Dump the indented search-result markup so class names can be looked up by eye.
search_markup = soup.prettify()
print(search_markup)

<html>
 <body>
  <div class="search-result" data-url="/soeg/16053787" id="d4ef792f-5855-4507-88de-56af32052768">
   <script type="text/javascript">
    $(document).ready(function () { new SearchResult('d4ef792f-5855-4507-88de-56af32052768', false); })
   </script>
   <div class="layout--grey">
    <div class="layout--overflow-container">
     <div class="search-result__header">
      <span class="search-result__header-text">
       Din søgning gav 31.370 resultater
      </span>
      <span class="search-result__sub-text">
      </span>
     </div>
    </div>
   </div>
   <script type="text/javascript">
    $(document).ready(function () { new AdsInFocus('1caa5efb-87a8-477c-b749-5172ac9a6981'); });
   </script>
   <div class="layout--overflow-container layout--lightest-grey scroll-container" id="1caa5efb-87a8-477c-b749-5172ac9a6981">
    <h4 class="ads-in-focus__title">
     Annoncer i fokus
    </h4>
    <div class="scroll-container__arrow scroll-container__arrow--left js-arrow-left">


In [105]:
# Dump the indented detail-page markup for manual inspection.
car_markup = car_soup.prettify()
print(car_markup)

<!DOCTYPE html>
<html lang="da">
 <head>
  <meta content="width=device-width, initial-scale=1, user-scalable=0" name="viewport"/>
  <meta charset="utf-8"/>
  <script data-culture="DA" id="CookieConsent" src="https://policy.app.cookieinformation.com/uc.js" type="text/javascript">
  </script>
  <script>
   dataLayer = [{ gaUserId: '19eac7931825b2bbfdbe4527ed228bd6' }];
  </script>
  <script>
   var gtmId = 'GTM-WVTJ4W';
  </script>
  <script>
   window.addEventListener('CookieInformationConsentGiven', function (event) {
            if (CookieInformation.getConsentGivenFor('cookie_cat_marketing')) {

                (function (w, d, s, l, i) {
                    w[l] = w[l] || [];
                    w[l].push({
                        'gtm.start':
                            new Date().getTime(),
                        event: 'gtm.js'
                    });
                    var f = d.getElementsByTagName(s)[0],
                        j = d.createElement(s),
                       