# Setup

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import sqlite3 as sql
import pandas as pd
import time
from itertools import islice

### Defining functions that output car links for a given search

In [2]:
def getSoup(link):
    r = requests.get(link)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

In [3]:
def getAllLinks(link):
    soup = getSoup(link)
    tds = soup.findAll('div', {'class':'cldt-summary-titles'})
    return ['https://www.autoscout24.com/' + td.find('a')['href'] for td in tds]

# Generating Search Links

In [4]:
brands = ['audi','bmw', 'ford', 'mercedes-benz', 'opel', 'volkswagen', 'renault', '9ff', 'abarth', 'ac', 'acm', 'acura', 'aixam', 'alfa-romeo', 'alpina', 'alpine', 'amphicar', 'ariel-motor', 'artega',
'aspid', 'aston-martin', 'austin', 'autobianchi', 'auverland', 'baic', 'bedford', 'bellier', 'bentley', 'bolloré', 'borgward', 'brilliance', 'bugatti', 'buick', 'byd', 'cadillac', 'caravans-wohnm', 'casalini',
'caterham', 'changhe', 'chatenet', 'chery', 'chevrolet', 'chrysler', 'citroen', 'cityel', 'cmc', 'corvette', 'courb', 'cupra', 'dacia', 'daewoo', 'daf', 'daihatsu', 'daimler', 'dangel', 'de-tomaso',
'derways', 'dfsk', 'dodge', 'donkervoort', 'dr-motor', 'ds-automobiles', 'dutton', 'e.go', 'estrima', 'ferrari', 'fiat', 'fisker', 'gac-gonow', 'galloper', 'gaz', 'geely', 'gem', 'gemballa', 'genesis',
'gillet', 'giotti-victoria', 'gmc', 'goupil', 'great-wall', 'grevac', 'haima', 'hamann', 'haval', 'honda', 'hummer', 'hurtan', 'hyundai', 'infiniti', 'innocenti', 'iso-rivolta', 'isuzu', 'iveco', 'izh'
'jaguar', 'jeep', 'karabag', 'kia', 'koenigsegg', 'ktm', 'lada', 'lamborghini', 'lancia', 'land-rover', 'ldv', 'lexus', 'lifan', 'ligier', 'lincoln', 'lotus', 'mahindra', 'man', 'mansory', 'martin-motors', 'maserati', 'maxus', 'maybach',
'mazda', 'mclaren', 'melex', 'mg', 'microcar', 'minauto', 'mini', 'mitsubishi', 'mitsuoka', 'morgan', 'moskvich', 'mp-lafer', 'mpm-motors', 'nio', 'nissan', 'oldsmobile', 'oldtimer', 'pagani',
'panther-westwinds', 'peugeot', 'pgo', 'piaggio', 'plymouth', 'polestar', 'pontiac', 'proton', 'puch', 'qoros', 'qvale', 'ram', 'regis', 'reliant', 'renault', 'rolls-royce', 'rover', 'ruf', 'saab',
'santana', 'savel', 'sdg', 'seat', 'shuanghuan', 'skoda', 'smart', 'speedart', 'spyker', 'ssangyong', 'streetscooter', 'subaru', 'suzuki', 'tagaz', 'talbot', 'tasso', 'tata', 'tazzari-ev', 'techart', 'tesla',
'town-life', 'toyota', 'trabant', 'triumph', 'tvr', 'uaz', 'vanderhall', 'vaz', 'vem', 'volvo', 'vortex', 'wallys', 'wartburg', 'westfield', 'wiesmann', 'zastava', 'zaz', 'zhidou', 'zotye', 'others']

### Selecting brands with more than 10k cars for all geographies and prices ranges combined

In [5]:
brandselection = ['audi','bmw', 'ford', 'mercedes-benz', 'opel', 'volkswagen', 'alfa-romeo', 'citroen', 'dacia', 'fiat', 'honda', 'hyundai', 'jaguar', 'jeep', 'kia', 'lancia', 'land-rover', 'mazda', 'mini', 'mitsubishi', 'nissan',
'peugeot', 'renault', 'seat', 'skoda', 'smart', 'suzuki', 'toyota', 'volvo']

In [6]:
allbrandlinks = []
for brand in brandselection:
        brandlinks = 'https://www.autoscout24.com/lst/' + brand
        allbrandlinks.append(brandlinks) 
                
allbrandlinks

['https://www.autoscout24.com/lst/audi',
 'https://www.autoscout24.com/lst/bmw',
 'https://www.autoscout24.com/lst/ford',
 'https://www.autoscout24.com/lst/mercedes-benz',
 'https://www.autoscout24.com/lst/opel',
 'https://www.autoscout24.com/lst/volkswagen',
 'https://www.autoscout24.com/lst/alfa-romeo',
 'https://www.autoscout24.com/lst/citroen',
 'https://www.autoscout24.com/lst/dacia',
 'https://www.autoscout24.com/lst/fiat',
 'https://www.autoscout24.com/lst/honda',
 'https://www.autoscout24.com/lst/hyundai',
 'https://www.autoscout24.com/lst/jaguar',
 'https://www.autoscout24.com/lst/jeep',
 'https://www.autoscout24.com/lst/kia',
 'https://www.autoscout24.com/lst/lancia',
 'https://www.autoscout24.com/lst/land-rover',
 'https://www.autoscout24.com/lst/mazda',
 'https://www.autoscout24.com/lst/mini',
 'https://www.autoscout24.com/lst/mitsubishi',
 'https://www.autoscout24.com/lst/nissan',
 'https://www.autoscout24.com/lst/peugeot',
 'https://www.autoscout24.com/lst/renault',
 'htt

In [7]:
countrylist = ['A', 'B', 'D', 'E', 'F', 'I', 'L', 'NL']

In [9]:
allbrandcountrylinks = []
countries = ['A']
for country in countries:
    for link in allbrandlinks:
        brandcountrylinks = link + '?sort=price&desc=0&ustate=N%2CU&size=20&cy=' + country
        allbrandcountrylinks.append(brandcountrylinks) 
                
allbrandcountrylinks

['https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/bmw?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/ford?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/mercedes-benz?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/opel?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/volkswagen?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/alfa-romeo?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/citroen?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/dacia?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/fiat?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/honda?sort=price&desc=0&ustate=N%2CU&size=20&cy=A',
 'https://www.autoscout24.com/lst/hyunda

In [10]:
allbrandcountrypricelinks = []
prices = np.arange(0, 100000, 50).tolist()
for price in prices:
    for link in allbrandcountrylinks:
        brandcountrylinks = link + '&pricefrom=' + str(price) + '&priceto=' + str(price+49)
        allbrandcountrypricelinks.append(brandcountrylinks) 
                
allbrandcountrypricelinks

['https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/bmw?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/ford?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/mercedes-benz?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/opel?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/volkswagen?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/alfa-romeo?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/citroen?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'https://www.autoscout24.com/lst/dacia?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49',
 'ht

In [11]:
allbrandcountrypricepagelinks = []
pages = range(1,21)
for link in allbrandcountrypricelinks:
    for page in pages:
        brandcountrylinks = link + '&page=' + str(page)
        allbrandcountrypricepagelinks.append(brandcountrylinks) 
                         
allbrandcountrypricepagelinks

['https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=1',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=2',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=3',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=4',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=5',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=6',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=7',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy=A&pricefrom=0&priceto=49&page=8',
 'https://www.autoscout24.com/lst/audi?sort=price&desc=0&ustate=N%2CU&size=20&cy

In [12]:
len(allbrandcountrypricepagelinks)

1160000

### Saving searchlinks in SQL

In [16]:
database = 'Austriandatabase.db'
connection = sql.connect(database)

In [381]:
searchlinks = pd.DataFrame(allbrandcountrypricepagelinks)
searchlinks.to_sql('Austriansearchlinksupto100k', connection)

# Scrape and save carlinks by batches

### Read in search links from database

In [17]:
query = '''SELECT * from Austriansearchlinksupto100k'''
Austriansearchlinksupto100k = pd.read_sql_query(query, connection).iloc[:,1].values.tolist()

In [18]:
len(Austriansearchlinksupto100k)

1160000

### Scrape the searchlinks for carlinks and store batches

In [67]:
start = time.time()
allcarlinks = []
tracker = 0
iterator = iter(Austriansearchlinksupto100k)
for link in iterator:
    carlinks = getAllLinks(link)
    tracker = tracker + 1
    if not not carlinks:
        allcarlinks.extend(carlinks)
    if link[-2:] == '=2' and len(carlinks) !=20:
        next(islice(iterator, 17, 18), None)
        tracker = tracker + 18
    if tracker % 1000 == 0:
        autolinks = pd.DataFrame(allcarlinks)
        autolinks.to_sql('Austrianautolinksupto100k', connection, if_exists= 'append')
        allcarlinks = []
        
autolinks = pd.DataFrame(allcarlinks)
autolinks.to_sql('Austrianautolinksupto100k', connection, if_exists= 'append')
allcarlinks = []    
                
print(tracker)
end = time.time()
print(end - start)

[]
1160000
4573.749811172485


In [68]:
tracker

1160000