# Web scraping car URLs

In [1]:
# we will scrape urls from a website that are related to cars

import time # import for playing nice and not getting blocked
from urllib.parse import urljoin

import pandas as pd
import plotly.express as px
# Requests for handling HTTP get and other requests
import requests
# from BeautifulSoup4 import BeatifulSoup if installed through pip install BeautifulSoup4
# 
from bs4 import BeautifulSoup

In [2]:
url = "https://www.ss.com/lv/transport/cars/"
# Always pass a timeout: requests waits indefinitely by default,
# so a stalled server would otherwise hang the kernel
response = requests.get(url, timeout=10)
print(response.status_code)

200


In [4]:
# Parse the downloaded HTML with the lxml backend,
# then show the page's <title> tag
soup = BeautifulSoup(markup=response.text, features='lxml')
print(soup.title)

<title>SS.COM Transports - Vieglie auto - Sludinājumi</title>


In [5]:
# The category links live in the table whose align attribute is "center",
# so that attribute is used to locate it; the table is then dumped for inspection
table = soup.find('table', align='center')
print(table)

<table align="center" border="0" cellpadding="0" cellspacing="0" style="padding-left:1px;" width="100%"><tr>
<td align="left" nowrap="" valign="top" width="25%">
<table border="0" cellpadding="0" cellspacing="0">
<tr><td align="left" nowrap=""><h4 class="category" id="sc_99"><a class="a_category" href="/lv/transport/cars/alfa-romeo/" id="ahc_99" title="Alfa Romeo, Sludinājumi">Alfa Romeo</a></h4>
<h4 class="category" id="sc_103"><a class="a_category" href="/lv/transport/cars/audi/" id="ahc_103" title="Audi, Sludinājumi">Audi</a></h4>
<h4 class="category" id="sc_106"><a class="a_category" href="/lv/transport/cars/bmw/" id="ahc_106" title="BMW, Sludinājumi">BMW</a></h4>
<h4 class="category" id="sc_110"><a class="a_category" href="/lv/transport/cars/chevrolet/" id="ahc_110" title="Chevrolet, Sludinājumi">Chevrolet</a></h4>
<h4 class="category" id="sc_111"><a class="a_category" href="/lv/transport/cars/chrysler/" id="ahc_111" title="Chrysler, Sludinājumi">Chrysler</a></h4>
<h4 class="categ

In [7]:
# Collect every <a> element contained in the table and display the list
anchors = table.find_all(name='a')
print(anchors)

[<a class="a_category" href="/lv/transport/cars/alfa-romeo/" id="ahc_99" title="Alfa Romeo, Sludinājumi">Alfa Romeo</a>, <a class="a_category" href="/lv/transport/cars/audi/" id="ahc_103" title="Audi, Sludinājumi">Audi</a>, <a class="a_category" href="/lv/transport/cars/bmw/" id="ahc_106" title="BMW, Sludinājumi">BMW</a>, <a class="a_category" href="/lv/transport/cars/chevrolet/" id="ahc_110" title="Chevrolet, Sludinājumi">Chevrolet</a>, <a class="a_category" href="/lv/transport/cars/chrysler/" id="ahc_111" title="Chrysler, Sludinājumi">Chrysler</a>, <a class="a_category" href="/lv/transport/cars/citroen/" id="ahc_112" title="Citroen, Sludinājumi">Citroen</a>, <a class="a_category" href="/lv/transport/cars/cupra/" id="ahc_292552" title="Cupra, Sludinājumi">Cupra</a>, <a class="a_category" href="/lv/transport/cars/dacia/" id="ahc_75068" title="Dacia, Sludinājumi">Dacia</a>, <a class="a_category" href="/lv/transport/cars/dodge/" id="ahc_116" title="Dodge, Sludinājumi">Dodge</a>, <a class

In [9]:
# Turn each anchor's site-relative href into an absolute URL, one per output line
prefix = 'https://www.ss.com'
urls = [prefix + anchor['href'] for anchor in anchors]
print('\n'.join(urls))

https://www.ss.com/lv/transport/cars/alfa-romeo/
https://www.ss.com/lv/transport/cars/audi/
https://www.ss.com/lv/transport/cars/bmw/
https://www.ss.com/lv/transport/cars/chevrolet/
https://www.ss.com/lv/transport/cars/chrysler/
https://www.ss.com/lv/transport/cars/citroen/
https://www.ss.com/lv/transport/cars/cupra/
https://www.ss.com/lv/transport/cars/dacia/
https://www.ss.com/lv/transport/cars/dodge/
https://www.ss.com/lv/transport/cars/fiat/
https://www.ss.com/lv/transport/cars/ford/
https://www.ss.com/lv/transport/cars/honda/
https://www.ss.com/lv/transport/cars/hyundai/
https://www.ss.com/lv/transport/cars/infiniti/
https://www.ss.com/lv/transport/cars/jaguar/
https://www.ss.com/lv/transport/cars/jeep/
https://www.ss.com/lv/transport/cars/kia/
https://www.ss.com/lv/transport/cars/lancia/
https://www.ss.com/lv/transport/cars/land-rover/
https://www.ss.com/lv/transport/cars/lexus/
https://www.ss.com/lv/transport/cars/mazda/
https://www.ss.com/lv/transport/cars/mercedes/
https://www

In [10]:
# now let us create a function that will take a url and return a list of urls
def get_urls(url, timeout=10):
    """Return the absolute URLs of all links in the page's center-aligned table.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str
        One absolute URL per anchor that has an ``href``. An empty list is
        returned (after printing a short message) when the request fails,
        the status code is not 200, or no center-aligned table is found.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Same convention as the non-200 case: report the problem, return nothing
        print(f"Failed to get {url} - {exc}")
        return []
    # check if the request was successful
    if response.status_code != 200:
        print(f"Failed to get {url} - status code {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'align': 'center'})
    # check if the table was found
    if table is None:
        print(f"No table found in {url}")
        return []
    # urljoin resolves each href against the page URL, so this works for any
    # path layout (not only pages under /lv/) and leaves absolute links intact;
    # href=True skips anchors that carry no href attribute at all
    return [urljoin(url, a['href']) for a in table.find_all('a', href=True)]

In [11]:
# Sanity check: run the helper on the start page and list what it returns
urls = get_urls(url)
print('\n'.join(urls))

https://www.ss.com/lv/transport/cars/alfa-romeo/
https://www.ss.com/lv/transport/cars/audi/
https://www.ss.com/lv/transport/cars/bmw/
https://www.ss.com/lv/transport/cars/chevrolet/
https://www.ss.com/lv/transport/cars/chrysler/
https://www.ss.com/lv/transport/cars/citroen/
https://www.ss.com/lv/transport/cars/cupra/
https://www.ss.com/lv/transport/cars/dacia/
https://www.ss.com/lv/transport/cars/dodge/
https://www.ss.com/lv/transport/cars/fiat/
https://www.ss.com/lv/transport/cars/ford/
https://www.ss.com/lv/transport/cars/honda/
https://www.ss.com/lv/transport/cars/hyundai/
https://www.ss.com/lv/transport/cars/infiniti/
https://www.ss.com/lv/transport/cars/jaguar/
https://www.ss.com/lv/transport/cars/jeep/
https://www.ss.com/lv/transport/cars/kia/
https://www.ss.com/lv/transport/cars/lancia/
https://www.ss.com/lv/transport/cars/land-rover/
https://www.ss.com/lv/transport/cars/lexus/
https://www.ss.com/lv/transport/cars/mazda/
https://www.ss.com/lv/transport/cars/mercedes/
https://www