# A tutorial to scrape the web.

This example scrapes the BBC weather website for any specific city, and collects weather forecast for the next 14 days and saves it as a csv file.

*Web scraping might not be legal always. It is a good idea to check the terms of the website you plan to scrape before proceeding. Also, if your code requests a url from a server multiple times, it is a good practice to either cache your requests, or insert a timed delay between consecutive requests.*

In [2]:
import json                   # to convert API to json format

from urllib.parse import urlencode #cool way to pass location code for a city

import requests               # to get the webpage
from bs4 import BeautifulSoup # to parse the webpage line by line

import pandas as pd
import re                     # regular expression operators

from datetime import datetime

We now GET the webpage of interest, from the server

In [4]:
required_city = "Mumbai"
location_url = 'https://locator-service.api.bbci.co.uk/locations?' + urlencode({
   'api_key': 'AGbFAKx58hyjQScCXIYrxuEwJh2W2cmv',
   's': required_city,
   'stack': 'aws',
   'locale': 'en',
   'filter': 'international',
   'place-types': 'settlement,airport,district',
   'order': 'importance',
   'a': 'true',
   'format': 'json'
})
location_url

'https://locator-service.api.bbci.co.uk/locations?api_key=AGbFAKx58hyjQScCXIYrxuEwJh2W2cmv&s=Mumbai&stack=aws&locale=en&filter=international&place-types=settlement%2Cairport%2Cdistrict&order=importance&a=true&format=json'

In [5]:
result = requests.get(location_url).json()
result

{'response': {'results': {'results': [{'id': '1275339',
     'name': 'Mumbai',
     'container': 'India',
     'containerId': 1269750,
     'language': 'en',
     'timezone': 'Asia/Kolkata',
     'country': 'IN',
     'latitude': 19.07283,
     'longitude': 72.88261,
     'placeType': 'settlement'}],
   'totalResults': 1}}}

In [7]:
# url      = 'https://www.bbc.com/weather/1275339' # url to BBC weather, corresponding to a specific city (Mumbai, in this example)
url      = 'https://www.bbc.com/weather/'+result['response']['results']['results'][0]['id']
response = requests.get(url)

Next, we initiate an instance of  BeautifulSoup.

In [8]:
soup = BeautifulSoup(response.content,'html.parser')

The information we want (daily high and low temp., and daily weather summary), are in specific blocks on the webpage.
We need to find the block type, type of identifier, and the identifier name (all these can be figured out by right clicking
on the webpage and selecting 'Inspect' on the Chrome browser; similar modus operandi for other browsers)

In [9]:
daily_high_values = soup.find_all('span', attrs={'class': 'wr-day-temperature__high-value'}) # block-type: span; identifier type: class; and class name: wr-day-temperature__high-value
daily_high_values

[<span class="wr-day-temperature__high-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">28°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">82°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">28°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">82°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">28°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">83°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">28°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">83°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature"><span class="w

In [10]:
daily_low_values  = soup.find_all('span', attrs={'class': 'wr-day-temperature__low-value'})
daily_low_values

[<span class="wr-day-temperature__low-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">25°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">77°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">25°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">78°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">26°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">78°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature"><span class="wr-value--temperature--c">26°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">78°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature"><span class="wr-val

In [15]:
daily_summary = soup.find('div', attrs={'class': 'wr-day-summary'})
daily_summary

<div class="wr-day-summary"><div class="gel-wrap"><span class="">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span></div></div>

In [16]:
daily_summary.text

'Thundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breeze'

General book keeping.

With the code snippet in the cell above, we get forecast data for 14 days, including today. We will now post process the data to first extract the required information/text and discard all the html wrapper code, then combine all variables into one common list, and finally convert it into a pandas data frame.

In [26]:
#Day 1 temperature
#daily_high_values[0]
#daily_high_values[0].text
daily_high_values[0].text.strip()

'28° 82°'

In [14]:
daily_high_values[5].text.strip()

'29° 83°'

In [28]:
daily_high_values[0].text.strip().split()[0]

'28°'

In [29]:
daily_high_values_list = [daily_high_values[i].text.strip().split()[0] for i in range(len(daily_high_values))]
daily_high_values_list

['28°',
 '28°',
 '28°',
 '28°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°',
 '29°']

In [35]:
daily_low_values_list = [daily_low_values[i].text.strip().split()[0] for i in range(len(daily_low_values))]
daily_low_values_list

['25°',
 '25°',
 '26°',
 '26°',
 '26°',
 '25°',
 '25°',
 '25°',
 '25°',
 '25°',
 '25°',
 '25°',
 '25°',
 '25°']

In [30]:
daily_summary.text

'Thundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breeze'

In [31]:
daily_summary_list = re.findall('[a-zA-Z][^A-Z]*', daily_summary.text) #split the string on uppercase
daily_summary_list

['Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze']

In [32]:
datelist = pd.date_range(datetime.today(), periods=len(daily_high_values)).tolist()
datelist

[Timestamp('2024-07-20 07:17:25.663406'),
 Timestamp('2024-07-21 07:17:25.663406'),
 Timestamp('2024-07-22 07:17:25.663406'),
 Timestamp('2024-07-23 07:17:25.663406'),
 Timestamp('2024-07-24 07:17:25.663406'),
 Timestamp('2024-07-25 07:17:25.663406'),
 Timestamp('2024-07-26 07:17:25.663406'),
 Timestamp('2024-07-27 07:17:25.663406'),
 Timestamp('2024-07-28 07:17:25.663406'),
 Timestamp('2024-07-29 07:17:25.663406'),
 Timestamp('2024-07-30 07:17:25.663406'),
 Timestamp('2024-07-31 07:17:25.663406'),
 Timestamp('2024-08-01 07:17:25.663406'),
 Timestamp('2024-08-02 07:17:25.663406')]

In [33]:
datelist = [datelist[i].date().strftime('%y-%m-%d') for i in range(len(datelist))]
datelist

['24-07-20',
 '24-07-21',
 '24-07-22',
 '24-07-23',
 '24-07-24',
 '24-07-25',
 '24-07-26',
 '24-07-27',
 '24-07-28',
 '24-07-29',
 '24-07-30',
 '24-07-31',
 '24-08-01',
 '24-08-02']

In [36]:
zipped = zip(datelist, daily_high_values_list, daily_low_values_list, daily_summary_list)

In [38]:
df = pd.DataFrame(list(zipped), columns=['Date', 'High','Low', 'Summary'])

In [39]:
display(df)

Unnamed: 0,Date,High,Low,Summary


In [None]:
# remove the 'degree' character
df.High = df.High.replace('\°','',regex=True).astype(float)
df.Low  = df.Low.replace('\°','',regex=True).astype(float)

In [None]:
display(df)

Unnamed: 0,Date,High,Low,Summary
0,24-06-01,33.0,28.0,Sunny and a gentle breeze
1,24-06-02,34.0,27.0,Drizzle and a gentle breeze
2,24-06-03,33.0,28.0,Light rain showers and a gentle breeze
3,24-06-04,33.0,28.0,Light rain showers and a gentle breeze
4,24-06-05,33.0,27.0,Light rain showers and a gentle breeze
5,24-06-06,33.0,27.0,Light rain and a gentle breeze
6,24-06-07,33.0,28.0,Sunny intervals and a gentle breeze
7,24-06-08,33.0,28.0,Light rain showers and a gentle breeze
8,24-06-09,33.0,27.0,Thundery showers and a gentle breeze
9,24-06-10,33.0,28.0,Thundery showers and a gentle breeze


Extract the name of the city for which data is gathered.

In [41]:
#location = soup.find('div', attrs={'class':'wr-c-location'})
location = soup.find('h1', attrs={'id':'wr-location-name-id'})
location.text.split()



In [42]:
# create a recording
filename_csv = location.text.split()[0]+'.csv'
df.to_csv(filename_csv, index=None)

In [47]:
filename_xlsx = location.text.split()[0]+'.xlsx'
df.to_excel(filename_xlsx)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     0 non-null      object
 1   High     0 non-null      object
 2   Low      0 non-null      object
 3   Summary  0 non-null      object
dtypes: object(4)
memory usage: 124.0+ bytes
