In [1]:
# Importing BeautifulSoup for parsing HTML and XML documents
from bs4 import BeautifulSoup 
# Importing the requests library to send HTTP requests and receive the response
import requests

In [2]:
# Defining the URL of the page to be scraped.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_Kingdom'
# Using requests to fetch the content of the URL.
# The timeout stops the cell from hanging indefinitely if the server never responds.
web = requests.get(url, timeout=30)
# Raise immediately on an HTTP error status (4xx/5xx) so an error page is never
# parsed as if it were the article.
web.raise_for_status()
# Parsing the HTML content of the page into a BeautifulSoup object for easy manipulation.
# 'html.parser' (bundled with Python) is named explicitly: the generic 'html' feature lets
# BeautifulSoup pick whichever HTML parser happens to be installed, so the parse could
# differ between environments.
soup = BeautifulSoup(web.text, 'html.parser')

In [3]:
# Print the parsed document to check that the page was fetched and parsed
print(soup)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of largest companies in the United Kingdom - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-li

When scraping web pages, it's common to need specific elements from large HTML documents. BeautifulSoup, a Python library for parsing HTML, offers two powerful methods to help us target and extract these elements:

1. **`find()` Method**:
   - **Purpose**: Retrieves the first occurrence of a tag that matches your search criteria.
   - **Use Case**: Ideal when you need a single item or the first item in a sequence, like a specific header, a title, or a particular table. In our example, we use this method to extract the "Fortune" list of companies, which is the first table on the page with the class `wikitable sortable`.
   - **Example**: If a webpage contains multiple tables and the first table consistently contains the data we need, `find()` will efficiently give us just that table. This is demonstrated in our scraping task where the "Fortune" list is exactly what we target with `find()`.

2. **`find_all()` Method**:
   - **Purpose**: Returns a list of all tags that match the search criteria.
   - **Use Case**: Perfect for when you need to collect multiple items of the same type, such as all links in a document, all comments in a discussion thread, or multiple tables. Specifically, we use `find_all()` to collect every matching table and then pick the second one (index `1`), which holds the "Forbes" list; we also use it to gather the header cells (`<th>`), rows (`<tr>`), and data cells (`<td>`) of each table.
   - **Example**: If there are multiple instances of a structure (like entries in a list or rows in a table) that contain useful data, `find_all()` will capture them all for us to process. In our scraping, this is how we reach the "Forbes" table and how we walk through the rows and cells of both tables.

These methods can be fine-tuned by specifying attributes, searching by CSS class, or combining search parameters to narrow down the results precisely to our needs.


# Fortune List

In [4]:
# Finding the first table containing the Fortune list using 'find', which retrieves the first matching element.
# The search is restricted to <table> tags whose class is 'wikitable sortable'.
table = soup.find('table', class_= 'wikitable sortable')

In [5]:
# Print the selected table's HTML to inspect its header and row structure
print(table)

<table class="wikitable sortable">
<tbody><tr>
<th>Rank
</th>
<th>Fortune 500<br/>rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue<br/><small>(USD millions)</small>
</th>
<th>Profits<br/><small>(USD millions)</small>
</th>
<th>Assets<br/><small>(USD millions)</small>
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/11px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/17px-Increase2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/22px-Increase2.svg.png 2x" width="11"/></span></span> 1
</td>
<td><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src

In [6]:
# Collect every header cell (<th>) of the Fortune table; their text becomes the column names
fortune_title = table.find_all('th')
# Display the raw <th> tags
fortune_title

[<th>Rank
 </th>,
 <th>Fortune 500<br/>rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Revenue<br/><small>(USD millions)</small>
 </th>,
 <th>Profits<br/><small>(USD millions)</small>
 </th>,
 <th>Assets<br/><small>(USD millions)</small>
 </th>,
 <th>Employees
 </th>,
 <th>Headquarters
 </th>]

In [7]:
# Reduce each <th> tag to its whitespace-stripped text to get plain column names.
# <br/> tags contribute no text of their own, so the words around them are joined
# directly (e.g. "Fortune 500<br/>rank" yields "Fortune 500rank").
table_title = [header_cell.get_text().strip() for header_cell in fortune_title]
print(table_title)

['Rank', 'Fortune 500rank', 'Name', 'Industry', 'Revenue(USD millions)', 'Profits(USD millions)', 'Assets(USD millions)', 'Employees', 'Headquarters']


In [8]:
import pandas as pd

In [9]:
# Start with an empty DataFrame whose columns are the scraped header names
df_fortune = pd.DataFrame(columns = table_title)
# Display it to confirm the column layout
df_fortune

Unnamed: 0,Rank,Fortune 500rank,Name,Industry,Revenue(USD millions),Profits(USD millions),Assets(USD millions),Employees,Headquarters


In [10]:
# Collect every table row (<tr>) of the Fortune table.
# Note: despite its name, this variable holds rows, not columns.
fortune_column = table.find_all('tr')


In [11]:
# Extract the text of every data row first, then build the DataFrame in one step.
# Appending with df.loc[len(df)] grows the frame one row at a time and makes the cell
# non-idempotent: re-running it would append the same rows a second time.
fortune_rows = []
for row in fortune_column[1:]:  # the first <tr> is the header row (only <th> cells)
    fortune_row = row.find_all('td')
    fortune_individual_row = [data.text.strip() for data in fortune_row]
    if not fortune_individual_row:
        # A row without any <td> cells carries no data; skip it.
        continue
    if len(fortune_individual_row) != len(table_title):
        # Fail loudly on ragged rows instead of letting pandas pad them with None.
        raise ValueError(
            f"row has {len(fortune_individual_row)} cells, expected {len(table_title)}"
        )
    fortune_rows.append(fortune_individual_row)
# Rebuilding the frame from scratch gives the same result on every run of this cell.
df_fortune = pd.DataFrame(fortune_rows, columns=table_title)

In [12]:
# Display the populated Fortune table
df_fortune

Unnamed: 0,Rank,Fortune 500rank,Name,Industry,Revenue(USD millions),Profits(USD millions),Assets(USD millions),Employees,Headquarters
0,1,15,Shell plc,Oil and Gas,272657,20101,404379,82000,London
1,2,35,BP,Oil and Gas,164195,7565,287272,65000,London
2,3,126,Tesco,Retail,84192,2031,66219,231223,Welwyn Garden City
3,4,149,HSBC,Banking,77330,13917,2957939,219697,London
4,5,198,Aviva,Insurance,64240,2703,485481,22062,London
5,6,201,Rio Tinto,Mining,63495,21094,102896,49345,London
6,7,203,Legal & General,Insurance,62504,2819,789066,10743,London
7,8,205,Unilever,Consumer goods,62006,7151,85383,148044,London
8,9,222,Lloyds Banking Group,Banking,58476,7954,1200620,57955,London
9,10,247,Vodafone,Telecommunication,52931,2424,170749,96941,Newbury


In [13]:
# Save the Fortune table as a CSV file; index=False leaves the row index out of the file
df_fortune.to_csv(path_or_buf='uk_companies_fortune.csv', index=False)

# Forbes List

In [14]:
# 'find_all' returns every table with the class 'wikitable sortable'; index 1 selects
# the second match, which is assumed to contain the Forbes list.
table2 = soup.find_all(name='table', class_='wikitable sortable')[1]
# Print the table's HTML to inspect its header and row structure
print(table2)

<table class="wikitable sortable" style="text-align:right;">
<tbody><tr>
<th align="center">Rank
</th>
<th align="center">Forbes <br/>2000 rank
</th>
<th align="center">Name
</th>
<th align="center">Headquarters
</th>
<th align="center">Revenue<br/>(billions <br/>US$)
</th>
<th align="center">Profit<br/>(billions <br/>US$)
</th>
<th align="center">Assets<br/>(billions <br/>US$)
</th>
<th align="center">Value<br/>(billions <br/>US$)
</th>
<th align="center">Industry
</th></tr>
<tr>
<td><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/11px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/17px-Increase2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/22px-Increase2.svg.png 2x" width="11"/></span></span>1
</td>
<td><span typeof="mw:

In [15]:
# Collect every header cell (<th>) of the Forbes table; their text becomes the column names
forbes_title = table2.find_all('th')
# Display the raw <th> tags
forbes_title

[<th align="center">Rank
 </th>,
 <th align="center">Forbes <br/>2000 rank
 </th>,
 <th align="center">Name
 </th>,
 <th align="center">Headquarters
 </th>,
 <th align="center">Revenue<br/>(billions <br/>US$)
 </th>,
 <th align="center">Profit<br/>(billions <br/>US$)
 </th>,
 <th align="center">Assets<br/>(billions <br/>US$)
 </th>,
 <th align="center">Value<br/>(billions <br/>US$)
 </th>,
 <th align="center">Industry
 </th>]

In [16]:
# Reduce each <th> tag to its whitespace-stripped text to get plain column names.
# <br/> tags contribute no text of their own, so the words around them are joined
# directly (e.g. "Revenue<br/>(billions" yields "Revenue(billions").
forbes_table_title = [header_cell.get_text().strip() for header_cell in forbes_title]
print(forbes_table_title)

['Rank', 'Forbes 2000 rank', 'Name', 'Headquarters', 'Revenue(billions US$)', 'Profit(billions US$)', 'Assets(billions US$)', 'Value(billions US$)', 'Industry']


In [17]:
# Start with an empty DataFrame whose columns are the scraped Forbes header names
df_forbes = pd.DataFrame(columns = forbes_table_title)
# Display it to confirm the column layout
df_forbes

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


In [18]:
# Collect every table row (<tr>) of the Forbes table.
# Note: despite its name, this variable holds rows, not columns.
forbes_column = table2.find_all('tr')

In [19]:
# Extract the text of every data row first, then build the DataFrame in one step.
# Appending with df.loc[len(df)] grows the frame one row at a time and makes the cell
# non-idempotent: re-running it would append the same rows a second time.
forbes_rows = []
for row in forbes_column[1:]:  # the first <tr> is the header row (only <th> cells)
    forbes_row = row.find_all('td')
    forbes_individual_row = [data.text.strip() for data in forbes_row]
    if not forbes_individual_row:
        # A row without any <td> cells carries no data; skip it.
        continue
    if len(forbes_individual_row) != len(forbes_table_title):
        # Fail loudly on ragged rows instead of letting pandas pad them with None.
        raise ValueError(
            f"row has {len(forbes_individual_row)} cells, expected {len(forbes_table_title)}"
        )
    forbes_rows.append(forbes_individual_row)
# Rebuilding the frame from scratch gives the same result on every run of this cell.
df_forbes = pd.DataFrame(forbes_rows, columns=forbes_table_title)

In [20]:
# Display the populated Forbes table
df_forbes

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry
0,1,16,Shell plc,London,261.76,20.27,404.38,211.1,Oil and Gas
1,2,38,HSBC,London,59.33,12.58,2957.94,135.3,Banking
2,3,50,BP,London,158.01,7.55,287.27,98.38,Oil and Gas
3,4,82,Rio Tinto,London,63.46,21.06,102.9,117.78,Mining
4,5,96,Lloyds Banking Group,London,58.48,7.36,1200.75,31.34,Banking
5,6,101,British American Tobacco,London,35.32,9.34,186.05,97.49,Consumer goods
6,7,125,Unilever,London,62.0,7.15,85.4,116.16,Consumer goods
7,8,129,GlaxoSmithKline,London,46.92,6.03,107.14,112.09,Pharmaceuticals
8,9,155,Barclays,London,30.17,8.77,1874.94,31.55,Banking
9,10,181,Anglo American,London,41.52,8.54,65.98,59.44,Mining


In [21]:
# Save the Forbes table as a CSV file; index=False leaves the row index out of the file
df_forbes.to_csv(path_or_buf='uk_companies_forbes.csv', index=False)