In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_Africa_by_revenue"

# Send HTTP request to the URL
r= requests.get(url)

print(r.status_code) # Print the HTTP status code to check if the request was successful (200 means OK)


200


In [3]:
soup = BeautifulSoup(r.text, 'html.parser')  # Parse the HTML content of the page using BeautifulSoup

print(soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of largest companies in Africa by revenue - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited

In [4]:
# Find all tables in the page
tables = soup.find_all('table')
tables

[<table class="box-One_source plainlinks metadata ambox ambox-content ambox-one_source" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Question_book-new.svg"><img class="mw-file-element" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/60px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/120px-Question_book-new.svg.png 1.5x" width="50"/></a></span></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>relies largely or entirely on a <a href="/wiki/Wikipedia:Articles_with_a_single_source" title="Wikipedia:Articles with a single source">single source</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=List_of_largest_companies_in_Africa

In [5]:
table = tables[1]  # Select the second table from the list of all tables on the page (indexing starts at 0)
table

<table class="wikitable sortable">
<tbody><tr>
<th>Rank</th>
<th>Company</th>
<th>Industry</th>
<th>Revenue<br/>(US$ billions)</th>
<th width="150">Headquarters
</th></tr>
<tr>
<td>1</td>
<td><a href="/wiki/Sonatrach" title="Sonatrach">Sonatrach</a></td>
<td>Oil and gas</td>
<td>77.013</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/40px-Flag_of_Algeria.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/60px-Flag_of_Algeria.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Algeria" title="Algeria">Algeria</a>
</td></tr>
<tr>
<td>2</td>
<td><a href="/wiki/Eskom" title="Eskom">Eskom</a></td>
<td>Electric utility</td>
<td>13.941</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt

In [6]:
# #Finding the table with the class is also valid
# soup.find('table', class_ = 'wikitable sortable')

In [7]:
# Extract table headers
headers_plus_tags = table.find_all('th')
headers_plus_tags

[<th>Rank</th>,
 <th>Company</th>,
 <th>Industry</th>,
 <th>Revenue<br/>(US$ billions)</th>,
 <th width="150">Headquarters
 </th>]

In [8]:
table_headers = [headers.text.strip() for headers in headers_plus_tags] # Extract and clean text from all header tags in the table using list comprehension
table_headers

['Rank', 'Company', 'Industry', 'Revenue(US$ billions)', 'Headquarters']

In [9]:
df = pd.DataFrame(columns=table_headers) # Create an empty DataFrame with the extracted table headers as column names
df

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters


In [10]:
# Extract rows from the table
rows = table.find_all("tr")
rows

[<tr>
 <th>Rank</th>
 <th>Company</th>
 <th>Industry</th>
 <th>Revenue<br/>(US$ billions)</th>
 <th width="150">Headquarters
 </th></tr>,
 <tr>
 <td>1</td>
 <td><a href="/wiki/Sonatrach" title="Sonatrach">Sonatrach</a></td>
 <td>Oil and gas</td>
 <td>77.013</td>
 <td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/40px-Flag_of_Algeria.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/60px-Flag_of_Algeria.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Algeria" title="Algeria">Algeria</a>
 </td></tr>,
 <tr>
 <td>2</td>
 <td><a href="/wiki/Eskom" title="Eskom">Eskom</a></td>
 <td>Electric utility</td>
 <td>13.941</td>
 <td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-e

In [11]:
# Extract table data values

for row in rows[1:]:
    row_data = row.find_all("td")
    row_data_values = [data.text.strip() for data in row_data]
    
    length = len(df) #This gets the current number of rows in df so the next row can be added at the end
    print(length)
    df.loc[length] = row_data_values #This appends a new row to the DataFrame.
   

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [12]:
df

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters
0,1,Sonatrach,Oil and gas,77.013,Algeria
1,2,Eskom,Electric utility,13.941,South Africa
2,3,Sasol,Chemistry,12.989,South Africa
3,4,MTN Group,Telecommunications,12.238,South Africa
4,5,Shoprite Holdings,Retail,10.802,South Africa
...,...,...,...,...,...
95,96,Blue Label Telecoms,Telecommunications,1.442,South Africa
96,97,Kibali Gold Mine,Mining,1.440,DR Congo
97,98,Aveng,Conglomerate,1.425,South Africa
98,99,Murray and Roberts Holdings,Construction,1.422,South Africa


In [13]:
#Saving as csv
df.to_csv(r'C:\Users\DELL\Web-Scraping\Web-Scraping\Africa Companies Ranking.csv', index = False)