# Using BeautifulSoup and Pandas for Web Scraping

In [2]:
# Importing necessary libraries
from bs4 import BeautifulSoup  # For parsing HTML
import requests  # For making HTTP requests
import pandas as pd  # For data manipulation and analysis

In [3]:
# Address of the Wikipedia article whose table will be scraped
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_largest_companies_in_Africa_by_revenue'
)

In [4]:
# Sending a GET request to the URL
# A timeout keeps the cell from hanging indefinitely if the server never responds
page = requests.get(url, timeout=30)
# Raise an exception on HTTP error responses (4xx/5xx) instead of silently parsing an error page
page.raise_for_status()

# Parse the HTML content of the page using Beautiful Soup
soup = BeautifulSoup(page.text, 'html.parser')  # 'html.parser' is Python's built-in parser, so no extra install is needed
# Print only the beginning of the prettified HTML for inspection;
# the full page is far too long to be useful as notebook output
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of largest companies in Africa by revenue - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vecto

In [5]:
# Pick out the table that holds the company data.
# It is selected by position: the second <table> element on the page (index 1).
# Adjust the index if the page layout changes.
all_tables = soup.find_all('table')
table = all_tables[1]

# An alternative lookup is by the table's class name:
# table = soup.find('table', class_='wikitable sortable')

In [6]:
# Show the selected table's HTML to confirm the right element was picked
print(str(table))

<table class="wikitable sortable">
<tbody><tr>
<th>Rank</th>
<th>Company</th>
<th>Industry</th>
<th>Revenue<br/>(US$ billions)</th>
<th width="150">Headquarters
</th></tr>
<tr>
<td>1</td>
<td><a href="/wiki/Sonatrach" title="Sonatrach">Sonatrach</a></td>
<td>Oil and gas</td>
<td>77.013</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/40px-Flag_of_Algeria.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/60px-Flag_of_Algeria.svg.png 2x" width="23"/></span></span> </span><a href="/wiki/Algeria" title="Algeria">Algeria</a>
</td></tr>
<tr>
<td>2</td>
<td><a href="/wiki/Eskom" title="Eskom">Eskom</a></td>
<td>Electric utility</td>
<td>13.941</td>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt

In [7]:
# Collect every header cell (<th>) found in the table
titles = table.find_all('th')
# Reduce each header cell to its whitespace-stripped text; these become the DataFrame column names
table_titles = list(map(lambda header_cell: header_cell.text.strip(), titles))

In [8]:
# Start with an empty DataFrame whose columns are the scraped header titles
df = pd.DataFrame(data=None, columns=table_titles)

In [9]:
# Gather every row element (<tr>) of the table; despite its name, column_data holds rows
column_data = table.find_all(name='tr')

In [10]:
# Build the DataFrame from the table rows, skipping the header row.
# The rows are collected in a plain list and the DataFrame is created once at the end.
# This avoids growing a DataFrame one row at a time, and it makes the cell idempotent:
# re-running it rebuilds df instead of appending the same rows a second time.
records = []
for row in column_data[1:]:
    # Find all data cells in the row
    row_data = row.find_all('td')
    # Extract text from each cell and strip whitespace
    individual_row_data = [x.text.strip() for x in row_data]
    # Skip rows without any <td> cells; they carry no data and would not fit the columns
    if individual_row_data:
        records.append(individual_row_data)

# One row per scraped table row, with the header titles as column names
df = pd.DataFrame(records, columns=table_titles)

df

Unnamed: 0,Rank,Company,Industry,Revenue(US$ billions),Headquarters
0,1,Sonatrach,Oil and gas,77.013,Algeria
1,2,Eskom,Electric utility,13.941,South Africa
2,3,Sasol,Chemistry,12.989,South Africa
3,4,MTN Group,Telecommunications,12.238,South Africa
4,5,Shoprite Holdings,Retail,10.802,South Africa
...,...,...,...,...,...
95,96,Blue Label Telecoms,Telecommunications,1.442,South Africa
96,97,Kibali Gold Mine,Mining,1.440,DR Congo
97,98,Aveng,Conglomerate,1.425,South Africa
98,99,Murray and Roberts Holdings,Construction,1.422,South Africa


In [22]:
# Saving the DataFrame to a CSV file
# The file is written relative to the notebook's working directory instead of an absolute,
# machine-specific folder, so the cell also runs on machines where that folder does not exist.
output_csv = 'Companies_in_Africa.csv'
df.to_csv(output_csv, index=False)  # index=False leaves the row index out of the file