In [5]:
#Import bs4
from bs4 import BeautifulSoup
import requests

In [6]:
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
page = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (4xx/5xx) instead of parsing an error page
# as if it were the article.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets Beautiful Soup
# pick whichever HTML parser happens to be installed, so the resulting tree
# could differ between environments.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Dump the parsed document, indented, to inspect its structure by eye.
pretty_page = soup.prettify()
print(pretty_page)

In [21]:
# Work with the first <table> element found in the document.
all_tables = soup.find_all('table')
table = all_tables[0]

In [None]:
# Dump the selected table, indented, to inspect its rows and cells by eye.
pretty_table = table.prettify()
print(pretty_table)

In [31]:
# Keep the first 8 header cells (<th>) of the table; these hold the column titles.
header_cells = table.find_all('th')
world_titles = header_cells[:8]
print(world_titles)

[<th rowspan="2" scope="col">Rank
</th>, <th rowspan="2" scope="col">Name
</th>, <th rowspan="2" scope="col">Industry
</th>, <th scope="col">Revenue
</th>, <th scope="col">Profit
</th>, <th rowspan="2" scope="col">Employees
</th>, <th rowspan="2" scope="col">Headquarters<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[note 1]</a></sup>
</th>, <th rowspan="2" scope="col"><a href="/wiki/State-owned_enterprise" title="State-owned enterprise">State-owned</a>
</th>]


In [32]:
# Reduce each header cell to its text content, trimmed of surrounding whitespace.
world_table_titles = [heading.get_text().strip() for heading in world_titles]

print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue', 'Profit', 'Employees', 'Headquarters[note 1]', 'State-owned']


In [35]:
# The Headquarters header carries a footnote marker from a <sup> tag
# ("Headquarters[note 1]"). Drop everything from the first '[' onward in every
# title instead of overwriting index 6 by hand: this gives the same result
# today, but does not silently rename the wrong column if the column order
# changes, and it is safe to re-run.
world_table_titles = [title.partition('[')[0].strip() for title in world_table_titles]
print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue', 'Profit', 'Employees', 'Headquarters', 'State-owned']


In [36]:
import pandas as pd

In [54]:
# Start from an empty frame whose columns are the cleaned header titles.
df = pd.DataFrame(data=None, columns=world_table_titles)
df

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters,State-owned


In [None]:
# Collect every <tr> of the table, leaving out the first one.
all_rows = table.find_all('tr')
column_data = all_rows[1:]
print(column_data)

In [150]:
def _last_column_value(cell):
    """Return the display value for the last scraped cell of a row.

    Preference order: an <img>'s ``alt`` attribute (falling back to its
    ``title``), then the text of a ``span.table-yes2``, then the cell's own
    text. Text values are stripped of surrounding whitespace.
    """
    image_tag = cell.find('img')
    if image_tag:
        return image_tag.get('alt', '') or image_tag.get('title', '')

    state_owned_span = cell.find('span', {'class': 'table-yes2'})
    if state_owned_span:
        return state_owned_span.text.strip()

    return cell.text.strip()


# Every column except the first (Rank, read from the row's <th>) comes from a <td>.
cells_per_row = len(df.columns) - 1

# Collect the parsed rows in a plain list and build the frame once at the end.
# Appending with df.loc[len(df)] duplicated every row whenever this cell was
# re-run; rebuilding the frame makes the cell idempotent.
world_table_rows = []
for row in column_data:
    # Skip rows that contain a <th colspan="2" scope="col"> (unwanted header cells).
    if row.find('th', {'colspan': '2', 'scope': 'col'}):
        continue

    rank_cell = row.find('th')
    row_data = row.find_all('td')[0:cells_per_row]

    # Skip rows that cannot be a complete data row (no <th> for the rank, or
    # too few <td> cells) instead of crashing on them.
    if rank_cell is None or len(row_data) < cells_per_row:
        continue

    rank = rank_cell.text.strip()

    # All <td> cells but the last are plain text.
    other_data = [data.text.strip() for data in row_data[:-1]]

    # The last cell may hold an image or a span rather than plain text.
    last_column_value = _last_column_value(row_data[-1])

    world_table_rows.append([rank] + other_data + [last_column_value])

# Reuse the existing column labels so the frame keeps the same layout.
df = pd.DataFrame(world_table_rows, columns=df.columns)


In [151]:
# Display the populated frame to check the scraped rows.
df

Unnamed: 0,Rank,Name,Industry,Revenue,Profit,Employees,Headquarters,State-owned
0,1,Walmart,Retail,"$611,289","$11,680",2100000,United States,No
1,2,Saudi Aramco,Oil and gas,"$603,651","$159,069",70496,Saudi Arabia,Yes
2,3,State Grid Corporation of China,Electricity,"$530,009","$8,192",870287,China,Yes
3,4,Amazon,Retail,"$513,983","−$2,722",1541000,United States,No
4,5,Vitol,Commodities,"$505,000","$15,000",1560,Switzerland,No
5,6,China National Petroleum Corporation,Oil and gas,"$483,019","$21,080",1087049,China,Yes
6,7,China Petrochemical Corporation,Oil and gas,"$471,154","$9,657",527487,China,Yes
7,8,ExxonMobil,Oil and gas,"$413,680","$55,740",63000,United States,No
8,9,Apple,Electronics,"$394,328","$99,803",164000,United States,No
9,10,Shell,Oil and gas,"$386,201","$20,120",93000,United Kingdom,No


In [None]:
# Write the table to CSV; index=False leaves the row index out of the file.
output_path = 'world_table.csv'
df.to_csv(output_path, index=False)