In [25]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup



In [26]:
# Step 1: Download the Wikipedia page and parse its HTML
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"
# Identify the scraper with a descriptive User-Agent instead of the library default
headers = {
    "User-Agent": "MyWebScraper/1.0 (https://github.com/Solidx74; kareebsadab@gmail.com)"
}
# A timeout keeps the cell from hanging forever if the server never answers
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx responses rather than parsing an error page as data
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")


In [27]:
# Step 2: Select the main companies table.
# Look the table up by its "wikitable" class rather than by position among all
# <table> elements, so an unrelated table placed earlier on the page cannot be
# picked up by mistake.
table = soup.find('table', class_='wikitable')
if table is None:
    # Surface a clear message instead of an opaque error in a later cell
    raise ValueError("No table with class 'wikitable' was found on the page")



In [28]:
# Inspect the raw HTML of the selected table to confirm it is the companies table
table

<table class="wikitable sortable">
<caption>
</caption>
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD millions)
</th>
<th>Revenue growth
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Walmart" title="Walmart">Walmart</a>
</td>
<td>Retail
</td>
<td style="text-align:center;">680,985
</td>
<td style="text-align:center;"><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/></span></span> <span data-sort-value="7000300000000000000♠" style="display:none"></span> 5.1%
</td>
<td style="text-align:center;">2,100,000
</td>
<td><a href="/wiki/Bentonville,_Arkansas" title="Bentonville, Arkansas">Bentonvil

In [29]:
# Step 3: Extract column headers from the table's header row only.
# Searching just the first <tr> (instead of every <th> in the table) keeps
# row-header cells that may appear in body rows out of the column list.
header_row = table.find('tr')
header_cells = header_row.find_all('th') if header_row is not None else []
world_table_titles = [th.text.strip() for th in header_cells]


In [30]:
# Check the cleaned header names that will become the DataFrame columns
world_table_titles

['Rank',
 'Name',
 'Industry',
 'Revenue (USD millions)',
 'Revenue growth',
 'Employees',
 'Headquarters']

In [31]:
# Step 4: Start from an empty DataFrame whose columns are the scraped header names
column_names = world_table_titles
df = pd.DataFrame(columns=column_names)

In [32]:
# Step 5: Collect every <tr> element of the table (header row included)
column_data = table.find_all(name='tr')

In [33]:
# Inspect the collected <tr> elements; the first one is the header row
column_data

[<tr>
 <th>Rank
 </th>
 <th>Name
 </th>
 <th>Industry
 </th>
 <th>Revenue <br/>(USD millions)
 </th>
 <th>Revenue growth
 </th>
 <th>Employees
 </th>
 <th>Headquarters
 </th></tr>,
 <tr>
 <td>1
 </td>
 <td><a href="/wiki/Walmart" title="Walmart">Walmart</a>
 </td>
 <td>Retail
 </td>
 <td style="text-align:center;">680,985
 </td>
 <td style="text-align:center;"><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/></span></span> <span data-sort-value="7000300000000000000♠" style="display:none"></span> 5.1%
 </td>
 <td style="text-align:center;">2,100,000
 </td>
 <td><a href="/wiki/Bentonville,_Arkansas" title="Bentonville, Arkansas">Bentonville, Arkansas</a>
 </td></tr>,
 <t

In [34]:
# Step 6: Populate the DataFrame with table data.
# Rows are gathered in a plain list and the DataFrame is built once at the end.
# This avoids growing the frame one row at a time with df.loc, and it makes the
# cell safe to re-run: re-running replaces the rows instead of appending a
# second copy of every row.
expected_width = len(df.columns)
table_rows = []
for row in column_data[1:]:  # column_data[0] is the header row
    individual_row_data = [data.text.strip() for data in row.find_all('td')]

    # Rows without any <td> cells carry no data; skip them rather than
    # storing a row made up entirely of None values
    if not individual_row_data:
        continue

    # Fill missing trailing cells with None so each row has one value per column
    individual_row_data.extend([None] * (expected_width - len(individual_row_data)))
    table_rows.append(individual_row_data)

# A row with more cells than there are columns still raises ValueError here
df = pd.DataFrame(table_rows, columns=df.columns)

In [35]:
# Step 7: Relax pandas' display limits: show up to 100 rows and never hide columns
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [36]:
# Display the populated DataFrame (rendered using the display options set above)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,680985,5.1%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,637959,11.0%,1556000,"Seattle, Washington"
2,3,UnitedHealth Group,Healthcare,400278,7.7%,400000,"Minnetonka, Minnesota"
3,4,Apple,Technology,391035,2.0%,164000,"Cupertino, California"
4,5,CVS Health,Healthcare,372809,4.2%,259500,"Woonsocket, Rhode Island"
5,6,Berkshire Hathaway,Conglomerate,371433,1.9%,392400,"Omaha, Nebraska"
6,7,Alphabet,Technology and cloud computing,350018,13.9%,183323,"Mountain View, California"
7,8,Exxon Mobil,Petroleum,349595,1.5%,60900,"Spring, Texas"
8,9,McKesson Corporation,Healthcare,308951,11.7%,48000,"Irving, Texas"
9,10,Cencora,Pharmacy wholesale,293959,12.1%,44000,"Conshohocken, Pennsylvania"


In [37]:
# Step 8: Save the scraped table as CSV, leaving out the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the export works on other machines.
output_csv = Path(r'E:\D_Analyze\Projects\Top Public Companies Web Scraping\PublicCompanies.csv')
# to_csv does not create missing folders, so make sure the target directory exists
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)