# Scraping Data from a Real Website + Pandas

https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue

In [3]:
from bs4 import BeautifulSoup
import requests

In [16]:
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

In [44]:

# Add a polite User-Agent (so Wikipedia knows it’s you, not a bot)
headers = {
    "User-Agent": "Rohita (https://github.com/rohita) Python requests for learning purposes"
}

# Send the request
page = requests.get(url, headers=headers)

print(page)  # should show <Response [200]> if successful

# Parse the page
soup = BeautifulSoup(page.text, 'html.parser')

# Just check that it worked
print(soup.title.text)


<Response [200]>
List of largest companies in the United States by revenue - Wikipedia


In [45]:
table= soup.find('table',class_="wikitable sortable")

In [36]:
print(table)

<table class="wikitable sortable">
<caption>
</caption>
<tbody><tr>
<th>Rank
</th>
<th>Name
</th>
<th>Industry
</th>
<th>Revenue <br/>(USD millions)
</th>
<th>Revenue growth
</th>
<th>Employees
</th>
<th>Headquarters
</th></tr>
<tr>
<td>1
</td>
<td><a href="/wiki/Walmart" title="Walmart">Walmart</a>
</td>
<td><a href="/wiki/Retail" title="Retail">Retail</a>
</td>
<td style="text-align:center;">680,985
</td>
<td style="text-align:center;"><span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/></span></span> <span data-sort-value="7000300000000000000♠" style="display:none"></span> 5.1%
</td>
<td style="text-align:center;">2,100,000
</td>
<td><a href="/wiki/Bentonville,_Arkansa

In [51]:
world_titles = table.find_all('th')

In [52]:
world_titles

[<th>Rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Revenue <br/>(USD millions)
 </th>,
 <th>Revenue growth
 </th>,
 <th>Employees
 </th>,
 <th>Headquarters
 </th>]

In [53]:
world_titles_titles = [title.text.strip() for title in world_titles]

In [54]:
print(world_titles_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [55]:
import pandas as pd

In [58]:
df = pd.DataFrame(columns = world_titles_titles)
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [72]:
column_data = table.find_all('tr')

In [90]:
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    
    length = len(df)
    df.loc[length] = individual_row_data

In [91]:
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,680985,5.1%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,637959,11.0%,1556000,"Seattle, Washington"
2,3,UnitedHealth Group,Healthcare,400278,7.7%,400000,"Minnetonka, Minnesota"
3,4,Apple,Electronics industry,391035,2.0%,164000,"Cupertino, California"
4,5,CVS Health,Healthcare,372809,4.2%,259500,"Woonsocket, Rhode Island"
...,...,...,...,...,...,...,...
297,96,General Dynamics,Aerospace and defense,47716,12.9%,117000,"Reston, Virginia"
298,97,Coca-Cola,Beverage,47061,2.9%,69700,"Atlanta, Georgia"
299,98,TIAA,Financials,46946,2.6%,15623,"New York City, New York"
300,99,The Travelers Companies,Insurance,46423,12.2%,34000,"New York City, New York"


In [None]:
# Save to CSV inside your folder
file_path = r'/Users/rohita/file_sorting_files/csv files/Companies.csv'

try:
    df.to_csv(file_path, index=False)
    print("✅ File saved successfully at:", file_path)
except Exception as e:
    print("❌ Error saving file:", e)