# **Web Scraping to Get a Table from a Website**

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Read the HTML tables from the URL, keeping only tables whose text contains
# "Revenue growth" (a column of the ranking table). Selecting by content rather
# than by position alone means an unrelated table inserted earlier on the page
# cannot be picked up by mistake; pandas raises ValueError if nothing matches.
dfs = pd.read_html(url, match='Revenue growth')

# The first matching table is the company ranking
df = dfs[0]

# Display the DataFrame
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


# **Web Scraping to Get Multiple Tables from a Website**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Base URL of the page to scrape. The first page is requested from this URL
# as-is; the later pagination loop adds a `page_num` query parameter to it to
# reach the remaining pages.
base_url = "https://www.scrapethissite.com/pages/forms/"

In [None]:
# Send a GET request for the first page. requests applies no timeout by
# default, so an explicit one is passed to make a stalled connection fail
# instead of blocking the notebook indefinitely.
response = requests.get(base_url, timeout=30)
response.raise_for_status()  # Raise requests.HTTPError on a 4xx/5xx reply

In [None]:
def _cell_texts(cells):
    """Return the whitespace-trimmed text of every tag in ``cells``."""
    return [cell.text.strip() for cell in cells]


# Build the parse tree for the page downloaded in the previous cell
soup = BeautifulSoup(response.text, 'html.parser')

# Take the first <table> in the document; narrow the lookup by class or id
# if the page ever carries more than one table
table = soup.find('table')

# Column names come from every <th> cell in the table
headers = _cell_texts(table.find_all('th'))

# Every <tr> after the first (the header row) is a data row
rows = table.find_all('tr')[1:]
data = [_cell_texts(row.find_all('td')) for row in rows]

# Assemble the first page's rows into a DataFrame
df1 = pd.DataFrame(data=data, columns=headers)

In [None]:
# Display the table scraped from the first page
df1

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
5,Edmonton Oilers,1990,37,37,,0.463,272,272,0
6,Hartford Whalers,1990,31,38,,0.388,238,276,-38
7,Los Angeles Kings,1990,46,24,,0.575,340,254,86
8,Minnesota North Stars,1990,27,39,,0.338,256,266,-10
9,Montreal Canadiens,1990,39,30,,0.487,273,249,24


In [None]:
# Scrape the remaining pages (the first page is fetched separately) and stack
# their tables into one DataFrame.

FIRST_EXTRA_PAGE = 2   # first page number requested by this loop
LAST_PAGE = 24         # last page number requested by this loop
REQUEST_TIMEOUT = 30   # seconds; requests applies no timeout unless told to


def _scrape_page_table(page_url, timeout=REQUEST_TIMEOUT):
    """Download ``page_url`` and return its first HTML table as a DataFrame.

    Working inside a function keeps the per-page temporaries local, so the
    loop below no longer overwrites notebook-level names such as ``df`` and
    ``url`` that earlier cells defined.

    Parameters
    ----------
    page_url : str
        Address of the page to download.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` after the first; columns are named after the
        stripped text of the table's ``<th>`` cells. All values are strings.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no ``<table>`` element.
    """
    page_response = requests.get(page_url, timeout=timeout)
    page_response.raise_for_status()

    page_soup = BeautifulSoup(page_response.text, 'html.parser')
    page_table = page_soup.find('table')
    if page_table is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError(f"No <table> element found at {page_url}")

    column_names = [th.text.strip() for th in page_table.find_all('th')]
    body_rows = page_table.find_all('tr')[1:]  # Skip the header row
    records = [[td.text.strip() for td in tr.find_all('td')] for tr in body_rows]
    return pd.DataFrame(records, columns=column_names)


# Collect one DataFrame per page
all_tables = []
for page_num in range(FIRST_EXTRA_PAGE, LAST_PAGE + 1):
    page_url = f"{base_url}?page_num={page_num}"
    all_tables.append(_scrape_page_table(page_url))

# Combine all DataFrames into a single DataFrame
combined_table = pd.concat(all_tables, ignore_index=True)

In [None]:
# Put the first page's rows (df1) ahead of the rows gathered from the other
# pages, renumber the index from 0, and display the full result.
frames = [df1, combined_table]
combined_df = pd.concat(frames, ignore_index=True)
combined_df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
...,...,...,...,...,...,...,...,...,...
577,Tampa Bay Lightning,2011,38,36,8,0.463,235,281,-46
578,Toronto Maple Leafs,2011,35,37,10,0.427,231,264,-33
579,Vancouver Canucks,2011,51,22,9,0.622,249,198,51
580,Washington Capitals,2011,42,32,8,0.512,222,230,-8
