# Data Scraping Wikipedia + Pandas

In [4]:
# Hello, my name is Renz Llenarez, and I am a data analyst. Thank you for checking out this code. I use a lot of comments for documentation,
# sometimes right next to a line of code, to help me remember what that line actually does.
# The project title is 'Data Scraping Wikipedia + Pandas'.
# Dataset source is from https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue


# first always import all libraries for the project

from bs4 import BeautifulSoup
import requests

In [5]:
# Download the Wikipedia article and parse it into a BeautifulSoup tree.
# Creates three notebook-level names used by the following cells: `url`, `page`, `soup`.

url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

page = requests.get(
    url,
    # Wikimedia asks automated clients to send a descriptive User-Agent instead of a
    # library's generic default. TODO: add your own contact details to this string.
    headers={'User-Agent': 'largest-us-companies-notebook/1.0 (educational scraping example)'},
    timeout=30, # seconds; without a timeout a stalled connection can block the cell forever
)

# stop here with a clear HTTPError instead of silently parsing an error page
page.raise_for_status()

# name the parser explicitly: the bare 'html' feature lets BeautifulSoup pick whichever
# HTML parser happens to be installed, so the parsed tree could differ between machines
soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
# Show the raw HTML of the first <table> element of the page.
# This is the same markup the browser shows when the page is inspected.

print(soup.find('table')) # same element as soup.table: the first <table> tag in the document

# print(soup.prettify()) would print the whole page, indented, instead.
# Other tags can be looked up the same way, e.g. 'body' or 'head'.
# Only the first table is printed here to keep the output short on GitHub.

<table class="box-More_citations_needed plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=List_of_largest_

In [None]:
soup.find_all('table')[1] 

# find_all('table') returns every <table> element of the page as a list-like result,
# so one table can be picked by position with '[x]' (x = zero-based position number).
# Position 0 is the first table of the page (the one soup.table returns); position 1 is the next one.
# NOTE(review): picking by position depends on how many tables come before the one
# we want, so the right position can change if the page layout changes.

#soup.find('table', class_ = 'wikitable sortable') # this line will display the same

# the table output was removed from this cell to keep the notebook readable on GitHub


In [8]:
# Pick the main data table and keep it in `table` for the following cells.
# It is selected by its CSS classes rather than by position: position [1] is only
# correct while exactly one other table (the "needs additional citations" banner
# that soup.table returns) comes before it, and such a banner can be added or
# removed from the article at any time.

table = soup.select_one('table.wikitable.sortable') # first <table> carrying both classes

if table is None:
    # fail here with a clear message instead of later on `None.find_all(...)`
    raise ValueError("no 'wikitable sortable' table found - has the page layout changed?")

In [9]:
# Display every <th> (table header) cell found inside `table`.
# For this table they are the column names, as the output below shows.

table.find_all('th')

[<th>Rank
 </th>,
 <th>Name
 </th>,
 <th>Industry
 </th>,
 <th>Revenue <br/>(USD millions)
 </th>,
 <th>Revenue growth
 </th>,
 <th>Employees
 </th>,
 <th>Headquarters
 </th>]

In [10]:
# Keep the result of the call shown in the previous cell (the <th> header cells) in a variable

world_titles = table.find_all('th')

In [11]:
# Turn every header cell in `world_titles` into a clean column name:
# .text gives the text inside the <th> tag, .strip() removes the surrounding whitespace/newlines.

world_table_titles = list(map(lambda header_cell: header_cell.text.strip(), world_titles))

print(world_table_titles)

['Rank', 'Name', 'Industry', 'Revenue (USD millions)', 'Revenue growth', 'Employees', 'Headquarters']


In [12]:
# starting to import the data into the data frame using pandas

import pandas as pd

In [13]:
# Create a DataFrame that has the scraped header names as its columns and no rows yet;
# the data rows are filled in by a later cell.
# `df` is the name used for this DataFrame in the rest of the notebook.

df = pd.DataFrame(columns = world_table_titles)

df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters


In [15]:
# Collect every <tr> (table row) element of `table`.
# Each row element contains that row's cells, so this gives access to all of the table's data.

column_data = table.find_all('tr') # despite the name, this holds the rows; it is used by the next cell

In [22]:
# Build the DataFrame from the table rows. The values of a row live in its <td>
# ('table data') cells.
#
# The rows are collected in a plain list and the DataFrame is created once at the end,
# instead of appending to `df` with df.loc[len(df)] on every pass. That way the cell
# can be re-run without stacking a second copy of every row onto `df`, and pandas
# does not have to grow the frame one row at a time.

table_rows = []

for row in column_data[1:]: # [1:] starts at position 1, skipping the first <tr> (presumably the row of <th> column titles)
    row_data = row.find_all('td') # all <td> cells of this row
    individual_row_data = [data.text.strip() for data in row_data] # text of each cell without surrounding whitespace

    if not individual_row_data:
        continue # a row without any <td> cell has no data to add

    table_rows.append(individual_row_data) # one list of cell values per table row

# one DataFrame row per collected list, in page order, labelled 0, 1, 2, ...
df = pd.DataFrame(table_rows, columns = world_table_titles)

In [23]:
# successfully made the data frame in pandas

df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,General merchandisers,572754,2.4%,2300000,"Bentonville, Arkansas"
1,2,Amazon,Retail and Cloud Computing,469822,21.7%,1608000,"Seattle, Washington"
2,3,Apple,Electronics industry,365817,33.2%,154000,"Cupertino, California"
3,4,CVS Health,Healthcare,292111,32.0%,258000,"Woonsocket, Rhode Island"
4,5,UnitedHealth Group,Healthcare,287597,11.8%,350000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,General Dynamics,Airspace and defense,38469,8.7%,103100,"Reston, Virginia"
96,97,CHS,Agriculture cooperative,38448,1.4%,9941,"Inver Grove Heights, Minnesota"
97,98,USAA,Financials,37470,3.2%,37335,"San Antonio, Texas"
98,99,Northwestern Mutual,Insurance,36751,8.8%,7585,"Milwaukee, Wisconsin"


In [29]:
# Save the DataFrame to a CSV file.
# The path is relative to the notebook's working directory, so the cell also runs on
# machines that do not have the original author's personal Desktop folder.

output_csv = 'Companies.csv' # change this to write the file somewhere else

# index=False leaves out the 0, 1, 2, ... row labels; the 'Rank' column already numbers the rows
df.to_csv(output_csv, index=False)