# **List of largest companies by revenue**
### **A web scraping project**

In [147]:
#Importing libraries and modules
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [148]:
#Obtaining the html data from the webpage whose url is provided
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue', timeout=30)
response.raise_for_status()
sauce = response.text

In [149]:
# Parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(markup=sauce, features='lxml')

In [150]:
# Confirm the right page was fetched by reading the text of its <title> tag
soup.title.get_text()

'List of largest companies by revenue - Wikipedia'

In [151]:
# Collect every <tbody> element on the page and show how many were found
tables = soup.find_all(name='tbody')
len(tables)

6

In [152]:
#Finding out the index of the first table which has the string "Headquarters" in it
table_index = next(
    (index for index, table in enumerate(tables) if 'Headquarters' in str(table)),
    None,
)
# Fail here with a clear message rather than with a NameError (or a stale value
# left over in the kernel) when no table on the page matches.
if table_index is None:
    raise ValueError('No table containing "Headquarters" was found on the page')
print(table_index)

0


In [169]:
#Obtaining the table rows of the required table
# Select the table through table_index instead of a hard-coded 0, so the table
# that was searched for and the table that is parsed are always the same one.
required_table_rows = tables[table_index].find_all('tr')
required_table_rows[0] # the row at index zero holds the column headers

<tr>
<th rowspan="2">Rank
</th>
<th rowspan="2">Name
</th>
<th rowspan="2">Industry
</th>
<th>Revenue
</th>
<th>Profit
</th>
<th rowspan="2">Employees
</th>
<th rowspan="2">Headquarters<sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[note 1]</a></sup>
</th>
<th class="unsortable" rowspan="2" scope="col">Ref
</th></tr>

In [154]:
# Columns of the resulting DataFrame, in the order the first six <td> cells of a row are read
column_names = ['Name', 'Industry', 'Revenue', 'Profit', 'Employee_count', 'Headquarters']

# Collect one record per data row and build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
records = []
for row in required_table_rows:
    cells = row.find_all('td')
    # Rows without <td> cells (the header rows) are skipped; rows with fewer
    # than six cells are skipped too instead of failing with an IndexError.
    if len(cells) < len(column_names):
        continue
    # zip stops after the six named columns, so any extra trailing cells are ignored
    records.append({name: cell.text.strip() for name, cell in zip(column_names, cells)})

# Passing columns explicitly keeps the column order (and an empty frame's schema) stable
largest_companies = pd.DataFrame(records, columns=column_names)
largest_companies

Unnamed: 0,Name,Industry,Revenue,Profit,Employee_count,Headquarters
0,Walmart,Retail,"$559,200","$19,742",2300000,United States
1,Sinopec Group,Oil and gas,"$407,009","$6,793",582648,China
2,Amazon,"Retail, Information Technology","$386,064","$17,377",1298000,United States
3,State Grid,Electricity,"$383,906","$7,970",907677,China
4,China National Petroleum,Oil and gas,"$379,130","$4,433",1344410,China
5,Royal Dutch Shell,Oil and gas,"$344,379","$15,842",83000,Netherlands United Kingdom
6,Saudi Aramco,Oil and gas,"$329,784","$88,211",79000,Saudi Arabia
7,Volkswagen,Automotive,"$282,760","$15,542",671205,Germany
8,BP,Oil and gas,"$282,610","$4,026",72500,United Kingdom
9,Toyota,Automotive,"$275,288","$19,096",359542,Japan


### **Removing '$' signs and commas from numeric columns**

In [155]:
# Strip '$' from both ends of each revenue string, then drop the thousands separators
revenue_text = largest_companies['Revenue'].str.strip('$')
largest_companies['Revenue'] = revenue_text.str.replace(',', '')

In [156]:
# The Profit column has a negative entry (index 16), where the '$' follows the
# minus sign and str.strip('$') cannot reach it. Remove every '$' and ',' as
# literal characters instead, which leaves the minus sign in place.
# regex=False is explicit because '$' is the end-of-string anchor when the
# pattern is interpreted as a regular expression.
largest_companies['Profit'] = (
    largest_companies['Profit']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)


In [157]:
# Drop the thousands separators from the employee counts
employee_text = largest_companies['Employee_count']
largest_companies['Employee_count'] = employee_text.str.replace(',', '')

In [158]:
# Display the frame to check the result of the string clean-up on the numeric columns
largest_companies

Unnamed: 0,Name,Industry,Revenue,Profit,Employee_count,Headquarters
0,Walmart,Retail,559200,19742,2300000,United States
1,Sinopec Group,Oil and gas,407009,6793,582648,China
2,Amazon,"Retail, Information Technology",386064,17377,1298000,United States
3,State Grid,Electricity,383906,7970,907677,China
4,China National Petroleum,Oil and gas,379130,4433,1344410,China
5,Royal Dutch Shell,Oil and gas,344379,15842,83000,Netherlands United Kingdom
6,Saudi Aramco,Oil and gas,329784,88211,79000,Saudi Arabia
7,Volkswagen,Automotive,282760,15542,671205,Germany
8,BP,Oil and gas,282610,4026,72500,United Kingdom
9,Toyota,Automotive,275288,19096,359542,Japan


### **Converting datatypes**

In [159]:
# Inspect the column dtypes before converting the numeric columns
largest_companies.dtypes

Name              object
Industry          object
Revenue           object
Profit            object
Employee_count    object
Headquarters      object
dtype: object

In [160]:
# Convert each cleaned text column to a numeric dtype, one column at a time
for column_name in ('Revenue', 'Profit', 'Employee_count'):
    largest_companies[column_name] = pd.to_numeric(largest_companies[column_name])

In [161]:
# Re-check the column dtypes after the pd.to_numeric conversion
largest_companies.dtypes

Name              object
Industry          object
Revenue            int64
Profit             int64
Employee_count     int64
Headquarters      object
dtype: object

In [163]:
# Display the frame after the numeric conversion
largest_companies

Unnamed: 0,Name,Industry,Revenue,Profit,Employee_count,Headquarters
0,Walmart,Retail,559200,19742,2300000,United States
1,Sinopec Group,Oil and gas,407009,6793,582648,China
2,Amazon,"Retail, Information Technology",386064,17377,1298000,United States
3,State Grid,Electricity,383906,7970,907677,China
4,China National Petroleum,Oil and gas,379130,4433,1344410,China
5,Royal Dutch Shell,Oil and gas,344379,15842,83000,Netherlands United Kingdom
6,Saudi Aramco,Oil and gas,329784,88211,79000,Saudi Arabia
7,Volkswagen,Automotive,282760,15542,671205,Germany
8,BP,Oil and gas,282610,4026,72500,United Kingdom
9,Toyota,Automotive,275288,19096,359542,Japan


### **Saving the DataFrame into a csv file**

In [167]:
# Write the frame to disk without the row index
largest_companies.to_csv(path_or_buf='largest_companies_by_revenue.csv', index=False)

In [168]:
# Read the saved file back to confirm it was written correctly
pd.read_csv(filepath_or_buffer='largest_companies_by_revenue.csv')

Unnamed: 0,Name,Industry,Revenue,Profit,Employee_count,Headquarters
0,Walmart,Retail,559200,19742,2300000,United States
1,Sinopec Group,Oil and gas,407009,6793,582648,China
2,Amazon,"Retail, Information Technology",386064,17377,1298000,United States
3,State Grid,Electricity,383906,7970,907677,China
4,China National Petroleum,Oil and gas,379130,4433,1344410,China
5,Royal Dutch Shell,Oil and gas,344379,15842,83000,Netherlands United Kingdom
6,Saudi Aramco,Oil and gas,329784,88211,79000,Saudi Arabia
7,Volkswagen,Automotive,282760,15542,671205,Germany
8,BP,Oil and gas,282610,4026,72500,United Kingdom
9,Toyota,Automotive,275288,19096,359542,Japan
