### Scraping the data from the website

* Importing the libraries

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

* Sending the HTTP GET request and receiving the response using the 'Requests' library

In [81]:
# Send the HTTP GET request for the Wikipedia page.

# Browser-like User-Agent header, used in case the site answers
# 403 (Forbidden) to the default client identification.
headers = {
    'user-agent' : 'Chrome/143.0.0.0 Safari/537.36'
}

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_India"

# Without a timeout, requests waits indefinitely on an unresponsive server
# and the cell would hang. The value is in seconds.
response = requests.get(url, headers=headers, timeout=30)

# response.ok is True when the status code is below 400.
if response.ok:
   message =  "The request was succesfull"
else:
   message = "The request was not succesfull"

print("status code :", response.status_code)
print(message)



status code : 200
The request was succesfull


* Parsing the HTML content using 'BeautifulSoup' library

In [None]:
# Build a BeautifulSoup parse tree from the response body.
# response.text is the body of the HTTP response as a string;
# "html.parser" selects Python's built-in HTML parser.
html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")

# prettify() returns the parsed document as an indented string,
# which makes the page structure easier to read.
pretty_html = soup.prettify()
print(pretty_html)

* Finding all the tables from html content using find_all method.


In [96]:
# Collect every <table> element in the parsed page and display the result.
all_tables = soup.find_all("table")
all_tables

* Identifying and extracting column headers from the HTML table.


In [16]:
#Finding the table header
# soup.find returns the first matching element, so this is the first <tr> on the page.
# NOTE: despite its name, table1 holds a single row (<tr>) here, not a whole table.
table1 = soup.find("tr")
print(table1)

<tr>
<th align="center">Rank
</th>
<th align="center">Forbes<br/> 2000 rank
</th>
<th align="center">Name
</th>
<th align="center">Headquarters
</th>
<th align="center">Revenue<br/>(billions US$)
</th>
<th align="center">Profit<br/>(billions US$)
</th>
<th align="center">Assets<br/>(billions US$)
</th>
<th align="center">Value<br/>(billions US$)
</th>
<th align="center">Industry
</th></tr>


* Storing the table headers in a list.


In [17]:
# Walk over the children of the header row and keep the stripped text of each.
# The children include the text between the <th> tags as well as the tags
# themselves, which is why the resulting list also contains empty strings.

list_table = []
for child in table1:
    list_table.append(child.text.strip())
print(list_table)

['', 'Rank', '', 'Forbes 2000 rank', '', 'Name', '', 'Headquarters', '', 'Revenue(billions US$)', '', 'Profit(billions US$)', '', 'Assets(billions US$)', '', 'Value(billions US$)', '', 'Industry']


In [21]:
# Drop the empty entries from the list above so only the real header names remain.
# An entry is kept when it still has content after stripping whitespace.

clean_table_headers = []
for header in list_table:
    if header.strip():
        clean_table_headers.append(header)
print(clean_table_headers)

['Rank', 'Forbes 2000 rank', 'Name', 'Headquarters', 'Revenue(billions US$)', 'Profit(billions US$)', 'Assets(billions US$)', 'Value(billions US$)', 'Industry']


* The extracted table headers are stored as column names in a Pandas DataFrame.


In [22]:
#Create a DataFrame and add the columns
# No row data is supplied, so the frame starts empty with one column per scraped header.

df = pd.DataFrame(columns=clean_table_headers)
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


* Finding all the tables on the webpage and selecting the first one


In [None]:

# find_all returns every <table> on the page; index 0 keeps the first one.
# NOTE: this rebinds table1, which an earlier cell used for the header row (<tr>).
table1 = soup.find_all("table")[0]
print(table1)

* Storing the extracted table row data in a list for further processing.


In [99]:

# Every <tr> of the selected table, header row included.
rows = table1.find_all("tr")

# rows[1:] skips the first row; each remaining row is reduced to the
# stripped text of its <td> cells and printed for inspection.
for table_row in rows[1:]:
    cells = table_row.find_all("td")
    cell_texts = [cell.text.strip() for cell in cells]
    print(cell_texts)

['1', '49', 'Reliance Industries Limited', 'Mumbai', '108.8', '8.4', '210.5', '233.1', 'Conglomerate']
['2', '55', 'State Bank of India', 'Mumbai', '71.8', '8.1', '807.4', '87.6', 'Banking']
['3', '65', 'HDFC Bank', 'Mumbai', '49.3', '7.7', '483.2', '133.6', 'Banking']
['4', '70', 'Life Insurance Corporation', 'New Delhi', '98.0', '4.9', '561.4', '73.6', 'Insurance']
['5', '142', 'ICICI Bank', 'Mumbai', '28.5', '5.3', '283.5', '95.3', 'Banking']
['6', '207', 'Oil and Natural Gas Corporation', 'New Delhi', '77.5', '5.1', '80.6', '41.9', 'Oil and gas']
['7', '259', 'Indian Oil Corporation', 'New Delhi', '93.8', '5.0', '57.8', '27.8', 'Oil and gas']
['8', '284', 'Tata Motors', 'Mumbai', '52.9', '3.8', '44.4', '43.8', 'Automotive']
['9', '293', 'Axis Bank', 'Mumbai', '16.7', '3.2', '182.0', '42.3', 'Banking']
['10', '372', 'NTPC Limited', 'New Delhi', '21.2', '2.4', '54.7', '42.5', 'Utilities']
['11', '398', 'Larsen & Toubro', 'Mumbai', '26.7', '1.6', '40.7', '56.9', 'Capital goods']
['12'

In [67]:
# Display the DataFrame; the recorded output below shows only the column headers (no rows).
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


* Adding the extracted data to the DataFrame.
* We can insert rows into the DataFrame using the .loc attribute


In [71]:

# Collect every data row first, then build the DataFrame in a single step.
# Appending with df.loc[len(df)] grows the frame one row at a time, and
# re-running the cell would add the same rows again; rebuilding df from the
# collected records makes the cell safe to re-run.
records = []
for i in rows[1:]:  # rows[1:] skips the first <tr> of the table
    clean_row = [j.text.strip() for j in i.find_all("td")]

    # A row without any <td> cells carries no data, so it is skipped.
    if not clean_row:
        continue

    # Fail with a clear message if a row does not line up with the columns.
    if len(clean_row) != len(df.columns):
        raise ValueError(
            f"Expected {len(df.columns)} cells per row, got {len(clean_row)}: {clean_row}"
        )
    records.append(clean_row)

# Reuse the column names already set on df.
df = pd.DataFrame(records, columns=df.columns)

* The table is created to organize the scraped data in a structured format.


In [70]:
#Table is created
# A bare expression on the last line of a cell is rendered as the cell's output.
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry
0,1,49,Reliance Industries Limited,Mumbai,108.8,8.4,210.5,233.1,Conglomerate
1,2,55,State Bank of India,Mumbai,71.8,8.1,807.4,87.6,Banking
2,3,65,HDFC Bank,Mumbai,49.3,7.7,483.2,133.6,Banking
3,4,70,Life Insurance Corporation,New Delhi,98.0,4.9,561.4,73.6,Insurance
4,5,142,ICICI Bank,Mumbai,28.5,5.3,283.5,95.3,Banking
...,...,...,...,...,...,...,...,...,...
66,65,1895,Dr. Reddy's Laboratories,Hyderabad,3.4,0.7,4.6,11.6,Pharmaceuticals
67,66,1908,Varun Beverages,Gurgaon,2.0,0.3,1.8,23.6,Beverages
68,67,1949,CIFCL,Chennai,2.3,0.4,18.8,13.0,Financials
69,68,1957,NMDC,Hyderabad,2.5,0.8,3.9,9.7,Mining


* The extracted data is saved in CSV format.


In [75]:
#Saving the DataFrame in CSV format
# Relative path: the file is written to the current working directory.
filepath = "largest_companies_in_india.csv"

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(filepath, index=False)