# Scraping

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [3]:
# Download the Wikipedia article and parse it. The "#2024_Forbes_list" fragment
# is a client-side anchor only, so the response is always the complete page.
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_India#2024_Forbes_list"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page and
# failing later with a confusing "NoneType" error when the table is missing.
page.raise_for_status()
# Name a concrete parser: the bare "html" feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, which can differ per machine.
soup = BeautifulSoup(page.text, "html.parser")

In [4]:
# Take the first <table> that matches the class string "wikitable sortable";
# every later cell reads its headers and rows from this one element.
table = soup.find('table', attrs={'class': "wikitable sortable"})

In [5]:
# Collect every <th> element of the selected table (still as HTML tags).
Titles_html = table.find_all(name="th")
Titles_html

[<th align="center">Rank
 </th>,
 <th align="center">Forbes<br/> 2000 rank
 </th>,
 <th align="center">Name
 </th>,
 <th align="center">Headquarters
 </th>,
 <th align="center">Revenue<br/>(billions US$)
 </th>,
 <th align="center">Profit<br/>(billions US$)
 </th>,
 <th align="center">Assets<br/>(billions US$)
 </th>,
 <th align="center">Value<br/>(billions US$)
 </th>,
 <th align="center">Industry
 </th>]

In [6]:
# Reduce each header tag to its stripped text. A <br/> inside a header has no
# text of its own, so the two halves are joined without a space.
Titles = []
for header_cell in Titles_html:
    Titles.append(header_cell.text.strip())
Titles

['Rank',
 'Forbes 2000 rank',
 'Name',
 'Headquarters',
 'Revenue(billions US$)',
 'Profit(billions US$)',
 'Assets(billions US$)',
 'Value(billions US$)',
 'Industry']

In [7]:
# Start from an empty frame whose columns are the scraped header titles.
df = pd.DataFrame(data=None, columns=Titles)
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


In [8]:
# Despite the name, this holds the table's <tr> elements, i.e. its rows.
column_data = table.find_all(name="tr")

In [9]:
# Collect the text of every data row first, then build the frame in one call.
# Appending with df.loc[len(df)] grew the frame one row at a time and extended
# whatever df already held, so re-running the cell duplicated every company.
table_rows = []
for row in column_data[1:]:  # the first <tr> is skipped, as before (header row)
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # a <tr> without any <td> has no data and would not fit the columns
        table_rows.append(cells)

# Rebuilding df from scratch makes this cell safe to run more than once.
df = pd.DataFrame(table_rows, columns=Titles)

In [10]:
# Display the scraped frame; at this point every column still holds text.
df 

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry
0,1,49,Reliance Industries Limited,Mumbai,108.8,8.4,210.5,233.1,Conglomerate
1,2,55,State Bank of India,Mumbai,71.8,8.1,807.4,87.6,Banking
2,3,65,HDFC Bank,Mumbai,49.3,7.7,483.2,133.6,Banking
3,4,70,Life Insurance Corporation,New Delhi,98.0,4.9,561.4,73.6,Insurance
4,5,142,ICICI Bank,Mumbai,28.5,5.3,283.5,95.3,Banking
...,...,...,...,...,...,...,...,...,...
66,65,1895,Dr. Reddy's Laboratories,Hyderabad,3.4,0.7,4.6,11.6,Pharmaceuticals
67,66,1908,Varun Beverages,Gurgaon,2.0,0.3,1.8,23.6,Beverages
68,67,1949,CIFCL,Chennai,2.3,0.4,18.8,13.0,Financials
69,68,1957,NMDC,Hyderabad,2.5,0.8,3.9,9.7,Mining


In [11]:
# Persist the scraped table as CSV text, before any cleaning is applied.
# NOTE(review): the file name has no ".csv" extension and the row index is
# written as an extra first column; confirm both are intended.
df.to_csv(path_or_buf="2024 Forbes list")

# Cleaning and Preprocessing the Data

In [13]:
# Print the column dtypes and non-null counts of the frame before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71 entries, 0 to 70
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Rank                   71 non-null     object
 1   Forbes 2000 rank       71 non-null     object
 2   Name                   71 non-null     object
 3   Headquarters           71 non-null     object
 4   Revenue(billions US$)  71 non-null     object
 5   Profit(billions US$)   71 non-null     object
 6   Assets(billions US$)   71 non-null     object
 7   Value(billions US$)    71 non-null     object
 8   Industry               71 non-null     object
dtypes: object(9)
memory usage: 5.5+ KB


Rename the Columns

In [15]:
# The new labels are assigned by position, so this list must stay in the same
# order as the scraped header row.
renamed_columns = [
    'Rank',
    'Forbes Rank',
    'Name',
    'HQ',
    'Revenue (USD Billions)',
    'Profit (USD Billions)',
    'Assets (USD Billions)',
    'Market Value (USD Billions)',
    'Industry',
]
df.columns = renamed_columns

Convert numerical columns to appropriate data types

In [17]:
# Turn the scraped text into numbers, one column at a time. errors='coerce'
# replaces any value that cannot be parsed with NaN instead of raising.
# 'Rank' is not part of this list and keeps its scraped text values.
for column in (
    'Forbes Rank',
    'Revenue (USD Billions)',
    'Profit (USD Billions)',
    'Assets (USD Billions)',
    'Market Value (USD Billions)',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Analyse the Data

Top 5 companies by revenue

In [20]:
# Keep only the name and revenue columns, then pick the five largest revenues.
revenue_col = 'Revenue (USD Billions)'
top_revenue = df[['Name', revenue_col]].nlargest(5, revenue_col)
print("Top 5 Companies by Revenue:")
print(top_revenue)

Top 5 Companies by Revenue:
                              Name  Revenue (USD Billions)
0      Reliance Industries Limited                   108.8
3       Life Insurance Corporation                    98.0
6           Indian Oil Corporation                    93.8
5  Oil and Natural Gas Corporation                    77.5
1              State Bank of India                    71.8


Top 5 companies by profit

In [22]:
# Keep only the name and profit columns, then pick the five largest profits.
profit_col = 'Profit (USD Billions)'
top_profit = df[['Name', profit_col]].nlargest(5, profit_col)
print("\nTop 5 Companies by Profit:")
print(top_profit)


Top 5 Companies by Profit:
                           Name  Profit (USD Billions)
0   Reliance Industries Limited                    8.4
1           State Bank of India                    8.1
2                     HDFC Bank                    7.7
11    Tata Consultancy Services                    5.5
4                    ICICI Bank                    5.3


Industry-wise average revenue

In [23]:
# Mean revenue of the listed companies per industry, highest average first.
revenue_by_industry = df.groupby('Industry')['Revenue (USD Billions)']
industry_revenue = revenue_by_industry.mean().sort_values(ascending=False)
print("\nIndustry-wise Average Revenue:")
print(industry_revenue)


Industry-wise Average Revenue:
Industry
Conglomerate             108.800000
Oil and gas               60.375000
Insurance                 51.850000
Automotive                21.625000
Metals and mining         20.233333
Gems and jewellery        19.400000
Infotech                  17.950000
Iron and steel            17.075000
Banking                   16.547059
Diversified               15.300000
Capital goods             14.000000
Utilities                 11.400000
Telecommunication         10.800000
Consumer Goods             8.500000
Airline                    7.900000
Holding                    6.450000
Energy                     6.100000
Retail                     6.100000
Financials                 5.357143
Pharmaceuticals            4.550000
Chemicals                  4.300000
Shipping                   3.200000
Mining                     2.500000
Aerospace and defense      2.200000
Beverages                  2.000000
Renewable energy           1.100000
Real estate            

# Create a Dashboard Using Plotly

In [24]:
import plotly.graph_objs as go
import plotly.express as px

In [27]:
# Bar chart for Top 5 Companies by Revenue
fig_revenue = px.bar(
    data_frame=top_revenue,
    x='Name',
    y='Revenue (USD Billions)',
    title='Top 5 Companies by Revenue',
)
fig_revenue

In [29]:
# Bar chart of the five most profitable companies, one bar per company.
fig_profit = px.bar(
    data_frame=top_profit,
    x='Name',
    y='Profit (USD Billions)',
    title='Top 5 Companies by Profit',
)
fig_profit

In [30]:
# Pie chart with one slice per industry. Each slice is sized by that industry's
# mean revenue, so the shares compare averages rather than total revenue.
fig_industry = px.pie(
    data_frame=industry_revenue,
    names=industry_revenue.index,
    values=industry_revenue,
    title='Industry-wise Average Revenue',
)
fig_industry