# Scraping

In [116]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [117]:
# Wikipedia article that hosts the "2024 Forbes list" table of India's largest companies.
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_India#2024_Forbes_list"

# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic "html" feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results can vary by machine.
# "html.parser" ships with Python and needs no extra dependency.
soup = BeautifulSoup(page.text, "html.parser")

In [118]:
# Take the first <table> matched by the "wikitable sortable" class filter.
table = soup.find("table", attrs={"class": "wikitable sortable"})

In [119]:
# Collect the <th> header cells found inside the selected table.
Titles_html = table.find_all('th')
Titles_html

[<th align="center">Rank
 </th>,
 <th align="center">Forbes<br/> 2000 rank
 </th>,
 <th align="center">Name
 </th>,
 <th align="center">Headquarters
 </th>,
 <th align="center">Revenue<br/>(billions US$)
 </th>,
 <th align="center">Profit<br/>(billions US$)
 </th>,
 <th align="center">Assets<br/>(billions US$)
 </th>,
 <th align="center">Value<br/>(billions US$)
 </th>,
 <th align="center">Industry
 </th>]

In [120]:
# Reduce each header cell to its text with surrounding whitespace removed.
# A <br/> inside a header contributes no text, which is why names such as
# 'Revenue(billions US$)' come out without a space before the parenthesis.
Titles = [header_cell.get_text().strip() for header_cell in Titles_html]
Titles

['Rank',
 'Forbes 2000 rank',
 'Name',
 'Headquarters',
 'Revenue(billions US$)',
 'Profit(billions US$)',
 'Assets(billions US$)',
 'Value(billions US$)',
 'Industry']

In [121]:
# Empty frame whose columns are the scraped header titles; rows are added later.
df = pd.DataFrame(columns=Titles)
df

Unnamed: 0,Rank,Forbes 2000 rank,Name,Headquarters,Revenue(billions US$),Profit(billions US$),Assets(billions US$),Value(billions US$),Industry


In [122]:
# All <tr> elements of the table. Despite the variable name, each element is a
# table row; the first one is skipped later as the header row.
column_data = table.find_all("tr")

In [123]:
# Collect the cell text of every row after the first (the header row).
scraped_rows = []
for row in column_data[1:]:
    row_data = row.find_all('td')
    Rows = [data.text.strip() for data in row_data]

    if not Rows:
        # A row without any <td> cells (e.g. a header-only row) carries no data.
        continue
    if len(Rows) != len(Titles):
        # Fail loudly rather than let pandas pad or misalign a ragged row.
        raise ValueError(
            f"Expected {len(Titles)} cells per row, found {len(Rows)}: {Rows}"
        )
    scraped_rows.append(Rows)

# Build the frame in a single step. Appending with df.loc[len(df)] copies data on
# every insertion and is not idempotent: re-running the cell would append every
# row a second time. Rebuilding from the collected rows always gives the same frame.
df = pd.DataFrame(scraped_rows, columns=Titles)

In [144]:
# Preview the first five rows of the frame.
df.head(5)

Unnamed: 0,Rank,Forbes Rank,Name,HQ,Revenue (USD Billions),Profit (USD Billions),Assets (USD Billions),Market Value (USD Billions),Industry
0,1,49,Reliance Industries Limited,Mumbai,108.8,8.4,210.5,233.1,Conglomerate
1,2,55,State Bank of India,Mumbai,71.8,8.1,807.4,87.6,Banking
2,3,65,HDFC Bank,Mumbai,49.3,7.7,483.2,133.6,Banking
3,4,70,Life Insurance Corporation,New Delhi,98.0,4.9,561.4,73.6,Insurance
4,5,142,ICICI Bank,Mumbai,28.5,5.3,283.5,95.3,Banking


# Cleaning and Preprocessing the Data

In [125]:
# Inspect column dtypes and non-null counts; scraped cell text is stored as
# strings, so every column is still object dtype at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71 entries, 0 to 70
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Rank                   71 non-null     object
 1   Forbes 2000 rank       71 non-null     object
 2   Name                   71 non-null     object
 3   Headquarters           71 non-null     object
 4   Revenue(billions US$)  71 non-null     object
 5   Profit(billions US$)   71 non-null     object
 6   Assets(billions US$)   71 non-null     object
 7   Value(billions US$)    71 non-null     object
 8   Industry               71 non-null     object
dtypes: object(9)
memory usage: 5.5+ KB


Rename the Columns

In [126]:
# Positional rename: the order of this list must match the order of the scraped
# columns, because names are assigned by position rather than looked up by old name.
df.columns = [
    "Rank",
    "Forbes Rank",
    "Name",
    "HQ",
    "Revenue (USD Billions)",
    "Profit (USD Billions)",
    "Assets (USD Billions)",
    "Market Value (USD Billions)",
    "Industry",
]

In [143]:
# Display only: set_index returns a new frame and the result is not assigned,
# so df itself keeps its default index.
df.set_index("Rank").head(5)

Unnamed: 0_level_0,Forbes Rank,Name,HQ,Revenue (USD Billions),Profit (USD Billions),Assets (USD Billions),Market Value (USD Billions),Industry
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,49,Reliance Industries Limited,Mumbai,108.8,8.4,210.5,233.1,Conglomerate
2,55,State Bank of India,Mumbai,71.8,8.1,807.4,87.6,Banking
3,65,HDFC Bank,Mumbai,49.3,7.7,483.2,133.6,Banking
4,70,Life Insurance Corporation,New Delhi,98.0,4.9,561.4,73.6,Insurance
5,142,ICICI Bank,Mumbai,28.5,5.3,283.5,95.3,Banking


In [128]:
# Export the scraped data to a CSV file. index=False leaves out the
# auto-generated row index, which would otherwise be written as an extra unnamed
# first column (read back as "Unnamed: 0"); the Rank column already numbers the rows.
df.to_csv("2024 Forbes list.csv", index=False)

Convert numerical columns to appropriate data types

In [129]:
# Columns that hold numbers but were scraped as text.
numeric_columns = [
    'Forbes Rank',
    'Revenue (USD Billions)',
    'Profit (USD Billions)',
    'Assets (USD Billions)',
    'Market Value (USD Billions)',
]

for column in numeric_columns:
    # Normalise notation that pd.to_numeric cannot parse: thousands separators
    # and the typographic minus sign (U+2212), which web tables may use for
    # negative figures. Left as-is, such cells would silently become NaN below.
    cleaned = (
        df[column]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('\u2212', '-', regex=False)
        .str.strip()
    )
    # Anything that is still not a number (e.g. a placeholder dash) becomes NaN
    # instead of raising.
    df[column] = pd.to_numeric(cleaned, errors='coerce')

In [163]:
df.describe() # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,Forbes Rank,Revenue (USD Billions),Profit (USD Billions),Assets (USD Billions),Market Value (USD Billions)
count,71.0,71.0,70.0,71.0,71.0
mean,1048.267606,17.812676,1.727143,71.621127,33.588732
std,608.968729,23.347499,1.927522,132.730041,37.901853
min,49.0,0.2,0.0,1.3,1.1
25%,535.0,4.15,0.5,10.55,12.0
50%,947.0,10.8,0.95,27.4,23.6
75%,1695.0,17.75,2.275,58.2,39.0
max,1980.0,108.8,8.4,807.4,233.1


# Analyse the Data with Plotly

In [130]:
import plotly.graph_objs as go
import plotly.express as px

Top 5 Revenue Generating Companies

In [131]:
# The five rows with the largest revenue, reduced to company name and revenue.
top_revenue = (
    df[['Name', 'Revenue (USD Billions)']]
    .nlargest(5, 'Revenue (USD Billions)')
)
print("Top 5 Companies by Revenue:", top_revenue, sep="\n")

Top 5 Companies by Revenue:
                              Name  Revenue (USD Billions)
0      Reliance Industries Limited                   108.8
3       Life Insurance Corporation                    98.0
6           Indian Oil Corporation                    93.8
5  Oil and Natural Gas Corporation                    77.5
1              State Bank of India                    71.8


In [154]:
# Bar chart of the top five companies by revenue, one colour per company and the
# revenue printed on each bar.
fig_revenue = px.bar(
    top_revenue,
    x='Name',
    y='Revenue (USD Billions)',
    title='Top 5 Revenue Generating Companies',
    color='Name',
    text='Revenue (USD Billions)',
)
# The bars are labelled on the x axis, so the legend is hidden; the y-axis title,
# grid and tick labels are removed because the value is shown on each bar.
fig_revenue.update_layout(showlegend=False, yaxis_title=None, width=1200, height=500)
fig_revenue.update_traces(textposition='auto', texttemplate='%{text:.2f} Billions')
fig_revenue.update_yaxes(showgrid=False, showticklabels=False)
fig_revenue

Top 5 Companies by Profit

In [133]:
# The five rows with the largest profit, reduced to company name and profit.
top_profit = (
    df[['Name', 'Profit (USD Billions)']]
    .nlargest(5, 'Profit (USD Billions)')
)
print("\nTop 5 Companies by Profit:", top_profit, sep="\n")


Top 5 Companies by Profit:
                           Name  Profit (USD Billions)
0   Reliance Industries Limited                    8.4
1           State Bank of India                    8.1
2                     HDFC Bank                    7.7
11    Tata Consultancy Services                    5.5
4                    ICICI Bank                    5.3


In [155]:
# Bar chart for Top 5 Companies by Profit
fig_profit = px.bar(top_profit, x='Name', y='Profit (USD Billions)', title='Top 5 Profit Distribution of Companies', color="Name", text="Profit (USD Billions)")
fig_profit.update_layout(showlegend = False)
fig_profit.update_layout(yaxis_title = None, xaxis_title = "Company Name", width = 1200, height = 500 )
fig_profit.update_traces(textposition="auto", texttemplate = "%{text:.1f} Billions")
fig_profit.update_yaxes(showticklabels =False, showgrid=False)
fig_profit

Top Companies Ranked by their Asset Value

In [135]:
# The ten rows with the largest assets, reduced to company name and assets.
# NOTE: the variable is spelled `top_assert` because the treemap cell reads it
# under that name.
top_assert = (
    df[['Name', 'Assets (USD Billions)']]
    .nlargest(10, 'Assets (USD Billions)')
)
print("\n Top 10 Companies with their assets", top_assert, sep="\n")


 Top 10 Companies with their assets
                           Name  Assets (USD Billions)
1           State Bank of India                  807.4
3    Life Insurance Corporation                  561.4
2                     HDFC Bank                  483.2
4                    ICICI Bank                  283.5
0   Reliance Industries Limited                  210.5
12               Bank of Baroda                  198.4
20         Punjab National Bank                  191.7
16                  Canara Bank                  184.0
8                     Axis Bank                  182.0
21          Union Bank of India                  168.1


In [156]:
# Treemap with one tile per company, sized by its assets.
fig_assert = px.treemap(
    top_assert,
    path=["Name"],
    values="Assets (USD Billions)",
    title="Top Companies Ranked by their Asset Value",
)
fig_assert

Revenue of each company, grouped by industry

In [153]:
# Revenue for every (Industry, Name) pair. Each company presumably appears once
# in the table, in which case the mean is simply that company's own revenue.
industry_Name = df.groupby(["Industry", "Name"])['Revenue (USD Billions)'].mean()

# Lift the display limits only while printing this listing. Calling
# pd.set_option here would change the limits globally and make every later cell
# print frames in full as well.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(industry_Name)

Industry               Name                              
Aerospace and defense  Bharat Electronics                      2.2
Airline                IndiGo                                  7.9
Automotive             Bajaj Auto                              5.4
                       Mahindra & Mahindra                    16.8
                       Motherson Sumi Systems                 11.4
                       Tata Motors                            52.9
Banking                Axis Bank                              16.7
                       Bank of Baroda                         17.1
                       Bank of India                           8.1
                       Bank of Maharashtra                     2.8
                       Canara Bank                            16.8
                       Central Bank of India                   4.3
                       Federal Bank                            3.2
                       HDFC Bank                              49.3
    

Top 10 Industry-wise average revenue

In [148]:
# Mean revenue per industry, highest first.
industry_revenue = (
    df.groupby('Industry')['Revenue (USD Billions)']
    .mean()
    .sort_values(ascending=False)
)
# Keep the ten industries with the highest mean revenue and display them.
top_industry = industry_revenue.iloc[:10]
top_industry

Industry
Conglomerate          108.800000
Oil and gas            60.375000
Insurance              51.850000
Automotive             21.625000
Metals and mining      20.233333
Gems and jewellery     19.400000
Infotech               17.950000
Iron and steel         17.075000
Banking                16.547059
Diversified            15.300000
Name: Revenue (USD Billions), dtype: float64

In [140]:
# Donut chart (hole=.5) of the top-10 industries: slice labels come from the
# Series index (industry names) and slice sizes from its values (mean revenue).
fig_industry = px.pie(top_industry, names=top_industry.index, values=top_industry, title='Top 10 Industry-wise Average Revenue', 
                      color_discrete_sequence=px.colors.sequential.RdBu, hole=.5)
fig_industry.update_layout(width=800, height=500)
fig_industry

Distribution of Headquarters

In [141]:
# Number of companies per headquarters city, most common city first.
hq_count = df['HQ'].value_counts()
hq_count

HQ
Mumbai       29
New Delhi    11
Bangalore     7
Gurgaon       5
Ahmedabad     4
Kolkata       3
Pune          3
Chennai       3
Noida         2
Hyderabad     2
Vadodara      1
Kochi         1
Name: count, dtype: int64

In [142]:
# Bubble chart with one marker per headquarters city: the company count drives
# the y position, the marker size and the text label; colour is keyed by city.
fig_hq = px.scatter(hq_count, x=hq_count.index,y=hq_count.values, 
                    size=hq_count.values, color=hq_count.index, 
                    text=hq_count.values, 
                    title="Distribution of Company Headquarters")
fig_hq

Correlation Analysis 

In [174]:
# Pairwise correlation matrix of the four financial columns
# (DataFrame.corr uses its default method).
correlation = df[
    [
        'Revenue (USD Billions)',
        'Profit (USD Billions)',
        'Assets (USD Billions)',
        'Market Value (USD Billions)',
    ]
].corr()
correlation

Unnamed: 0,Revenue (USD Billions),Profit (USD Billions),Assets (USD Billions),Market Value (USD Billions)
Revenue (USD Billions),1.0,0.812091,0.573631,0.585903
Profit (USD Billions),0.812091,1.0,0.692479,0.775782
Assets (USD Billions),0.573631,0.692479,1.0,0.422375
Market Value (USD Billions),0.585903,0.775782,0.422375,1.0


In [172]:
# Heatmap of the correlation matrix; the colour scale is pinned to [-1, 1] so
# that zero correlation sits at the midpoint of the diverging RdBu palette.
fig_Correlation = go.Figure(
    data=go.Heatmap(
        z=correlation,
        x=correlation.columns,
        y=correlation.columns,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
    )
)
fig_Correlation.update_layout(title='Correlation Heatmap')
fig_Correlation