In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Page that hosts the Forbes Global 2000 ranking table.
url = "https://www.forbes.com/lists/global2000/"

# Without a timeout, requests waits indefinitely on a stalled connection,
# which would freeze the notebook on "Run All".
response = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx response so an error page is never parsed as if it
# were the ranking table (that would silently produce zero rows later).
response.raise_for_status()

# Parse the returned HTML with the parser bundled with the standard library.
soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# Collect every <a> element carrying the "table-row" class; the extraction
# step below treats each match as one company entry.
rows = soup.select("a.table-row")
# Sanity check: report how many entries were matched.
print(len(rows))

2000


In [4]:
# Output column name -> CSS selector (relative to one table row) of the
# element holding that column's text.
FIELD_SELECTORS = {
    "Rank": "div.rank .row-cell-value",
    "Company": "div.organizationName .row-cell-value",
    "Headquarters": "div.headquarters .row-cell-value",
    "Industry": "div.industry .row-cell-value",
    "Revenue ($)": "div.revenue .row-cell-value",
    "Profits ($)": "div.profits .row-cell-value",
    "Assets ($)": "div.assets .row-cell-value",
    "Market Value ($)": "div.marketValue .row-cell-value",
}


def cell_text(row, selector):
    """Return the stripped text of the first match for ``selector`` in ``row``.

    Parameters
    ----------
    row : element exposing ``select_one`` (one entry of ``rows``)
    selector : str
        CSS selector evaluated relative to ``row``.

    Returns
    -------
    str or None
        The matched element's text with surrounding whitespace removed, or
        ``None`` when nothing under ``row`` matches the selector.
    """
    cell = row.select_one(selector)
    return cell.get_text(strip=True) if cell is not None else None


# One dict per table row, with the same keys in the same order for every row.
# The helper is defined once and receives the row explicitly, instead of being
# re-created as a closure on every loop iteration.
data = [
    {column: cell_text(row, selector) for column, selector in FIELD_SELECTORS.items()}
    for row in rows
]

In [5]:
# Build the table from the list of per-company dicts: one row per dict,
# one column per dict key.
df = pd.DataFrame(data)

# Preview the first five rows.
print(df.head())

  Rank                                   Company   Headquarters  \
0    1                             JPMorganChase  United States   
1    2                        Berkshire Hathaway  United States   
2    3                                      ICBC          China   
3    4  Saudi Arabian Oil Company (Saudi Aramco)   Saudi Arabia   
4    5                                    Amazon  United States   

               Industry Revenue ($) Profits ($)   Assets ($) Market Value ($)  
0               Banking   $285.11 B    $59.36 B  $4,357.86 B         $677.8 B  
1             Insurance   $371.43 B       $89 B  $1,153.88 B      $1,145.46 B  
2               Banking   $221.96 B    $50.84 B   $6,688.6 B        $251.33 B  
3  Oil & Gas Operations   $480.15 B   $104.97 B    $645.03 B      $1,663.38 B  
4  Retail and Wholesale   $637.96 B    $59.25 B    $624.89 B      $2,005.64 B  


In [6]:
# List the column labels to confirm which fields ended up in the frame.
print(df.columns)

Index(['Rank', 'Company', 'Headquarters', 'Industry', 'Revenue ($)',
       'Profits ($)', 'Assets ($)', 'Market Value ($)'],
      dtype='object')


In [7]:
# Set each of these pandas display options to None (no fixed limit) so that
# wide frames are not shortened when printed.
for option_name in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(option_name, None)

In [8]:
# Preview the first five rows again, now with the display options set above.
print(df.head())

  Rank                                   Company   Headquarters  \
0    1                             JPMorganChase  United States   
1    2                        Berkshire Hathaway  United States   
2    3                                      ICBC          China   
3    4  Saudi Arabian Oil Company (Saudi Aramco)   Saudi Arabia   
4    5                                    Amazon  United States   

               Industry Revenue ($) Profits ($)   Assets ($) Market Value ($)  
0               Banking   $285.11 B    $59.36 B  $4,357.86 B         $677.8 B  
1             Insurance   $371.43 B       $89 B  $1,153.88 B      $1,145.46 B  
2               Banking   $221.96 B    $50.84 B   $6,688.6 B        $251.33 B  
3  Oil & Gas Operations   $480.15 B   $104.97 B    $645.03 B      $1,663.38 B  
4  Retail and Wholesale   $637.96 B    $59.25 B    $624.89 B      $2,005.64 B  


In [15]:
# Columns whose values are money amounts stored as text (e.g. "$285.11 B")
# and therefore need converting to numbers.
num_cols = ['Revenue ($)', 'Profits ($)', 'Assets ($)', 'Market Value ($)']

In [12]:
import numpy as np

In [13]:
def clean_money_full(x):
    """Convert a money string such as ``"$4,357.86 B"`` to a plain float.

    Parameters
    ----------
    x : str, number, or missing value
        Text with an optional ``$`` sign, optional thousands separators and an
        optional magnitude suffix (``K``, ``M``, ``B`` or ``T``, in either
        case). Values that are already numeric are accepted as well.

    Returns
    -------
    float
        The amount scaled by its suffix (``B`` -> 1e9, ``M`` -> 1e6, ...), the
        unscaled number when there is no suffix, or ``NaN`` for missing or
        blank input.

    Raises
    ------
    ValueError
        If the text is not blank but its numeric part cannot be parsed.
    """
    # Scale factor for each supported magnitude suffix.
    multipliers = {
        'K': 1_000,
        'M': 1_000_000,
        'B': 1_000_000_000,
        'T': 1_000_000_000_000,
    }

    if pd.isna(x):
        return np.nan

    # Already numeric, e.g. the conversion cell was run a second time on
    # columns that were converted before: pass the value through rather than
    # failing on a string method that numbers do not have.
    if isinstance(x, (int, float, np.number)):
        return float(x)

    text = str(x).replace('$', '').replace(',', '').strip()

    # A blank cell carries no amount; treat it like a missing value.
    if not text:
        return np.nan

    suffix = text[-1].upper()
    if suffix in multipliers:
        # Drop only the trailing suffix character, then scale.
        return float(text[:-1].strip()) * multipliers[suffix]
    return float(text)

In [16]:
# Convert each money column from text to numbers.
for col in num_cols:
    # Skip columns that are already numeric so this cell can be re-run safely;
    # without the guard a second run would pass floats to the string parser.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].apply(clean_money_full)

In [20]:
# Print floats with thousands separators and no decimal places.
pd.set_option("display.float_format", '{:,.0f}'.format)

In [21]:
# Preview the first five rows after the money columns were converted.
print(df.head())

  Rank                                   Company   Headquarters  \
0    1                             JPMorganChase  United States   
1    2                        Berkshire Hathaway  United States   
2    3                                      ICBC          China   
3    4  Saudi Arabian Oil Company (Saudi Aramco)   Saudi Arabia   
4    5                                    Amazon  United States   

               Industry     Revenue ($)     Profits ($)        Assets ($)  \
0               Banking 285,110,000,000  59,360,000,000 4,357,860,000,000   
1             Insurance 371,430,000,000  89,000,000,000 1,153,880,000,000   
2               Banking 221,960,000,000  50,840,000,000 6,688,600,000,000   
3  Oil & Gas Operations 480,150,000,000 104,970,000,000   645,030,000,000   
4  Retail and Wholesale 637,960,000,000  59,250,000,000   624,890,000,000   

   Market Value ($)  
0   677,800,000,000  
1 1,145,460,000,000  
2   251,330,000,000  
3 1,663,380,000,000  
4 2,005,640,000,000  


In [9]:
import os

In [22]:
# Resolve the current user's home folder, then the "Downloads" folder in it.
home_dir = os.path.expanduser("~")
downloads_path = os.path.join(home_dir, "Downloads")

# Full path of the CSV file that the export step writes.
csv_name = "capstone_forbes_global_2000_2025.csv"
file_path = os.path.join(downloads_path, csv_name)

In [23]:
# Create the destination folder if it is missing; to_csv does not create
# parent folders and would otherwise fail on machines without one.
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(file_path, index=False, encoding="utf-8")
print(f"CSV saved to: {file_path}")

CSV saved to: C:\Users\USER\Downloads\capstone_forbes_global_2000_2025.csv
