# Web Scraping with Selectors

## 1. Create a Function to Scrape Company Infoboxes

### a

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as p
import re

In [44]:
def normalize_to_billion(value):
    """Normalize monetary values to billions of USD."""
    if not value:
        return None
    try:
        value = value.replace(",", "").replace("$", "").lower()
        if "billion" in value:
            return float(re.search(r"\d+(\.\d+)?", value).group())
        elif "million" in value:
            return float(re.search(r"\d+(\.\d+)?", value).group()) / 1000
    except Exception:
        return None
    return None

In [45]:
def get_company_info(wikipedia_url):
    """
    Extracts basic company information such as revenue, industry, etc.
    """
    try:
        # Request the page
        response = requests.get(wikipedia_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Locate the infobox
        infobox = soup.find("table", class_=lambda x: x and "infobox" in x)
        if not infobox:
            print("Infobox not found.")
            return None

        # Initialize data dictionary
        data = {
            "Company Name": None,
            "Industry": None,
            "Revenue (Billions USD)": None,
            "Net Income (Billions USD)": None,
            "Number of Employees": None,
            "Market Cap (Billions USD)": None,
        }

        # Extract rows from the infobox
        rows = infobox.find_all("tr")
        for row in rows:
            header = row.find("th")
            if header:
                header_text = header.text.strip().lower()
                value_cell = row.find("td")
                if value_cell:
                    value_text = value_cell.text.strip()
                    
                    if "industry" in header_text:
                        industries = re.split(r"(?<=[a-z])(?=[A-Z])|\n", value_text)
                        data["Industry"] = ", ".join([industry.strip() for industry in industries])
                    elif "revenue" in header_text:
                        data["Revenue (Billions USD)"] = normalize_to_billion(value_text)
                    elif "net income" in header_text:
                        data["Net Income (Billions USD)"] = normalize_to_billion(value_text)
                    elif "number of employees" in header_text:
                        employee_match = re.search(r"\b\d{1,3}(?:,\d{3})*\b", value_text)
                        data["Number of Employees"] = employee_match.group(0).replace(",", "") if employee_match else None
                    elif "market cap" in header_text:
                        data["Market Cap (Billions USD)"] = normalize_to_billion(value_text)

        # Set Company Name
        company_name = soup.find("h1", {"id": "firstHeading"}).text
        data["Company Name"] = company_name

        return data

    except Exception as e:
        print(f"Error in get_company_info: {e}")
        return None

In [46]:
# Testing with Apple Inc.
apple_url = "https://en.wikipedia.org/wiki/Apple_Inc."
apple_info = get_company_info(apple_url)
print(apple_info)


{'Company Name': 'Apple Inc.', 'Industry': 'Consumer electronics, Software services, Online services', 'Revenue (Billions USD)': 391.04, 'Net Income (Billions USD)': 93.74, 'Number of Employees': '164000', 'Market Cap (Billions USD)': None}


### b

In [67]:
def expand_company_info_with_key_people(data, infobox):
    """
    Expands the company data dictionary with key people information:
    CEO, Founder(s), and Founded (Year).
    """
    try:
        # Extract rows from the infobox
        rows = infobox.find_all("tr")
        for row in rows:
            header = row.find("th")
            if header:
                header_text = header.text.strip().lower()
                value_cell = row.find("td")
                if value_cell:
                    value_text = value_cell.text.strip()
                    
                    # CEO extraction
                    if "key people" in header_text:
                        # Split the key people section by newline or commas
                        key_people = re.split(r"\n|, ", value_text)
                        for person in key_people:
                            if "ceo" in person.lower():
                                ceo_match = re.search(r"(.*?)(?=\(|,|$)", person)
                                if ceo_match:
                                    data["CEO"] = ceo_match.group(1).strip()
                                    break  # Stop after finding the CEO
                    
                    # Founder(s) extraction
                    elif "founder" in header_text:
                        # Use regex to split concatenated names or separators
                        founders = re.split(r"(?:\s{2,}|\n|,| and )", value_text)
                        data["Founder(s)"] = ", ".join([founder.strip() for founder in founders if founder.strip()])
                    
                    # Founded (Year) extraction
                    elif "founded" in header_text:
                        year_match = re.search(r"\b(1[89]|20)\d{2}\b", value_text)
                        if year_match:
                            data["Founded (Year)"] = year_match.group()

    except Exception as e:
        print(f"Error while expanding key people information: {e}")


In [68]:
def get_company_info_with_key_people(wikipedia_url):
    """
    Combines the original get_company_info with expanded key people details.
    """
    try:
        # Request the page
        response = requests.get(wikipedia_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Locate the infobox
        infobox = soup.find("table", class_=lambda x: x and "infobox" in x)
        if not infobox:
            print("Infobox not found.")
            return None
        
        # Use the original function to extract basic data
        data = get_company_info(wikipedia_url)

        # Expand with key people information
        expand_company_info_with_key_people(data, infobox)
        
        return data

    except Exception as e:
        print(f"Error in get_company_info_with_key_people: {e}")
        return None


In [69]:
# Testing with Apple Inc.
apple_url = "https://en.wikipedia.org/wiki/Apple_Inc."
apple_info = get_company_info_with_key_people(apple_url)
print(apple_info)

{'Company Name': 'Apple Inc.', 'Industry': 'Consumer electronics, Software services, Online services', 'Revenue (Billions USD)': 391.04, 'Net Income (Billions USD)': 93.74, 'Number of Employees': '164000', 'Market Cap (Billions USD)': None, 'Founded (Year)': '1976', 'Founder(s)': 'Steve JobsSteve WozniakRonald Wayne', 'CEO': 'Arthur Levinson'}


## 2 Retrieve S&P 500 Table

In [70]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [71]:
# URL of the S&P 500 Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# Send a GET request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first table on the page (S&P 500 Constituents)
table = soup.find('table', {'class': 'wikitable'})

# Extract table rows contained in <tr> tags
rows = table.find_all('tr')

# Extract data from the table rows
data = []
for row in rows[1:]:  # Skip the header row
    cols = row.find_all('td')  # Extract each column (cell) in the row
    if cols:  # Ensure the row has data
        company_name = cols[1].text.strip()  # Correct column for company name
        link = cols[1].find('a')['href'] if cols[1].find('a') else None  # Link to Wikipedia page
        gics_sector = cols[2].text.strip()  # GICS sector
        gics_sub_industry = cols[3].text.strip()  # GICS sub-industry
        headquarters = cols[4].text.strip()  # Headquarters location
        
        # Append the row data
        data.append([
            company_name, 
            f"https://en.wikipedia.org{link}" if link else None,
            gics_sector, 
            gics_sub_industry, 
            headquarters
        ])

# Create a pandas DataFrame
df = pd.DataFrame(data, columns=[
    "Company Name", 
    "Link to Company Wikipedia page", 
    "GICS Sector", 
    "GICS Sub-Industry", 
    "Headquarters Location"
])

In [72]:
# Display the first five rows
print(df.head())

          Company Name                     Link to Company Wikipedia page  \
0                   3M                   https://en.wikipedia.org/wiki/3M   
1          A. O. Smith          https://en.wikipedia.org/wiki/A._O._Smith   
2  Abbott Laboratories  https://en.wikipedia.org/wiki/Abbott_Laboratories   
3               AbbVie               https://en.wikipedia.org/wiki/AbbVie   
4            Accenture            https://en.wikipedia.org/wiki/Accenture   

              GICS Sector               GICS Sub-Industry  \
0             Industrials        Industrial Conglomerates   
1             Industrials               Building Products   
2             Health Care           Health Care Equipment   
3             Health Care                   Biotechnology   
4  Information Technology  IT Consulting & Other Services   

     Headquarters Location  
0    Saint Paul, Minnesota  
1     Milwaukee, Wisconsin  
2  North Chicago, Illinois  
3  North Chicago, Illinois  
4          Dublin, Irelan

## 3 Add Detailed Information to the S&P 500 DataFrame

In [74]:
# Define new columns for additional company information
new_columns = [
    "Industry", 
    "Revenue (Billions USD)", 
    "Net Income (Billions USD)", 
    "Number of Employees", 
    "Market Cap (Billions USD)", 
    "CEO", 
    "Founder(s)", 
    "Founded (Year)"
]

# Initialize the new columns in the DataFrame with None
for col in new_columns:
    df[col] = None

# Loop through each company and enrich the DataFrame
for i, row in df.iterrows():
    try:
        # Get the Wikipedia URL for the company
        url = row['Link to Company Wikipedia page']
        if not url:
            print(f"Skipping {row['Company Name']} (No URL available)")
            continue

        print(f"Fetching details for {row['Company Name']} ({i + 1}/{len(df)})...")

        # Retrieve company information using the scraper function
        company_info = get_company_info_with_key_people(url)
        
        # Update the DataFrame with the retrieved data
        for col in new_columns:
            df.at[i, col] = company_info.get(col, None)  # Use `.get` to handle missing keys gracefully

    except Exception as e:
        print(f"Failed to retrieve data for {row['Company Name']}: {e}")

# Save the enriched DataFrame to a CSV file
output_file = "sp500_enriched.csv"
df.to_csv(output_file, index=False)
print(f"Enriched S&P 500 DataFrame has been saved to {output_file}.")


Fetching details for 3M (1/503)...
Fetching details for A. O. Smith (2/503)...
Fetching details for Abbott Laboratories (3/503)...
Fetching details for AbbVie (4/503)...
Fetching details for Accenture (5/503)...
Fetching details for Adobe Inc. (6/503)...
Fetching details for Advanced Micro Devices (7/503)...
Fetching details for AES Corporation (8/503)...
Fetching details for Aflac (9/503)...
Fetching details for Agilent Technologies (10/503)...
Fetching details for Air Products (11/503)...
Fetching details for Airbnb (12/503)...
Fetching details for Akamai Technologies (13/503)...
Fetching details for Albemarle Corporation (14/503)...
Fetching details for Alexandria Real Estate Equities (15/503)...
Fetching details for Align Technology (16/503)...
Fetching details for Allegion (17/503)...
Fetching details for Alliant Energy (18/503)...
Fetching details for Allstate (19/503)...
Fetching details for Alphabet Inc. (Class A) (20/503)...
Fetching details for Alphabet Inc. (Class C) (21/503