In [2]:
import requests
from bs4 import BeautifulSoup

def get_sp500_companies():
    # URL of the Wikipedia page listing S&P 500 companies
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing the list of S&P 500 companies
        table = soup.find('table', {'class': 'wikitable sortable'})

        # Initialize an empty list to store company names
        company_names = []

        # Iterate through rows of the table (excluding the header row)
        for row in table.find_all('tr')[1:]:
            # Get the first column (which contains the company names)
            company_name = row.find_all('td')[1].text.strip()
            company_names.append(company_name)

        return company_names
    else:
        # Print an error message if the request was not successful
        print('Failed to retrieve data from Wikipedia.')
        return None

# Example usage
if __name__ == "__main__":
    sp500_companies = get_sp500_companies()



S&P 500 Companies:
3M
A. O. Smith
Abbott
AbbVie
Accenture
Adobe Inc.
Advanced Micro Devices
AES Corporation
Aflac
Agilent Technologies
Air Products and Chemicals
Airbnb
Akamai
Albemarle Corporation
Alexandria Real Estate Equities
Align Technology
Allegion
Alliant Energy
Allstate
Alphabet Inc. (Class A)
Alphabet Inc. (Class C)
Altria
Amazon
Amcor
Ameren
American Airlines Group
American Electric Power
American Express
American International Group
American Tower
American Water Works
Ameriprise Financial
Ametek
Amgen
Amphenol
Analog Devices
Ansys
Aon
APA Corporation
Apple Inc.
Applied Materials
Aptiv
Arch Capital Group
Archer-Daniels-Midland
Arista Networks
Arthur J. Gallagher & Co.
Assurant
AT&T
Atmos Energy
Autodesk
Automated Data Processing
AutoZone
AvalonBay Communities
Avery Dennison
Axon Enterprise
Baker Hughes
Ball Corporation
Bank of America
Bank of New York Mellon
Bath & Body Works, Inc.
Baxter International
Becton Dickinson
Berkshire Hathaway
Best Buy
Bio-Rad
Bio-Techne
Biogen
Bl

In [10]:
import requests
from bs4 import BeautifulSoup
import re

def get_sp500_company_websites():
    # URL of the Wikipedia page listing S&P 500 companies
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # Send an HTTP request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing the list of S&P 500 companies
        table = soup.find('table', {'class': 'wikitable sortable'})

        # Initialize an empty dictionary to store company names and websites
        company_websites = {}

        # Iterate through rows of the table (excluding the header row)
        for row in table.find_all('tr')[1:]:
            # Get the first column (which contains the company names)
            company_cell = row.find_all('td')[1]
            # Extract the company name and its Wikipedia link
            company_name = company_cell.text.strip()
            company_link = company_cell.find('a')['href'] if company_cell.find('a') else None

            # If the Wikipedia link exists, fetch the content and find the website link
            if company_link:
                company_website = extract_company_website(company_link)
                company_websites[company_name] = company_website

        return company_websites
    else:
        # Print an error message if the request was not successful
        print('Failed to retrieve data from Wikipedia.')
        return None

def extract_company_website(wiki_link):
    # Construct the full URL of the Wikipedia page for the company
    full_url = 'https://en.wikipedia.org' + wiki_link

    # Send an HTTP request to the company's Wikipedia page
    response = requests.get(full_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the infobox table containing the company details
        infobox_table = soup.find('table', {'class': 'infobox vcard'})

        # Search for the website link within the infobox
        if infobox_table:
            for row in infobox_table.find_all('tr'):
                th = row.find('th')
                if th and th.text.strip().lower() == 'website':
                    td = row.find('td')
                    if td:
                        website_link = td.find('a')
                        if website_link:
                            return website_link['href']
        else:
            # Search for website link in the page content if not found in the infobox
            page_content = soup.find('div', {'class': 'mw-parser-output'})
            if page_content:
                website_regex = re.compile(r'https?://[^\s]+')
                website_links = website_regex.findall(str(page_content))
                if website_links:
                    return website_links[0]  # Return the first website link found

    return None


# Example usage
if __name__ == "__main__":
    sp500_company_websites = get_sp500_company_websites()
    if sp500_company_websites:
        for company, website in sp500_company_websites.items():
            print(f"Company: {company}, Website: {website}")


Company: 3M, Website: http://3m.com
Company: A. O. Smith, Website: http://aosmith.com
Company: Abbott, Website: http://abbott.com
Company: AbbVie, Website: http://abbvie.com
Company: Accenture, Website: https://www.accenture.com
Company: Adobe Inc., Website: https://www.adobe.com/
Company: Advanced Micro Devices, Website: https://amd.com/
Company: AES Corporation, Website: http://aes.com
Company: Aflac, Website: http://aflac.com
Company: Agilent Technologies, Website: http://www.agilent.com
Company: Air Products and Chemicals, Website: http://airproducts.com
Company: Airbnb, Website: https://airbnb.com
Company: Akamai, Website: http://akamai.com
Company: Albemarle Corporation, Website: http://albemarle.com
Company: Alexandria Real Estate Equities, Website: https://www.are.com/
Company: Align Technology, Website: https://aligntech.com
Company: Allegion, Website: http://allegion.com
Company: Alliant Energy, Website: http://www.alliantenergy.com
Company: Allstate, Website: http://allstate

In [12]:
print(sp500_company_websites)

{'3M': 'http://3m.com', 'A. O. Smith': 'http://aosmith.com', 'Abbott': 'http://abbott.com', 'AbbVie': 'http://abbvie.com', 'Accenture': 'https://www.accenture.com', 'Adobe Inc.': 'https://www.adobe.com/', 'Advanced Micro Devices': 'https://amd.com/', 'AES Corporation': 'http://aes.com', 'Aflac': 'http://aflac.com', 'Agilent Technologies': 'http://www.agilent.com', 'Air Products and Chemicals': 'http://airproducts.com', 'Airbnb': 'https://airbnb.com', 'Akamai': 'http://akamai.com', 'Albemarle Corporation': 'http://albemarle.com', 'Alexandria Real Estate Equities': 'https://www.are.com/', 'Align Technology': 'https://aligntech.com', 'Allegion': 'http://allegion.com', 'Alliant Energy': 'http://www.alliantenergy.com', 'Allstate': 'http://allstate.com', 'Alphabet Inc. (Class A)': 'https://abc.xyz/', 'Alphabet Inc. (Class C)': 'https://abc.xyz/', 'Altria': 'https://www.altria.com/', 'Amazon': 'https://www.amazon.com/', 'Amcor': 'http://amcor.com', 'Ameren': 'http://ameren.com', 'American Air

In [14]:
website_links = list(sp500_company_websites.values())
print(website_links[5])

https://www.adobe.com/


In [3]:
print(sp500_companies[0])

3M


In [19]:
pip install openpyxl




In [26]:
from openpyxl import Workbook
from googlesearch import search

# Create a new Excel workbook
wb = Workbook()
ws = wb.active

# Perform Google search
query = '(site:lever.co OR site:greenhouse.io) ("Data Science Intern" OR "Junior Data Scientist") ("internship" OR "1 year of experience")'
results = search(query, num_results=100)

# Write the results to Excel
row = 1
for url in results:
    ws.cell(row, 1, url)
    row += 1
    time.sleep(2)

# Save the workbook
wb.save("search_results.xlsx")

print("Search results have been added to the Excel file.")


HTTPError: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3D%2528site%253Alever.co%252BOR%252Bsite%253Agreenhouse.io%2529%252B%2528%2522Data%252BScience%252BIntern%2522%252BOR%252B%2522Junior%252BData%252BScientist%2522%2529%252B%2528%2522internship%2522%252BOR%252B%25221%252Byear%252Bof%252Bexperience%2522%2529%26num%3D102%26hl%3Den%26start%3D0&hl=en&q=EgQjxRrlGMXmhq4GIjCPZuFlnYa-E2jI8Sdg5hcDuQya68w_UIGNiBNJ3-IquPyNsm3PzrZfhWrqHZYmwI0yAXJaAUM