In [None]:
"""
There is a list of most active Stocks on Yahoo Finance https://finance.yahoo.com/most-active.
You need to compose several sheets based on data about companies from this list.
To fetch data from the web page you can use the requests library. To parse HTML you can use Beautiful Soup or lxml.
Sheets which are needed:
1. 5 stocks with most youngest CEOs and print sheet to output. You can find CEO info in Profile tab of concrete stock.
    Sheet's fields: Name, Code, Country, Employees, CEO Name, CEO Year Born.
2. 10 stocks with best 52-Week Change. 52-Week Change placed on Statistics tab.
    Sheet's fields: Name, Code, 52-Week Change, Total Cash
3. 10 largest holdings of Blackrock Inc. You can find related info on the Holders tab.
    Blackrock Inc is an investment management corporation.
    Sheet's fields: Name, Code, Shares, Date Reported, % Out, Value.
    All fields except first two should be taken from Holders tab.


Example for the first sheet (you need to use same sheet format):
==================================== 5 stocks with most youngest CEOs ===================================
| Name        | Code | Country       | Employees | CEO Name                             | CEO Year Born |
---------------------------------------------------------------------------------------------------------
| Pfizer Inc. | PFE  | United States | 78500     | Dr. Albert Bourla D.V.M., DVM, Ph.D. | 1962          |
...

About sheet format:
- sheet title should be aligned to center
- all columns should be aligned to the left
- empty line after sheet

Write at least 2 tests of your choice.
Links:
    - requests docs: https://docs.python-requests.org/en/latest/
    - beautiful soup docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
    - lxml docs: https://lxml.de/
"""

   

In [13]:
import requests
from bs4 import BeautifulSoup


In [14]:
# Fetch the Yahoo Finance "holders" page for the BLK ticker.
# (Plain string: the original f-string had no placeholders.)
profile_page = 'https://finance.yahoo.com/quote/BLK/holders/'

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent -- confirm.
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
        }

# requests has no default timeout; without one the cell can block forever.
response = requests.get(profile_page, headers=headers, timeout=30)
# Fail here on a 4xx/5xx answer instead of saving and parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Preview only the beginning of the page; the full HTML is too large to
# display usefully as cell output.
response.text[:500]

In [17]:
# Persist the fetched HTML to disk so it can be parsed from the saved copy.
with open("webpage_content.txt", mode="w", encoding="utf-8") as file:
    html_text = response.text
    file.write(html_text)

# Confirmation message for the reader of the notebook.
print("HTML content saved to webpage_content.txt")

HTML content saved to webpage_content.txt


In [18]:
import pandas as pd
from bs4 import BeautifulSoup

# Build a DataFrame from the "top institutional holders" table found in the
# HTML previously saved to webpage_content.txt.

# Step 1: Read the HTML content from the saved text file
with open("webpage_content.txt", "r", encoding="utf-8") as file:
    html_content = file.read()

# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Step 3: Locate the section by its 'data-testid' attribute.
# find() returns None when nothing matches, so check explicitly and raise a
# descriptive error instead of an AttributeError on the next line.
table_section = soup.find('section', {'data-testid': 'holders-top-institutional-holders'})
if table_section is None:
    raise ValueError(
        "Section 'holders-top-institutional-holders' not found in webpage_content.txt"
    )

# Find the table within the section
table = table_section.find('table')
if table is None:
    raise ValueError(
        "No <table> found inside section 'holders-top-institutional-holders'"
    )

# Extract the headers from the table (stripped, like the cell values below).
# NOTE: this rebinds `headers`, which the fetch cell uses for the HTTP request
# headers dict; re-run that cell before issuing another request.
headers = [header.text.strip() for header in table.find_all('th')]

# Extract the rows from the table
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    columns = row.find_all('td')
    if not columns:
        # A <tr> without <td> cells carries no data; skip it rather than
        # appending an empty row.
        continue
    rows.append([col.text.strip() for col in columns])

# Step 4: Convert the data into a pandas DataFrame
df = pd.DataFrame(rows, columns=headers)

# Display the DataFrame
print(df)


                                       Holder  Shares Date Reported  % Out  \
0                          Vanguard Group Inc  13.22M  Jun 30, 2024  8.92%   
1                              Blackrock Inc.   9.52M  Jun 30, 2024  6.42%   
2                    State Street Corporation   5.94M  Jun 30, 2024  4.01%   
3          Temasek Holdings (Private) Limited   5.13M  Jun 30, 2024  3.47%   
4                 Bank of America Corporation   5.13M  Jun 30, 2024  3.47%   
5           Capital Research Global Investors   4.53M  Jun 30, 2024  3.06%   
6                              Morgan Stanley   4.34M  Jun 30, 2024  2.93%   
7  Charles Schwab Investment Management, Inc.   3.77M  Jun 30, 2024  2.54%   
8                     Capital World Investors   3.22M  Jun 30, 2024  2.17%   
9               Geode Capital Management, LLC   2.79M  Jun 30, 2024  1.88%   

            Value  
0  12,439,142,001  
1   8,956,277,644  
2   5,591,683,539  
3   4,831,672,332  
4   4,831,626,212  
5   4,266,209,597  
6

In [19]:
# Show the holders DataFrame as the cell's output.
df

Unnamed: 0,Holder,Shares,Date Reported,% Out,Value
0,Vanguard Group Inc,13.22M,"Jun 30, 2024",8.92%,12439142001
1,Blackrock Inc.,9.52M,"Jun 30, 2024",6.42%,8956277644
2,State Street Corporation,5.94M,"Jun 30, 2024",4.01%,5591683539
3,Temasek Holdings (Private) Limited,5.13M,"Jun 30, 2024",3.47%,4831672332
4,Bank of America Corporation,5.13M,"Jun 30, 2024",3.47%,4831626212
5,Capital Research Global Investors,4.53M,"Jun 30, 2024",3.06%,4266209597
6,Morgan Stanley,4.34M,"Jun 30, 2024",2.93%,4087741098
7,"Charles Schwab Investment Management, Inc.",3.77M,"Jun 30, 2024",2.54%,3546994120
8,Capital World Investors,3.22M,"Jun 30, 2024",2.17%,3029014555
9,"Geode Capital Management, LLC",2.79M,"Jun 30, 2024",1.88%,2622638511
