In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dags.module.object_client import MinioClient
from minio import Minio
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import json
import time

In [2]:
import os
# Derive the ticker list from the file names in ./yahoo_data: a symbol is
# everything that precedes the first "." in a file name.
files = os.listdir("./yahoo_data")
list_symbols = [file_name.partition(".")[0] for file_name in files]
len(list_symbols)

10678

In [7]:
def extract_HTML_content(**kwargs):
    """Download a symbol's Yahoo Finance profile page and store it in MinIO.

    The ``<article>`` element of
    ``https://finance.yahoo.com/quote/<symbol>/profile`` is written
    (prettified) to the local file ``yahoo_data_info/<symbol>.html`` and that
    file is uploaded to ``bucket_name``. When the page has no ``<article>``
    element, the text ``None`` is written instead and the file is uploaded to
    ``bucket_name_2``.

    Keyword Args:
        symbol: Ticker symbol to fetch (required).
        bucket_name: Bucket that receives pages with content.
        bucket_name_2: Bucket that receives pages without content.

    Raises:
        ValueError: If ``symbol`` is missing or empty.
        requests.exceptions.RequestException: If the request fails after the
            configured retries, including when it times out.
    """
    # Arguments for the function
    symbol = kwargs.get("symbol")
    bucket_name = kwargs.get("bucket_name")
    bucket_name_2 = kwargs.get("bucket_name_2")
    if not symbol:
        # Without this guard the literal page ".../quote/None/profile" is fetched.
        raise ValueError("extract_HTML_content() requires a non-empty 'symbol'")

    # Local file (also used as the object name); make sure its folder exists
    # so that open() below cannot fail on a fresh checkout.
    filename = f"yahoo_data_info/{symbol}.html"
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # URL
    url = f"https://finance.yahoo.com/quote/{symbol}/profile"

    # Retry strategy: up to 10 attempts with a short exponential backoff
    retries = Retry(total=10, backoff_factor=0.1)
    adapter = HTTPAdapter(max_retries=retries)

    # The context manager closes the session (and its connection pool) even
    # when the request raises.
    with requests.Session() as session:
        session.mount('http://', adapter=adapter)
        session.mount('https://', adapter=adapter)
        # requests never times out by default; bound the wait so one stalled
        # connection cannot block the whole crawl.
        r = session.get(
            url,
            headers={"user-agent": "Mozilla/5.0", "from": "youremail@gmail.com"},
            timeout=30,
        )

    # Parse HTML content using BS4 and keep only the <article> element
    soup = BeautifulSoup(r.text, "html.parser")
    data = soup.find("article")

    # Minio client
    # NOTE(review): endpoint and credentials are hard-coded here; consider
    # loading them from configuration.
    minio_client = MinioClient("localhost", "9000", access_key="minio", secret_key="minio123")

    # Choose file content, destination bucket and log line in one place
    if data is None:
        content = str(data)
        target_bucket = bucket_name_2
        message = f"File {filename} is empty. Put into the {bucket_name_2} bucket."
    else:
        content = str(data.prettify())
        target_bucket = bucket_name
        message = f"File {filename} written successfully. Put into the {bucket_name} bucket."

    # Write the HTML file, then upload it
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print(message)
    minio_client.put_object(bucket_name=target_bucket, obj_name=filename, obj_file=filename)

In [8]:
# Crawl the profile page of every symbol. Pages with content are stored in the
# "bronze" bucket, pages without content in "trashbin". Pause 10 seconds after
# each symbol.
bucket_args = {"bucket_name": "bronze", "bucket_name_2": "trashbin"}
for symbol in list_symbols:
    extract_HTML_content(symbol=symbol, **bucket_args)
    time.sleep(10)

File yahoo_data_info/A.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AA.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AAA.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AAAU.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AACG.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AACI.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AACIU.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AACIW.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AACT.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AADI.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AADR.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AAGR.html written successfully. Put into the bronze bucket.
File yahoo_data_info/AAGRW.html 

In [110]:
def transform_data_from_HTML(symbol, bucket_name_1, bucket_name_2):
    """Parse a stored Yahoo Finance profile page into JSON and upload it.

    Reads ``yahoo_data_info/<symbol>.html`` from ``bucket_name_1``, extracts
    the profile fields, writes them to the local file
    ``yahoo_data_info/<symbol>.json`` and uploads that file to
    ``bucket_name_2`` under the same name.

    Pages whose profile heading reads "Key Executives" are parsed as company
    profiles (``Sector``, ``Industry``, ``Description``). Every other page is
    parsed as a fund profile (``Category``, ``Fund Family``, ``Net Assets``,
    ``Legal Type``, ``Summary``).

    Args:
        symbol: Ticker symbol whose stored page is transformed.
        bucket_name_1: Bucket holding the HTML page.
        bucket_name_2: Bucket that receives the JSON file.

    Raises:
        AttributeError, IndexError: If the page does not have the element
            structure this parser expects.
    """
    # Setup MinIO client
    # NOTE(review): endpoint and credentials are hard-coded here; consider
    # loading them from configuration.
    minio_client = Minio("localhost:9000", "minio", "minio123", secure=False)

    # Extract HTML content from landing zone. The response holds a pooled
    # HTTP connection, so it is closed and released once the body is read.
    response = minio_client.get_object(
        bucket_name=bucket_name_1,
        object_name=f"yahoo_data_info/{symbol}.html",
    )
    try:
        html_content = response.data.decode('utf-8')
    finally:
        response.close()
        response.release_conn()

    # Setup BS4
    soup = BeautifulSoup(html_content, "html.parser")

    # Filename of object
    filename = f"yahoo_data_info/{symbol}.json"

    # Elements shared by the lookups below
    article = soup.find("article")
    profile_section = article.find("section", "yf-16025kh reverseColumn twothirds")

    # Profile type variable
    type_text = profile_section.\
        find("section", "yf-1hj9jti").\
        find("header", "medium mb-4 yf-1trny4b font-condensed").\
        find("h3")
    # Remove ALL whitespace, not only spaces: prettified HTML wraps the
    # heading in newlines ("\n     Key Executives\n    "), so stripping spaces
    # alone would never produce "KeyExecutives".
    profile_type = "".join(type_text.text.split())

    # Corpo stock
    if profile_type == "KeyExecutives":
        description_node = profile_section.\
            find("section", "yf-1hj9jti").\
            find("div")

        # Check if the profile is empty. This has to be tested on the element
        # itself: ``.text`` always returns a string, never None.
        if description_node is None:
            result = {
                "Sector": "NaN",
                "Industry": "NaN",
                "Description": "NaN"
            }

        # Profile not empty, proceed to extract data from elements
        else:
            description = description_node.text.replace('\n', ' ').\
                replace('                 ', '').replace('             ', '')
            content = profile_section.\
                find_all("section")[1].find("dl").\
                find_all("div")
            sector = content[0].find("dd").find("a").text.replace(' ', '').replace('\n', '')
            industry = content[1].find("a").text.replace(' ', '').replace('\n', '')

            # Data into a dictionary
            result = {
                "Sector": sector,
                "Industry": industry,
                "Description": description
            }

    # ETF stock
    else:
        fund_sections = article.\
            find("section", class_="yf-16025kh").\
            find_all("section")

        summary = fund_sections[0].find("p").text

        overview = fund_sections[1].find("div").\
            find("table").find("tbody").find_all("tr")

        # The value is the second cell of each overview row
        category = overview[0].find_all("td")[1].text
        fund_family = overview[1].find_all("td")[1].text
        net_assets = overview[2].find_all("td")[1].text
        legal_type = overview[5].find_all("td")[1].text

        result = {
            "Category": category,
            "Fund Family": fund_family,
            "Net Assets": net_assets,
            "Legal Type": legal_type,
            "Summary": summary
        }

    # Write data into a JSON file into MinIO staging zone. ensure_ascii=False
    # emits raw non-ASCII characters, so the file encoding is set explicitly
    # rather than left to the platform default.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    minio_client.fput_object(bucket_name=bucket_name_2, object_name=filename, file_path=filename)

In [111]:
# Try the transform on a single symbol: read from "bronze", write to "silver".
transform_data_from_HTML(symbol="AAPL", bucket_name_1="bronze", bucket_name_2="silver")

In [109]:
# Scratch run of the company-profile extraction for one symbol (AA).
# NOTE(review): endpoint and credentials are hard-coded here; consider loading
# them from configuration.
minio_client = Minio("localhost:9000", "minio", "minio123", secure=False)

# Extract HTML content from landing zone. The response holds a pooled HTTP
# connection, so it is closed and released once the body is read.
response = minio_client.get_object(
    bucket_name="bronze", object_name="yahoo_data_info/AA.html")
try:
    html_content = response.data.decode('utf-8')
finally:
    response.close()
    response.release_conn()

soup = BeautifulSoup(html_content, "html.parser")
profile_section = soup.find("article").\
    find("section", "yf-16025kh reverseColumn twothirds")

description = profile_section.\
    find("section", "yf-1hj9jti").\
    find("div").text.replace('\n', ' ').replace(
        '                 ', '').replace('             ', '')
content = profile_section.\
    find_all("section")[1].find("dl").\
    find_all("div")
sector = content[0].find("dd").find(
    "a").text.replace(' ', '').replace('\n', '')
industry = content[1].find("a").text.replace(' ', '').replace('\n', '')
result = {
    "Sector": sector,
    "Industry": industry,
    "Description": description
}

# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# set explicitly rather than left to the platform default.
filename = "AA.json"
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4, ensure_ascii=False)
minio_client.fput_object(bucket_name="silver", object_name=filename, file_path=filename)

<minio.helpers.ObjectWriteResult at 0x1fe68c01780>

In [4]:
import os

import pandas as pd

# The Alpha Vantage key is read from the ALPHAVANTAGE_API_KEY environment
# variable so that it is not committed with the notebook.
# NOTE(review): the key that used to be hard-coded here should be rotated.
api_key = os.environ["ALPHAVANTAGE_API_KEY"]
url = f'https://www.alphavantage.co/query?function=LISTING_STATUS&apikey={api_key}'
df = pd.read_csv(url)
list(df['symbol'])

['A',
 'AA',
 'AAA',
 'AAAU',
 'AACG',
 'AACI',
 'AACIU',
 'AACIW',
 'AACT',
 'AACT-U',
 'AACT-WS',
 'AADI',
 'AADR',
 'AAGR',
 'AAGRW',
 'AAL',
 'AAMC',
 'AAME',
 'AAN',
 'AAOI',
 'AAON',
 'AAP',
 'AAPB',
 'AAPD',
 'AAPL',
 'AAPR',
 'AAPU',
 'AAPX',
 'AAPY',
 'AAT',
 'AAXJ',
 'AB',
 'ABAT',
 'ABBV',
 'ABCB',
 'ABCL',
 'ABCS',
 'ABEO',
 'ABEQ',
 'ABEV',
 'ABG',
 'ABIO',
 'ABL',
 'ABLLL',
 'ABLLW',
 'ABLV',
 'ABLVW',
 'ABM',
 'ABNB',
 'ABNY',
 'ABOS',
 'ABR',
 'ABR-P-D',
 'ABR-P-E',
 'ABR-P-F',
 'ABSI',
 'ABST',
 'ABT',
 'ABTS',
 'ABUS',
 'ABVC',
 'ABVE',
 'ABVEW',
 'ABVX',
 'AC',
 'ACA',
 'ACAB',
 'ACABU',
 'ACABW',
 'ACAC',
 'ACACU',
 'ACACW',
 'ACAD',
 'ACB',
 'ACCD',
 'ACCO',
 'ACDC',
 'ACEL',
 'ACES',
 'ACET',
 'ACGL',
 'ACGLN',
 'ACGLO',
 'ACHC',
 'ACHL',
 'ACHR',
 'ACHR-WS',
 'ACHV',
 'ACI',
 'ACIC',
 'ACIO',
 'ACIU',
 'ACIW',
 'ACLS',
 'ACLX',
 'ACM',
 'ACMR',
 'ACN',
 'ACNB',
 'ACNT',
 'ACON',
 'ACONW',
 'ACP',
 'ACP-P-A',
 'ACR',
 'ACR-P-C',
 'ACR-P-D',
 'ACRE',
 'ACRS',
 'ACR

In [12]:
import os
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from minio import Minio
from dotenv import dotenv_values
import json
from dags.module.object_client import MinioClient

In [None]:
# Load the MinIO credentials from dags/.env first, then build the client from
# them. Previously the client was created with literal credentials and the
# values read from the .env file were never used.
config = dotenv_values("dags/.env")
access_key = config['MINIO_ROOT_USER']
secret_key = config['MINIO_ROOT_PASSWORD']
minio_client = MinioClient("localhost", "9000", access_key=access_key, secret_key=secret_key)

# A symbol is the part of each object name before the first "."
list_objects = minio_client.list_objects(bucket_name="symbol")
list_symbols = [x.split(".")[0] for x in list_objects]
list_symbols

In [16]:
# Sanity check: number of entries currently held in list_symbols.
len(list_symbols)

10678

In [15]:
from dags.module.object_client import MinioClient
from dotenv import dotenv_values
from bs4 import BeautifulSoup

# Build the MinIO client from the credentials stored in dags/.env.
env_var = dotenv_values("dags/.env")
access_key = env_var['MINIO_ROOT_USER']
secret_key = env_var['MINIO_ROOT_PASSWORD']
minio_client = MinioClient(
    "localhost",
    "9000",
    access_key=access_key,
    secret_key=secret_key,
)

# Fetch the stored profile page of symbol A from "bronze" and parse it.
profile_object = minio_client.get_object(
    bucket_name="bronze",
    obj_name=f"yahoo_data_info/A.html",
    prefix="yahoo_data_info/",
)
html_content = profile_object.data.decode('utf-8')
soup = BeautifulSoup(html_content, "html.parser")
# html_content_prettify = soup.prettify()
# print(html_content_prettify)
# list_objects
# # html_content
# content = soup.find("article")

In [13]:
# Walk from <article> down to the profile heading one element at a time.
article = soup.find("article")
profile_section = article.find("section", class_="yf-16025kh reverseColumn twothirds")
heading_section = profile_section.find("section", class_="yf-1hj9jti")
type_text = heading_section.find("header").find("h3")
# Show the raw heading text, surrounding whitespace included.
type_text.text

'\n     Key Executives\n    '