# Projeto 2 - Como pegar dados de um site com Python? - Pegando dados de ETFs do mundo inteiro.


### Desafio:

* Construir um código que acesse o site etf.com e busque dados de todos os ETFs do mercado americano e, consequentemente, do mundo: rentabilidades, patrimônio, gestora, taxa...
* Lembrar de sempre trazer outros tipos de cenários onde a pessoa precisa pegar dados de sites etc.


### Passo a passo:

   **Passo 1** - Definir um navegador que você irá utilizar para navegar com o Python.

   **Passo 2** - Importar os módulos e bibliotecas.
   
   **Passo 3** - Entender como funcionam requisições na internet.
   
   **Passo 4** - Conhecer e mapear o processo de coleta de dados no site do ETF.com. 
   
   **Passo 5** - Achar todos os elementos necessários dentro do HTML do site.
   
   **Passo 6** - Ler a tabela de dados.
   
   **Passo 7** - Construir a tabela final.

In [None]:
!pip install webdriver-manager

In [None]:
!pip install selenium

In [None]:
!pip install html5lib

# Passo 1: Escolher o navegador.

No nosso caso, utilizaremos o Google Chrome. 

# Passo 2: Importar as bibliotecas.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Passo 3: Entender como funcionam requisições na internet.

In [None]:
# Open a Chrome session. The value returned by ChromeDriverManager().install()
# is handed to Service, which is in turn handed to webdriver.Chrome.
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()))

# Address of the ETF finder page scraped by the following cells.
url = "https://www.etf.com/etfanalytics/etf-finder"

# Navigate the browser session to that page.
driver.get(url)

# Passo 4: Conhecer e mapear o processo de coleta de dados no site do ETF.com.

www.etf.com

# Processo de dados

* Abrir o site - Completo
* Mudar a visualização para 100 itens por página
* Ler a tabela
* Avançar todas as páginas
* Trocar pra outra categoria
* Ler todas as tabelas dessa outra categoria

# Passo 5.1: Achar todos os elementos necessários dentro do HTML do site - Expandindo a tabela para 100 itens.

In [None]:
# Fixed 5-second pause before touching the page (presumably to let it finish
# loading - TODO confirm; nothing here checks that the element actually exists yet).
time.sleep(5)

# Locate the element by a positional XPath. Per the section heading, this is the
# control that expands the table to 100 items.
# NOTE(review): the path is positional (div[5], section[2], ...), so it depends
# on the page layout staying exactly as it was when the path was recorded.
botao_100 = driver.find_element("xpath", '''html/body/div[5]/section/div/div[3]/section/div
                                                /div/div/div/div[2]/
                                section[2]/div[2]/section[2]/div[1]/div/div[4]/button/label/span''')

# Trigger the click through JavaScript, passing the element as arguments[0].
driver.execute_script("arguments[0].click();", botao_100)

# Passo 5.2: Achar todos os elementos necessários dentro do HTML do site - Pegando o número de páginas da tabela.

In [None]:
# Find the pager label, remove "of " from its text and convert what is left
# into the integer page count used by the scraping loops.
rotulo_total_paginas = driver.find_element("xpath", '''/html/body/div[5]/section/div/div[3]/
section/div/div/div/div/div[2]/section[2]/div[2]/section[2]/div[2]/div/label[2]''')

numero_paginas = int(rotulo_total_paginas.text.replace("of ", ""))

print(numero_paginas)

# Passo 6.1: Lendo a tabela de dados - Lendo a tabela de dados básicos.

In [None]:
# Local import so this cell is self-contained: read_html is fed a file-like
# object below instead of a raw HTML string.
from io import StringIO

# One DataFrame per results page; they are concatenated after the loop.
lista_de_tabela_por_pagina = []

for pagina in range(numero_paginas):

    # Look the table up again on every iteration, since the page content
    # changes after each click on the pager.
    tabela = driver.find_element("xpath", '''/html/body/div[5]/section/div/div[3]/section/div/
                                    div/div/div/div[2]/section[2]/div[2]/div/table''')

    html_tabela = tabela.get_attribute("outerHTML")

    # Passing literal HTML text to read_html is deprecated in recent pandas
    # releases; wrapping it in StringIO is the supported way to parse a string.
    tabela_final = pd.read_html(StringIO(html_tabela))[0]

    lista_de_tabela_por_pagina.append(tabela_final)

    # Advance only while another page remains: after the last page has been
    # read there is nothing left to move to.
    if pagina < numero_paginas - 1:

        botao_avancar_pagina = driver.find_element("xpath", '//*[@id="nextPage"]')

        # Click through JavaScript, passing the element as arguments[0].
        driver.execute_script("arguments[0].click();", botao_avancar_pagina)


# Stack the per-page tables into a single DataFrame.
base_de_dados_completa = pd.concat(lista_de_tabela_por_pagina)

display(base_de_dados_completa)

# Passo 6.2: Ler a tabela de dados - Lendo a tabela de dados de rentabilidade.

In [None]:
# Step 1: switch tabs. Per the section heading, the table read after this
# click is the performance one.
xpath_aba = ''' /html/body/div[5]/section/div/div[3]/section/div/div/div/div/
div[2]/section[2]/div[2]/ul/li[2]/span'''

botao_aba = driver.find_element("xpath", xpath_aba)
driver.execute_script("arguments[0].click();", botao_aba)

# Step 2: walk the pager back towards the first page, one "previous" click
# per page counted earlier.
for pagina in range(numero_paginas):
    botao_voltar_pagina = driver.find_element("xpath", '//*[@id="previousPage"]')
    driver.execute_script("arguments[0].click();", botao_voltar_pagina)

In [None]:
# Local import so this cell is self-contained: read_html is fed a file-like
# object below instead of a raw HTML string.
from io import StringIO

# One DataFrame per results page; they are concatenated after the loop.
lista_de_tabela_por_pagina = []

for pagina in range(numero_paginas):

    # Look the table up again on every iteration, since the page content
    # changes after each click on the pager.
    tabela = driver.find_element("xpath", '''/html/body/div[5]/section/div/div[3]/section/div/
                                    div/div/div/div[2]/section[2]/div[2]/div/table''')

    html_tabela = tabela.get_attribute("outerHTML")

    # Passing literal HTML text to read_html is deprecated in recent pandas
    # releases; wrapping it in StringIO is the supported way to parse a string.
    tabela_final = pd.read_html(StringIO(html_tabela))[0]

    lista_de_tabela_por_pagina.append(tabela_final)

    # Advance only while another page remains: after the last page has been
    # read there is nothing left to move to.
    if pagina < numero_paginas - 1:

        botao_avancar_pagina = driver.find_element("xpath", '//*[@id="nextPage"]')

        # Click through JavaScript, passing the element as arguments[0].
        driver.execute_script("arguments[0].click();", botao_avancar_pagina)


# Stack the per-page tables into a single DataFrame.
base_de_dados_performance = pd.concat(lista_de_tabela_por_pagina)



In [64]:
# End the browser session; cells that use `driver` cannot run after this
# until a new session is created.
driver.quit()

In [None]:
# Bare expression as the last line of the cell, so the notebook shows the DataFrame.
base_de_dados_performance

# Passo 7: Construindo a tabela final.

In [None]:
# Re-index the registration table by its "Ticker" column and rebind the name.
# NOTE(review): running this cell a second time presumably fails, because
# "Ticker" would no longer be a regular column - confirm before re-executing.
base_de_dados_completa = base_de_dados_completa.set_index("Ticker")

# Bare expression so the notebook shows the re-indexed table.
base_de_dados_completa

In [59]:
# Index the performance table by ticker and keep only the three
# return-horizon columns, in this order.
colunas_rentabilidade = ['1 Year', '5 Years', '10 Years']
base_de_dados_performance = base_de_dados_performance.set_index("Ticker")[colunas_rentabilidade]

base_de_dados_performance

Unnamed: 0_level_0,1 Year,5 Years,10 Years
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LYFE,-1.80%,--,--
EGIS,2.38%,--,--
PRNT,-28.09%,-3.51%,--
SMIG,-3.34%,--,--
PFLD,-8.24%,--,--
...,...,...,...
HYLB,-5.99%,2.33%,--
OARK,--,--,--
TSLY,--,--,--
ZECP,-6.60%,--,--


In [60]:
# Attach the performance columns to the registration table, matching rows on
# the index of both frames (the ticker).
# NOTE(review): no `how` argument is given here, whereas the consolidated
# script at the end of the notebook joins with how="inner" - confirm which
# behaviour is intended for tickers missing from the performance table.
base_de_dados_final = base_de_dados_completa.join(base_de_dados_performance)

In [63]:
# Show the two tables that feed the join, one after the other.
display(base_de_dados_completa)
display(base_de_dados_performance)

Unnamed: 0_level_0,Name,Segment,Issuer,Expense Ratio,AUM
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LYFE,2ndVote Life Neutral Plus ETF,Equity: U.S. - Total Market,"2nd Vote Value Investments, Inc",0.75%,$17.54M
EGIS,2ndVote Society Defended ETF,Equity: U.S. - Total Market,"2nd Vote Value Investments, Inc",0.75%,$26.10M
PRNT,3D Printing ETF,Equity: Global Robotics & AI,ARK,0.66%,$187.71M
SMIG,AAM Bahl & Gaynor Small/Mid Cap Income Growth ETF,Equity: U.S. - Extended Market,Advisors Asset Management,0.60%,$173.39M
PFLD,AAM Low Duration Preferred and Income Securiti...,"Fixed Income: U.S. - Corporate, Preferred Shor...",Advisors Asset Management,0.45%,$197.08M
...,...,...,...,...,...
HYLB,Xtrackers USD High Yield Corporate Bond ETF,"Fixed Income: U.S. - Corporate, Broad-based Hi...",DWS,0.15%,$4.14B
OARK,YieldMax Innovation Option Income Strategy ETF,Equity: Global Broad Thematic,Tidal,0.99%,$483.93K
TSLY,YieldMax TSLA Option Income Strategy ETF,Equity: U.S. Automobile Manufacturers,Tidal,0.99%,$386.59K
ZECP,Zacks Earnings Consistent Portfolio ETF,Equity: U.S. - Total Market,Zacks,0.55%,$21.24M


Unnamed: 0_level_0,1 Year,5 Years,10 Years
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LYFE,-1.80%,--,--
EGIS,2.38%,--,--
PRNT,-28.09%,-3.51%,--
SMIG,-3.34%,--,--
PFLD,-8.24%,--,--
...,...,...,...
HYLB,-5.99%,2.33%,--
OARK,--,--,--
TSLY,--,--,--
ZECP,-6.60%,--,--


In [61]:
# Bare expression so the notebook shows the joined table.
base_de_dados_final

Unnamed: 0_level_0,Name,Segment,Issuer,Expense Ratio,AUM,1 Year,5 Years,10 Years
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LYFE,2ndVote Life Neutral Plus ETF,Equity: U.S. - Total Market,"2nd Vote Value Investments, Inc",0.75%,$17.54M,-1.80%,--,--
EGIS,2ndVote Society Defended ETF,Equity: U.S. - Total Market,"2nd Vote Value Investments, Inc",0.75%,$26.10M,2.38%,--,--
PRNT,3D Printing ETF,Equity: Global Robotics & AI,ARK,0.66%,$187.71M,-28.09%,-3.51%,--
SMIG,AAM Bahl & Gaynor Small/Mid Cap Income Growth ETF,Equity: U.S. - Extended Market,Advisors Asset Management,0.60%,$173.39M,-3.34%,--,--
PFLD,AAM Low Duration Preferred and Income Securiti...,"Fixed Income: U.S. - Corporate, Preferred Shor...",Advisors Asset Management,0.45%,$197.08M,-8.24%,--,--
...,...,...,...,...,...,...,...,...
HYLB,Xtrackers USD High Yield Corporate Bond ETF,"Fixed Income: U.S. - Corporate, Broad-based Hi...",DWS,0.15%,$4.14B,-5.99%,2.33%,--
OARK,YieldMax Innovation Option Income Strategy ETF,Equity: Global Broad Thematic,Tidal,0.99%,$483.93K,--,--,--
TSLY,YieldMax TSLA Option Income Strategy ETF,Equity: U.S. Automobile Manufacturers,Tidal,0.99%,$386.59K,--,--,--
ZECP,Zacks Earnings Consistent Portfolio ETF,Equity: U.S. - Total Market,Zacks,0.55%,$21.24M,-6.60%,--,--


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Local import: read_html is fed a file-like object instead of a raw HTML string.
from io import StringIO

# Positional XPath of the control clicked right after the page loads.
# NOTE(review): presumably the "100 results per page" option (the variable it
# feeds is named cem_resultados) - the path depends on the current page layout.
XPATH_CEM_RESULTADOS = '''/html/body/div[5]/section/div/div[3]/section/div/
                    div/div/div/div[2]/section[2]/div[2]/section[2]/div[1]/div/div[4]/button/label/span'''

# XPath of the results table, located by element id.
XPATH_TABELA = '''
    //*[@id="finderTable"]
    '''

# Positional XPath of the tab clicked before the second pass over the table.
XPATH_ABA_PERFORMANCE = '/html/body/div[5]/section/div/div[3]/section/div/div/div/div/div[2]/section[2]/div[2]/ul/li[2]/span'


def _clicar_via_js(navegador, xpath):
    """Find the element at *xpath* and click it through JavaScript."""
    alvo = navegador.find_element("xpath", xpath)
    navegador.execute_script("arguments[0].click();", alvo)


def _ler_tabelas_paginadas(navegador, total_paginas):
    """Read the results table on each page and return them as one DataFrame.

    Starts from whatever page the browser is currently showing and clicks
    "next" between reads, so the browser ends on the last page visited.

    Args:
        navegador: Selenium driver already showing the table.
        total_paginas: number of pages to read.

    Returns:
        The per-page tables concatenated with ``pd.concat``.

    Raises:
        ValueError: from ``pd.concat`` when ``total_paginas`` is 0 and there
            is nothing to concatenate.
    """
    tabelas = []

    for pagina in range(total_paginas):
        elemento = navegador.find_element("xpath", XPATH_TABELA)
        html_tabela = elemento.get_attribute('outerHTML')

        # Passing literal HTML text to read_html is deprecated in recent
        # pandas releases; StringIO is the supported way to parse a string.
        tabelas.append(pd.read_html(StringIO(html_tabela))[0])

        # Advance only while another page remains. The previous check,
        # `pagina != numero_paginas`, was always true inside
        # range(0, numero_paginas), so "next" was clicked after the last page.
        if pagina < total_paginas - 1:
            _clicar_via_js(navegador, '//*[@id="nextPage"]')

    return pd.concat(tabelas)


driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

url = 'https://www.etf.com/etfanalytics/etf-finder'

# Everything that talks to the browser runs inside try/finally so the Chrome
# session is closed even when a lookup or a parse step raises.
try:
    driver.get(url)

    # Fixed pause before the first lookup (presumably to let the page load).
    time.sleep(5)

    cem_resultados = driver.find_element("xpath", XPATH_CEM_RESULTADOS)

    driver.execute_script("arguments[0].click();", cem_resultados)

    # Fixed pause before reading the pager (presumably to let the table refresh).
    time.sleep(2)

    # The pager label text has "of " removed and the rest is parsed as the page count.
    numero_paginas = driver.find_element("xpath", '''//*[@id="totalPages"]''')

    numero_paginas = numero_paginas.text.replace("of ", "")

    numero_paginas = int(numero_paginas)

    # First pass: table as shown right after loading (registration data).
    base_dados_etf_cadastro = _ler_tabelas_paginadas(driver, numero_paginas)

    # Go back to the first page. The first pass clicked "next"
    # numero_paginas - 1 times, so the same number of "previous" clicks
    # undoes it.
    for _ in range(numero_paginas - 1):
        _clicar_via_js(driver, '//*[@id="previousPage"]')

    # The only thing that differs for the second pass is this tab button.
    botao_dados_perf = driver.find_element("xpath", XPATH_ABA_PERFORMANCE)

    driver.execute_script("arguments[0].click();", botao_dados_perf)

    # Second pass: same table element, now showing the performance columns.
    base_dados_etf_performance = _ler_tabelas_paginadas(driver, numero_paginas)

finally:
    # Close the browser.
    driver.quit()

# Index both tables by ticker and keep only the three return-horizon columns
# of the performance table.
base_dados_etf_cadastro = base_dados_etf_cadastro.set_index('Ticker')
base_dados_etf_performance = base_dados_etf_performance.set_index("Ticker")
base_dados_etf_performance = base_dados_etf_performance[['1 Year', '5 Years', '10 Years']]

# Inner join: keep only tickers present in both tables.
base_dados_final = base_dados_etf_cadastro.join(base_dados_etf_performance, how = "inner")

# Bare expression so a notebook shows the final table.
base_dados_final