In [60]:
import os
import pandas as pd
import time
import re
import csv
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

In [64]:
# Mega-Sena page on the CAIXA lottery portal; this is the page the driver opens first.
URL = "https://loterias.caixa.gov.br/Paginas/Mega-Sena.aspx"

# Every XPath below hangs off the same <table>, so the shared prefix is spelled once.
_TABLE_XPATH = '//*[@id="ctl50_g_cf05b8d5_fd75_46b5_bdfa_a623e654362c"]/div/div/table'
XPATH_ROWS = f'{_TABLE_XPATH}/tbody'
X_PATH_THEAD = f'{_TABLE_XPATH}/thead'
X_PATH_THEAD_TR = f'{X_PATH_THEAD}/tr'

# CSV file the scraped rows are written to (relative to the working directory).
FILE_NAME = 'original-data.csv'

In [65]:
# Pre-compiled patterns used when cleaning cell text.
# NOTE: several names describe the intended use rather than what the pattern matches.

# A literal '.' anywhere in the text.
remove_dot = re.compile(r'\.')
# A leading 'R$'.
remove_dollar = re.compile(r'^[R]\$')
# A whole string made of exactly two word characters (not "two words").
remove_two_words = re.compile(r'^(\w{2})$')
# A whole string of the form 'R$<digits>,<two digits>' (no '.' separators).
remove_currency = re.compile(r'^[R]\$\d+\,\d{2}$')
# A whole string of 'N' or 'n' followed by exactly two word characters.
remove_no = re.compile(r'^(N|n)\w{2}$')
# A literal ','. Despite the name, this matches the comma; callers substitute '.' for it.
# Raw string: '\,' in a normal string literal is an invalid escape sequence and
# triggers a warning on modern Python. The pattern text itself is unchanged.
replace_dot_by_comma = re.compile(r'\,')

In [66]:
def is_file(filename: str) -> bool:
    """Return True when ``filename`` points at an existing regular file.

    Args:
        filename: Path to check, absolute or relative to the working directory.

    Returns:
        True if the path exists and is a file; False for missing paths and
        for directories.
    """
    # os.path.join() with a single argument returns it unchanged, so it was dropped.
    return os.path.isfile(filename)

In [67]:
def saveCsvHeaders(filename, data) -> None:
    """Create (or truncate) ``filename`` and write ``data`` as its only row.

    Args:
        filename: Path of the CSV file. Any existing content is discarded
            because the file is opened in 'w' mode.
        data: Iterable of column labels, written as one ';'-delimited row.
    """
    # newline='' hands line-ending control to the csv module; without it the
    # writer's '\r\n' is translated again on Windows and blank rows appear.
    # The with-block closes the file, so no explicit close() is needed.
    with open(filename, mode='w', encoding="utf-8", newline='') as f:
        csv.writer(f, delimiter=';').writerow(data)

In [68]:
def saveCsvData(filename, data) -> None:
    """Append ``data`` as one ';'-delimited row at the end of ``filename``.

    The previous implementation also opened the file for reading and parsed
    its first line before appending; the parsed line was never used, so that
    step was removed.

    Args:
        filename: Path of the CSV file to append to. Append mode creates the
            file if it is missing, in which case it will have no header row.
        data: Iterable of values forming the row.
    """
    # newline='' hands line-ending control to the csv module (avoids blank
    # rows on Windows). The with-block closes the file on exit.
    with open(filename, mode='a', encoding='utf-8', newline='') as fwriter:
        csv.writer(fwriter, delimiter=';').writerow(data)

In [69]:
# Start a Chrome session with the 'headless' argument and load the page at URL.
# `driver` is the shared browser session used by every later cell.
option = webdriver.ChromeOptions()
option.add_argument('headless')
driver = webdriver.Chrome(options=option)
driver.get(URL)

In [70]:
# Sanity check: show the title of the page the driver just loaded.
driver.title

'Mega-Sena - Portal Loterias | CAIXA'

In [71]:
# Configure the implicit wait BEFORE the lookup so find_element keeps polling
# for up to 30 seconds instead of failing straight away when the element has
# not been rendered yet. The setting also applies to the later lookups.
driver.implicitly_wait(30)
a = driver.find_element(By.CLASS_NAME, 'zeta')
# Trigger the click from JavaScript on the located element.
driver.execute_script("arguments[0].click();", a)

In [72]:
# Show the last entry of the driver's window-handle list.
driver.window_handles[-1]

'CDwindow-388DCCB2C6EBF328C8C66C2A7813565A'

In [73]:
# Point the driver at the last window handle so later lookups run against that window.
# NOTE(review): presumably this is the window opened by the click above — confirm.
driver.switch_to.window(driver.window_handles[-1])

In [74]:
# Sanity check: show the title of the window the driver is now focused on.
driver.title

'Download de Resultados - Portal Loterias | CAIXA'

In [75]:
# Locate the table header, its row, and finally the individual <th> cells.
thead = driver.find_element(By.XPATH, X_PATH_THEAD)
# NOTE(review): X_PATH_THEAD_TR begins with '//', so it is presumably resolved
# from the document root rather than relative to `thead` — confirm.
tr = thead.find_element(By.XPATH, X_PATH_THEAD_TR)
# Despite the name, `thead_rows` holds the header's <th> cells (one per column).
thead_rows = tr.find_elements(By.TAG_NAME, 'th')

In [1]:
# Column labels: the text of each header cell, in the order the cells were found.
thead_columns: list[str] = [header_cell.text for header_cell in thead_rows]

NameError: name 'thead_rows' is not defined

In [106]:
# Write the header row only when the CSV does not exist yet. saveCsvHeaders
# opens the file in 'w' mode, so calling it unconditionally erases every row
# saved by an earlier run and leaves update() nothing to resume from.
if not is_file(FILE_NAME):
    saveCsvHeaders(FILE_NAME, thead_columns)

In [77]:
# Collect every element matched by XPATH_ROWS (the <tbody> nodes under the table);
# each one is later passed to cleanTexts as a single record.
rows = driver.find_elements(By.XPATH, XPATH_ROWS)

In [82]:
def currency_to_number(replace_dot_by_comma, remove_dot, remove_dollar, text) -> str:
    """Apply three regex substitutions to ``text`` and return the result.

    The steps run in this order:
      1. matches of ``remove_dollar`` are deleted;
      2. matches of ``remove_dot`` are deleted;
      3. matches of ``replace_dot_by_comma`` are replaced with ``'.'``.

    Args:
        replace_dot_by_comma: Pattern whose matches become ``'.'``.
        remove_dot: Pattern whose matches are removed.
        remove_dollar: Pattern whose matches are removed first.
        text: The string to clean.

    Returns:
        The cleaned string (still a ``str``, not a number).
    """
    without_symbol = re.sub(remove_dollar, '', text)
    without_separators = re.sub(remove_dot, '', without_symbol)
    return re.sub(replace_dot_by_comma, '.', without_separators)

In [83]:
def cleanTexts(element: WebElement, thead_rows) -> list:
    """Turn one scraped table record into a list of cleaned column values.

    Args:
        element: The element holding one record; all of its descendant <td>
            cells are read.
        thead_rows: The header cells; only their count is used, to size the
            result list.

    Returns:
        A list of ``len(thead_rows)`` values. Positions 0 and 2-7 are ints,
        position 1 is a ``datetime`` parsed as day/month/year, and the rest
        are strings. Positions beyond 20 (if any) stay ``None``.

    Raises:
        ValueError: If an integer or date cell cannot be parsed.
        IndexError: If the record has fewer cells, or the header fewer
            columns, than the positions filled here.

    NOTE(review): the last six columns are addressed from the end of the cell
    list — presumably because the nested table in cell 14 adds its own <td>
    elements to the flat list; confirm against the page markup.
    """

    def to_number(text: str) -> str:
        # All currency columns are cleaned with the same three module patterns.
        return currency_to_number(replace_dot_by_comma, remove_dot, remove_dollar, text)

    tds = element.find_elements(By.TAG_NAME, 'td')
    nested_bodies = tds[14].find_elements(By.TAG_NAME, 'tbody')
    cities = [body.text.replace('\n', '::').replace(' ', ':') for body in nested_bodies]

    lists = [None] * len(thead_rows)

    lists[0] = int(tds[0].text)
    lists[1] = datetime.strptime(tds[1].text, "%d/%m/%Y")
    for position in range(2, 8):
        lists[position] = int(tds[position].text)
    for position in range(8, 11):
        lists[position] = tds[position].text
    for position in range(11, 14):
        lists[position] = to_number(tds[position].text)

    # Guard against a cell 14 with no nested <tbody>: indexing an empty list
    # used to raise IndexError. An empty text still maps to '-'.
    lists[14] = cities[0].upper() if cities and cities[0] else '-'

    # Read each trailing cell once; every .text access is a round trip to the browser.
    tail = [tds[offset].text for offset in range(-6, 0)]

    lists[15] = to_number(tail[0] or 'R$0,00')
    lists[16] = to_number(re.sub(remove_two_words, 'R$0,00', tail[1]) if tail[1] else 'R$0,00')
    lists[17] = to_number(tail[2] or 'R$0,00')
    lists[18] = re.sub(remove_two_words, 'Não', re.sub(remove_currency, 'Não', tail[3])) if tail[3] else 'Sim'
    lists[19] = re.sub(remove_currency, 'Não', tail[4]) if tail[4] else 'Não'
    lists[20] = re.sub(remove_currency, '-', re.sub(remove_no, '-', tail[5])) if tail[5] else '-'

    return lists

In [100]:
def runner(filename: str, thead_rows: list, rows:list[WebElement], from_index = 0) -> None:
    """Clean and append every record from ``from_index`` onward, then report success.

    Args:
        filename: CSV file the cleaned rows are appended to.
        thead_rows: Header cells, forwarded to ``cleanTexts``.
        rows: All scraped record elements.
        from_index: Position in ``rows`` to start from (default: the beginning).
    """
    pending = rows[from_index:]
    for record_element in pending:
        cleaned = cleanTexts(record_element, thead_rows)
        saveCsvData(filename, data=cleaned)

    print('**** DATA PROCESSING WITH SUCCESS! ****')

In [101]:
def create(filename: str, thead_rows: list, rows:list[WebElement]) -> None:
    """Build the CSV from scratch: header row first, then every scraped record.

    The module-level ``driver`` is always shut down afterwards, even when
    scraping or writing fails.

    Args:
        filename: CSV file to create.
        thead_rows: Header cells; their text becomes the header row.
        rows: All scraped record elements.
    """
    print("******* CREATE DATA ******")

    try:
        # Start the file with its header row so the data rows appended by
        # runner() land underneath it. getattr() lets plain strings through
        # unchanged in case labels are passed instead of elements.
        saveCsvHeaders(filename, [getattr(cell, 'text', cell) for cell in thead_rows])
        runner(filename, thead_rows, rows)
    finally:
        # Do not leave the browser process running if anything above raised.
        driver.quit()

    print("******* FINISHED ******")

In [102]:
def update(filename: str, thead_rows: list, rows: list[WebElement]) -> None:
    """Append only the records that are not yet stored in ``filename``.

    The number of data rows already in the file (every non-blank row after
    the header) is used as the position in ``rows`` to resume from. The
    module-level ``driver`` is always shut down afterwards, even on failure.

    Args:
        filename: Existing CSV file to extend.
        thead_rows: Header cells, forwarded to ``runner``.
        rows: All scraped record elements, in the same order as the file.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    print("******* UPDATE DATA ******")

    try:
        # newline='' is what the csv module expects for files it parses.
        # Blank records are dropped so they cannot inflate the resume position.
        with open(filename, mode='r', encoding='utf-8', newline='') as freader:
            saved = [record for record in csv.reader(freader, delimiter=';') if record]

        if not saved:
            # Completely empty file: give it a header before any data is appended.
            saveCsvHeaders(filename, [getattr(cell, 'text', cell) for cell in thead_rows])

        # Header excluded, this is how many data rows are already stored.
        # Counting replaces a search for the last row's first occurrence, which
        # picked the wrong position for duplicate rows and failed on an empty file.
        index = max(len(saved) - 1, 0)

        if index < len(rows):
            runner(filename, thead_rows, rows, index)
    finally:
        # Do not leave the browser process running if anything above raised.
        driver.quit()

    print("******* FINISHED ******")

In [103]:
def run() -> None:
    """Scrape Mega-Sena results into FILE_NAME.

    Extends the CSV via ``update`` when it already exists, otherwise builds
    it via ``create``. Both receive the module-level ``thead_rows`` and ``rows``.
    """
    if is_file(FILE_NAME):
        update(FILE_NAME, thead_rows, rows)
    else:
        create(FILE_NAME, thead_rows, rows)

In [104]:
# Run the scrape; update() and create() both quit the driver when they finish,
# so the browser cells above must be re-run before calling this again.
run()

******* UPDATE DATA ******
**** DATA PROCESSING WITH SUCCESS! ****
******* FINISHED ******
