## Stocks Income Statements - ETL Process

In [2]:
# Import Libraries
import time
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pymongo

############################################################

# Start each ETL run from a clean slate: drop the income_statements
# collection of Stocks_db (only that collection, not the whole database).
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
client['Stocks_db']['income_statements'].drop()

############################################################

###############  CBA  ###############
##### Extract #####
# Scrape the Yahoo Finance financials page for one ticker and load the
# income-statement table into a DataFrame of raw cell text.
stock = 'CBA'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/financials?p={stock}.AX'

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(url)

    # Short pause, then poll (up to 10 s) until at least one table row exists;
    # a fixed one-second sleep alone yields an empty table when the page is slow.
    time.sleep(1)
    browser.is_element_present_by_css('div.fi-row', wait_time=10)

    html = browser.html
finally:
    # Quit the browser even when the visit or the wait raises, so that no
    # Chrome / chromedriver process is left running.
    browser.quit()

# From here on only the captured HTML string is needed
soup = bs(html, "html.parser")

# Get table headers
income_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
income_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Column names: the text of every <span> found inside the header element(s)
table_headers_list = [
    span.text
    for header in income_table_headers
    for span in header.find_all('span')
]

# One list per table row, holding the text of each direct child of the row
table_rows = [[field.text for field in row] for row in income_table_rows]

# Convert the result into a DataFrame
income_statements = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Locate blank cells: a cell counts as missing when it is NaN or ''.
is_blank = income_statements.isnull() | income_statements.eq('')
blank_in_col = is_blank.any(axis=0)
blank_in_row = is_blank.any(axis=1)
missing_cols = blank_in_col[blank_in_col].index
missing_rows = blank_in_row[blank_in_row].index
# Inspection only: the selected cells are not stored anywhere
income_statements.loc[missing_rows, missing_cols]

# Drop the row with index label 1, key the rows by 'Breakdown',
# and replace every cell that is exactly '-' with 0
Income_statements_df = (
    income_statements
    .drop([1])
    .set_index('Breakdown')
    .replace('-', 0)
)

# For each reporting period: a single-column DataFrame copy, followed by
# the {Breakdown label: value} dictionary taken from that column.
# 2017-2018
Income_statements_df_2018 = Income_statements_df.loc[:, ['29/06/2018']].copy()
Income_statements_2018_dict = Income_statements_df_2018.to_dict()['29/06/2018']
# 2018-2019
Income_statements_df_2019 = Income_statements_df.loc[:, ['29/06/2019']].copy()
Income_statements_2019_dict = Income_statements_df_2019.to_dict()['29/06/2019']
# 2019-2020
Income_statements_df_2020 = Income_statements_df.loc[:, ['29/06/2020']].copy()
Income_statements_2020_dict = Income_statements_df_2020.to_dict()['29/06/2020']
# 2020-2021
Income_statements_df_2021 = Income_statements_df.loc[:, ['29/06/2021']].copy()
Income_statements_2021_dict = Income_statements_df_2021.to_dict()['29/06/2021']
# TTM
Income_statements_df_ttm = Income_statements_df.loc[:, ['ttm']].copy()
Income_statements_ttm_dict = Income_statements_df_ttm.to_dict()['ttm']
##### Transform #####

##### Load #####
# Group the per-period dictionaries into one report, keyed by period label
stock_income_statements = {
    '29/06/2018': Income_statements_2018_dict,
    '29/06/2019': Income_statements_2019_dict,
    '29/06/2020': Income_statements_2020_dict,
    '29/06/2021': Income_statements_2021_dict,
    'ttm': Income_statements_ttm_dict,
}
# Wrap the report in a document whose single key is the ticker symbol
stock_reports = {stock: stock_income_statements}

# Insert the document into the income_statements collection of Stocks_db
client['Stocks_db']['income_statements'].insert_one(stock_reports)
##### Load #####
###############  CBA   ###############

############################################################

###############  ANZ   ###############
##### Extract #####
# Scrape the Yahoo Finance financials page for one ticker and load the
# income-statement table into a DataFrame of raw cell text.
stock = 'ANZ'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/financials?p={stock}.AX'

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(url)

    # Short pause, then poll (up to 10 s) until at least one table row exists;
    # a fixed one-second sleep alone yields an empty table when the page is slow.
    time.sleep(1)
    browser.is_element_present_by_css('div.fi-row', wait_time=10)

    html = browser.html
finally:
    # Quit the browser even when the visit or the wait raises, so that no
    # Chrome / chromedriver process is left running.
    browser.quit()

# From here on only the captured HTML string is needed
soup = bs(html, "html.parser")

# Get table headers
income_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
income_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Column names: the text of every <span> found inside the header element(s)
table_headers_list = [
    span.text
    for header in income_table_headers
    for span in header.find_all('span')
]

# One list per table row, holding the text of each direct child of the row
table_rows = [[field.text for field in row] for row in income_table_rows]

# Convert the result into a DataFrame
income_statements = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Locate blank cells: a cell counts as missing when it is NaN or ''.
is_blank = income_statements.isnull() | income_statements.eq('')
blank_in_col = is_blank.any(axis=0)
blank_in_row = is_blank.any(axis=1)
missing_cols = blank_in_col[blank_in_col].index
missing_rows = blank_in_row[blank_in_row].index
# Inspection only: the selected cells are not stored anywhere
income_statements.loc[missing_rows, missing_cols]

# Drop the row with index label 1, key the rows by 'Breakdown',
# and replace every cell that is exactly '-' with 0
Income_statements_df = (
    income_statements
    .drop([1])
    .set_index('Breakdown')
    .replace('-', 0)
)

# For each reporting period: a single-column DataFrame copy, followed by
# the {Breakdown label: value} dictionary taken from that column.
# 2017-2018
Income_statements_df_2018 = Income_statements_df.loc[:, ['29/09/2018']].copy()
Income_statements_2018_dict = Income_statements_df_2018.to_dict()['29/09/2018']
# 2018-2019
Income_statements_df_2019 = Income_statements_df.loc[:, ['29/09/2019']].copy()
Income_statements_2019_dict = Income_statements_df_2019.to_dict()['29/09/2019']
# 2019-2020
Income_statements_df_2020 = Income_statements_df.loc[:, ['29/09/2020']].copy()
Income_statements_2020_dict = Income_statements_df_2020.to_dict()['29/09/2020']
# TTM
Income_statements_df_ttm = Income_statements_df.loc[:, ['ttm']].copy()
Income_statements_ttm_dict = Income_statements_df_ttm.to_dict()['ttm']
##### Transform #####

##### Load #####
# Reuse the MongoClient created at the top of the script (the same
# 'mongodb://localhost:27017' address) rather than opening a second one:
# a new client would leave the first one's connections open and unused.

# Group the per-period dictionaries into one report, keyed by period label
stock_income_statements = {
    '29/09/2018': Income_statements_2018_dict,
    '29/09/2019': Income_statements_2019_dict,
    '29/09/2020': Income_statements_2020_dict,
    'ttm': Income_statements_ttm_dict,
}
# Wrap the report in a document whose single key is the ticker symbol
stock_reports = {stock: stock_income_statements}

# Insert the document into the income_statements collection of Stocks_db
client.Stocks_db.income_statements.insert_one(stock_reports)
##### Load #####
###############  ANZ   ###############

[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\James\.wdm\drivers\chromedriver\win32\95.0.4638.17\chromedriver.exe] found in cache
[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\James\.wdm\drivers\chromedriver\win32\95.0.4638.17\chromedriver.exe] found in cache


<pymongo.results.InsertOneResult at 0x23a4632cec0>