## Stocks Cash Flow - ETL Process

In [1]:
# Import Libraries
import time
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pymongo

############################################################

# Delete collection before inserting new data
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
client.Stocks_db.cash_flow.drop()

############################################################

###############  CBA  ###############
##### Extract #####
# Read the tables in the HTML page
# Scrape the Data
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

stock = 'CBA'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/cash-flow?p={stock}.AX'
browser.visit(url)

time.sleep(1)

html = browser.html
soup = bs(html, "html.parser")

# Get table headers
cashflow_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
cashflow_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Quit the browser
browser.quit()

# Get the table headers
table_headers_list = []
for header in cashflow_table_headers:
    for span in header.find_all('span'):
        table_headers_list.append(span.text)

# Get all the table rows
table_rows = []
row_list = []
for rows in cashflow_table_rows:
    for fields in rows:
        row_list.append(fields.text)
    # Add row to table rows
    table_rows.append(row_list)
    # Reset the row list for the next row
    row_list = []
    
# Convert the result into a DataFrame
cashflow_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Remove all the rows without values based on any of the financial year columns
cashflow_table_new = cashflow_table[cashflow_table['29/06/2021']!=''].copy()
# Replace all fields containing '-' with 0
cashflow_table_new = cashflow_table_new.replace('-',0)
# Set the index to Breakdown
cashflow_table_new = cashflow_table_new.set_index('Breakdown')

# Divide into series by financial year
#2017-2018
cashflow_table_new_17to18 = cashflow_table_new[['29/06/2018']].copy()
#2018-2019
cashflow_table_new_18to19 = cashflow_table_new[['29/06/2019']].copy()
#2019-2020
cashflow_table_new_19to20 = cashflow_table_new[['29/06/2020']].copy()
#2020-2021
cashflow_table_new_20to21 = cashflow_table_new[['29/06/2021']].copy()
# TTM
cashflow_table_new_ttm = cashflow_table_new[['ttm']].copy()

# Convert them to dictionaries
cashflow_table_new_17to18_dict = cashflow_table_new_17to18.to_dict()['29/06/2018']
cashflow_table_new_18to19_dict = cashflow_table_new_18to19.to_dict()['29/06/2019']
cashflow_table_new_19to20_dict = cashflow_table_new_19to20.to_dict()['29/06/2020']
cashflow_table_new_20to21_dict = cashflow_table_new_20to21.to_dict()['29/06/2021']
cashflow_table_new_ttm_dict = cashflow_table_new_ttm.to_dict()['ttm']
##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Load all dictionaries into a single dictionary
stock_cash_flow_reports = {'29/06/2018':cashflow_table_new_17to18_dict,
                         '29/06/2019':cashflow_table_new_18to19_dict,
                         '29/06/2020':cashflow_table_new_19to20_dict,
                         '29/06/2021':cashflow_table_new_20to21_dict,
                         'ttm':cashflow_table_new_ttm_dict}

stock_dict = {}

stock_dict[stock] = stock_cash_flow_reports

# Insert into Mongo db
client.Stocks_db.cash_flow.insert_one(stock_dict)
##### Load #####
###############  CBA   ###############

############################################################

###############  ANZ   ###############
##### Extract #####
# Read the tables in the HTML page
# Scrape the Data
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

stock = 'ANZ'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/cash-flow?p={stock}.AX'
browser.visit(url)

time.sleep(1)

html = browser.html
soup = bs(html, "html.parser")

# Get table headers
cashflow_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
cashflow_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Quit the browser
browser.quit()

# Get the table headers
table_headers_list = []
for header in cashflow_table_headers:
    for span in header.find_all('span'):
        table_headers_list.append(span.text)

# Get all the table rows
table_rows = []
row_list = []
for rows in cashflow_table_rows:
    for fields in rows:
        row_list.append(fields.text)
    # Add row to table rows
    table_rows.append(row_list)
    # Reset the row list for the next row
    row_list = []
    
# Convert the result into a DataFrame
cashflow_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####
##### Transform #####
# Remove all the rows without values based on any of the financial year columns
cashflow_table_new = cashflow_table[cashflow_table['29/09/2020']!=''].copy()
# Replace all fields containing '-' with 0
cashflow_table_new = cashflow_table_new.replace('-',0)
# Set the index to Breakdown
cashflow_table_new = cashflow_table_new.set_index('Breakdown')

# Divide into series by financial year
#2017-2018
cashflow_table_new_17to18 = cashflow_table_new[['29/09/2018']].copy()
#2018-2019
cashflow_table_new_18to19 = cashflow_table_new[['29/09/2019']].copy()
#2019-2020
cashflow_table_new_19to20 = cashflow_table_new[['29/09/2020']].copy()
# TTM
cashflow_table_new_ttm = cashflow_table_new[['ttm']].copy()

cashflow_table_new_17to18_dict = cashflow_table_new_17to18.to_dict()['29/09/2018']
cashflow_table_new_18to19_dict = cashflow_table_new_18to19.to_dict()['29/09/2019']
cashflow_table_new_19to20_dict = cashflow_table_new_19to20.to_dict()['29/09/2020']
cashflow_table_new_ttm_dict = cashflow_table_new_ttm.to_dict()['ttm']

##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

stock_dict = {}

# Load all dictionaries into a single dictionary
stock_cash_flow_reports = {'29/09/2018':cashflow_table_new_17to18_dict,
                         '29/09/2019':cashflow_table_new_18to19_dict,
                         '29/09/2020':cashflow_table_new_19to20_dict,
                         'ttm':cashflow_table_new_ttm_dict}

stock_dict[stock] = stock_cash_flow_reports

# Insert into MongoDB
client.Stocks_db.cash_flow.insert_one(stock_dict)
##### Load #####
###############  ANZ   ###############


[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\James\.wdm\drivers\chromedriver\win32\95.0.4638.17\chromedriver.exe] found in cache
[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\James\.wdm\drivers\chromedriver\win32\95.0.4638.17\chromedriver.exe] found in cache


<pymongo.results.InsertOneResult at 0x26ed08984c0>