# Setup

In [None]:
# Import Libraries
import time
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pymongo
import os

# Set variables for the whole process
# `user` is interpolated into the C:\Users\<user>\... path that the stock
# history cells use as the Chrome download directory, so replace the
# placeholder with a real account name before running those cells.
user = 'Insert Here'

# Delete Database

In [None]:
# Drop the whole Stocks_db database (not just one collection) on the local
# MongoDB server so the ETL cells below start from an empty database
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
client.drop_database('Stocks_db')

## Stocks Summary Table - ETL Process

In [None]:
###############  CBA  ###############
# Scrape the quote-summary tables for CBA from Yahoo Finance, reshape them
# into a {metric: value} mapping and store it in Stocks_db.summary.
# This section also empties that collection before inserting.
##### Extract #####
stock = 'CBA'

# start web browser
browser = webdriver.Chrome(ChromeDriverManager().install())
try:
    # get source code
    browser.get(f"https://au.finance.yahoo.com/quote/{stock}.AX?p={stock}.AX")
    # Wait BEFORE capturing the source; sleeping after the capture (as the
    # code used to) cannot affect the HTML that gets parsed
    time.sleep(2)
    html = browser.page_source
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    # Running it in `finally` releases the browser even if the page load fails.
    browser.quit()

# Parse the HTML tables in the captured page into a list of DataFrames
df = pd.read_html(html)

# Get each table from the table list
summary_table_1 = df[0]
summary_table_2 = df[1]
# Stack the two tables; pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0 (the original row labels are kept, as append did)
total_summary_table = pd.concat([summary_table_1, summary_table_2])
# Rename the positional columns 0 and 1
total_summary_table = total_summary_table.rename(columns={0: 'Summary Metric', 1: 'Value'})
##### Extract #####

##### Transform #####
# Keep only the rows whose Value is not an empty string
summary_table_new = total_summary_table[total_summary_table['Value'] != ''].copy()

# Replace cells that are exactly '-' with 0
summary_table_new = summary_table_new.replace('-', 0)

# Set the index to Summary Metric
summary_table_new = summary_table_new.set_index('Summary Metric')

# Keep just the Value column (as a one-column DataFrame)
summary_table_stock_value = summary_table_new[['Value']].copy()

# Convert it to a dictionary of the form {'Value': {metric: value}}
summary_df = summary_table_stock_value.to_dict()
##### Transform #####

##### Load #####
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Drop the existing summary collection (not the whole database)
client.Stocks_db.summary.drop()

# Insert into database, keyed by the ticker
stock_summary_report = {stock: summary_df}

client.Stocks_db.summary.insert_one(stock_summary_report)
##### Load #####
###############  CBA   ###############

############################################################

###############  ANZ   ###############
# Scrape the quote-summary tables for ANZ from Yahoo Finance, reshape them
# into a {metric: value} mapping and add it to Stocks_db.summary.
##### Extract #####
stock = 'ANZ'

# start web browser
browser = webdriver.Chrome(ChromeDriverManager().install())
try:
    # get source code
    browser.get(f"https://au.finance.yahoo.com/quote/{stock}.AX?p={stock}.AX")
    # Wait BEFORE capturing the source; sleeping after the capture (as the
    # code used to) cannot affect the HTML that gets parsed
    time.sleep(2)
    html = browser.page_source
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    # Running it in `finally` releases the browser even if the page load fails.
    browser.quit()

# Parse the HTML tables in the captured page into a list of DataFrames
df = pd.read_html(html)

# Get each table from the table list
summary_table_1 = df[0]
summary_table_2 = df[1]
# Stack the two tables; pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0 (the original row labels are kept, as append did)
total_summary_table = pd.concat([summary_table_1, summary_table_2])
# Rename the positional columns 0 and 1
total_summary_table = total_summary_table.rename(columns={0: 'Summary Metric', 1: 'Value'})
##### Extract #####

##### Transform #####
# Keep only the rows whose Value is not an empty string
summary_table_new = total_summary_table[total_summary_table['Value'] != ''].copy()

# Replace cells that are exactly '-' with 0
summary_table_new = summary_table_new.replace('-', 0)

# Set the index to Summary Metric
summary_table_new = summary_table_new.set_index('Summary Metric')

# Keep just the Value column (as a one-column DataFrame)
summary_table_stock_value = summary_table_new[['Value']].copy()

# Convert it to a dictionary of the form {'Value': {metric: value}}
summary_df = summary_table_stock_value.to_dict()
##### Transform #####

##### Load #####
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Insert into database, keyed by the ticker (the collection is not dropped
# in this section, so the document is added to whatever is already there)
stock_summary_report = {stock: summary_df}

client.Stocks_db.summary.insert_one(stock_summary_report)
##### Load #####
###############  ANZ  ###############

## Stock History Year Month Average - ETL Process

In [None]:
###############  CBA   ###############
# Download the daily price history CSV for CBA from Yahoo Finance, average
# every numeric column per calendar month and store the result in
# Stocks_db.stock_history_average.
##### Extract #####
# Request window in Unix time: from 01/10/1991 up to now
d = datetime.today()
unixtime = time.mktime(d.timetuple())
starting_date = round(time.mktime(datetime.strptime("01/10/1991", "%d/%m/%Y").timetuple()))
current_time = round(unixtime)

file_directory = 'Desktop\\yahoo-finance-anz-cba-etl'
stock = 'CBA'

# Single source of truth for where Chrome saves the file and where it is
# later read from
download_dir = f"C:\\Users\\{user}\\{file_directory}"
filename = f'{stock}.AX.csv'
filepath = os.path.join(download_dir, filename)

# Remove the stock history CSV file if it already exists
if os.path.exists(filepath):
    os.remove(filepath)

# Change the download directory
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": download_dir}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=chromeOptions)

# Get the stock history CSV file
url = f'https://query1.finance.yahoo.com/v7/finance/download/{stock}.AX?period1={starting_date}&period2={current_time}&interval=1d&events=history&includeAdjustedClose=true'
try:
    driver.get(url)
    # Fixed pause to let the download finish before the browser is shut down
    time.sleep(5)
finally:
    # Release the browser even if the request fails
    driver.quit()
##### Extract #####

##### Transform #####
# Read the CSV from the download directory it was saved to; reading by bare
# file name only worked when the notebook happened to run from that directory
df = pd.read_csv(filepath)
# Year = characters 0-3 and Month = characters 5-6 of the Date string
df['Year'] = df['Date'].str.slice(start=0, stop=4)
df['Month'] = df['Date'].str.slice(start=5, stop=7)
# Remove all the rows that contain a null value
df = df.dropna()
# Average the numeric columns for each (Year, Month). numeric_only=True skips
# the string Date column, which would otherwise raise on pandas >= 2.0
df_new = df.groupby(['Year', 'Month']).mean(numeric_only=True).round(6)
# Round the average 'Volume' to 0 decimal places
df_new['Volume'] = df_new['Volume'].round()
# Round the average 'Open', 'High', 'Low', 'Close', and 'Adj Close' prices to 2 decimal places
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
df_new[price_columns] = df_new[price_columns].round(2)
# Create a new index called 'Year-Month-Average' of the form 'YYYY-MM'
df_new = df_new.reset_index()
df_new['Year-Month-Average'] = df_new['Year'] + '-' + df_new['Month']
df_new = df_new.drop(['Year', 'Month'], axis=1)
df_new = df_new.set_index('Year-Month-Average')
# Transpose so that each 'YYYY-MM' label becomes a column
df_new_tran = df_new.copy().T
# Convert to a dictionary of the form {'YYYY-MM': {column: average}}
df_dict = df_new_tran.to_dict()
##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One document keyed by the ticker
stock_dict = {stock: df_dict}

client.Stocks_db.stock_history_average.insert_one(stock_dict)
##### Load #####
###############  CBA   ###############

###############  ANZ   ###############
# Download the daily price history CSV for ANZ from Yahoo Finance, average
# every numeric column per calendar month and store the result in
# Stocks_db.stock_history_average.
# Relies on `user` and `file_directory` having been set earlier in the notebook.
##### Extract #####
# Request window in Unix time: from 01/10/1991 up to now
d = datetime.today()
unixtime = time.mktime(d.timetuple())
starting_date = round(time.mktime(datetime.strptime("01/10/1991", "%d/%m/%Y").timetuple()))
current_time = round(unixtime)

stock = 'ANZ'

# Single source of truth for where Chrome saves the file and where it is
# later read from
download_dir = f"C:\\Users\\{user}\\{file_directory}"
filename = f'{stock}.AX.csv'
filepath = os.path.join(download_dir, filename)

# Remove the stock history CSV file if it already exists
if os.path.exists(filepath):
    os.remove(filepath)

# Change the download directory
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory": download_dir}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=chromeOptions)

# Get the stock history CSV file
url = f'https://query1.finance.yahoo.com/v7/finance/download/{stock}.AX?period1={starting_date}&period2={current_time}&interval=1d&events=history&includeAdjustedClose=true'
try:
    driver.get(url)
    # Fixed pause to let the download finish before the browser is shut down
    time.sleep(5)
finally:
    # Release the browser even if the request fails
    driver.quit()
##### Extract #####

##### Transform #####
# Read the CSV from the download directory it was saved to; reading by bare
# file name only worked when the notebook happened to run from that directory
df = pd.read_csv(filepath)
# Year = characters 0-3 and Month = characters 5-6 of the Date string
df['Year'] = df['Date'].str.slice(start=0, stop=4)
df['Month'] = df['Date'].str.slice(start=5, stop=7)
# Remove all the rows that contain a null value
df = df.dropna()
# Average the numeric columns for each (Year, Month). numeric_only=True skips
# the string Date column, which would otherwise raise on pandas >= 2.0
df_new = df.groupby(['Year', 'Month']).mean(numeric_only=True).round(6)
# Round the average 'Volume' to 0 decimal places
df_new['Volume'] = df_new['Volume'].round()
# Round the average 'Open', 'High', 'Low', 'Close', and 'Adj Close' prices to 2 decimal places
price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
df_new[price_columns] = df_new[price_columns].round(2)
# Create a new index called 'Year-Month-Average' of the form 'YYYY-MM'
df_new = df_new.reset_index()
df_new['Year-Month-Average'] = df_new['Year'] + '-' + df_new['Month']
df_new = df_new.drop(['Year', 'Month'], axis=1)
df_new = df_new.set_index('Year-Month-Average')
# Transpose so that each 'YYYY-MM' label becomes a column
df_new_tran = df_new.copy().T
# Convert to a dictionary of the form {'YYYY-MM': {column: average}}
df_dict = df_new_tran.to_dict()
##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One document keyed by the ticker
stock_dict = {stock: df_dict}

client.Stocks_db.stock_history_average.insert_one(stock_dict)
##### Load #####
###############  ANZ   ###############

## Stocks Income Statements - ETL Process

In [None]:
###############  CBA  ###############
# Scrape the CBA income-statement grid from Yahoo Finance, split it by
# reporting period and store the result in Stocks_db.income_statements.
##### Extract #####
stock = 'CBA'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/financials?p={stock}.AX'

# Scrape the Data
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(url)
    # Fixed pause before the HTML is captured
    time.sleep(1)
    html = browser.html
finally:
    # Release the browser even if the visit fails; the parsing below works on
    # the captured HTML string and no longer needs the browser
    browser.quit()

soup = bs(html, "html.parser")

# Get table headers
income_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
income_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    span.text
    for header in income_table_headers
    for span in header.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[field.text for field in row] for row in income_table_rows]

# Convert the result into a DataFrame
income_statements = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Diagnostic only: labels of the columns / rows holding at least one NaN or
# empty-string cell. Nothing below depends on these three names.
missing_cols, missing_rows = (
    (income_statements.isnull().sum(x) | income_statements.eq('').sum(x))
    .loc[lambda x: x.gt(0)].index
    for x in (0, 1)
)
missing_entries = income_statements.loc[missing_rows, missing_cols]

# Drop the row labelled 1 and index by the 'Breakdown' column
# NOTE(review): presumably row 1 is a row without figures - confirm against the page
Income_statements_df = income_statements.drop([1]).set_index('Breakdown')
# Replace cells that are exactly '-' with 0
Income_statements_df = Income_statements_df.replace('-', 0)

# Reporting-period columns to export, in the order they are stored
report_periods = ['29/06/2018', '29/06/2019', '29/06/2020', '29/06/2021', 'ttm']
##### Transform #####

##### Load #####
# Open the connection here instead of relying on a `client` left over from an
# earlier cell, matching the other Load sections
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Group one {breakdown: value} dictionary per period into a single report
stock_income_statements = {
    period: Income_statements_df[period].to_dict()
    for period in report_periods
}
stock_reports = {stock: stock_income_statements}

#Insert object into MongoDB
client.Stocks_db.income_statements.insert_one(stock_reports)
##### Load #####
###############  CBA   ###############

############################################################

###############  ANZ   ###############
# Scrape the ANZ income-statement grid from Yahoo Finance, split it by
# reporting period and store the result in Stocks_db.income_statements.
##### Extract #####
stock = 'ANZ'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/financials?p={stock}.AX'

# Scrape the Data
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(url)
    # Fixed pause before the HTML is captured
    time.sleep(1)
    html = browser.html
finally:
    # Release the browser even if the visit fails; the parsing below works on
    # the captured HTML string and no longer needs the browser
    browser.quit()

soup = bs(html, "html.parser")

# Get table headers
income_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
income_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    span.text
    for header in income_table_headers
    for span in header.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[field.text for field in row] for row in income_table_rows]

# Convert the result into a DataFrame
income_statements = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Diagnostic only: labels of the columns / rows holding at least one NaN or
# empty-string cell. Nothing below depends on these three names.
missing_cols, missing_rows = (
    (income_statements.isnull().sum(x) | income_statements.eq('').sum(x))
    .loc[lambda x: x.gt(0)].index
    for x in (0, 1)
)
missing_entries = income_statements.loc[missing_rows, missing_cols]

# Drop the row labelled 1 and index by the 'Breakdown' column
# NOTE(review): presumably row 1 is a row without figures - confirm against the page
Income_statements_df = income_statements.drop([1]).set_index('Breakdown')

# Replace cells that are exactly '-' with 0
Income_statements_df = Income_statements_df.replace('-', 0)

# Reporting-period columns to export, in the order they are stored
report_periods = ['29/09/2018', '29/09/2019', '29/09/2020', 'ttm']
##### Transform #####

##### Load #####
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Group one {breakdown: value} dictionary per period into a single report.
# Each key is the source column label itself, so the 2019 figures are stored
# under '29/09/2019' (previously the mistyped key '29/069/2019').
stock_income_statements = {
    period: Income_statements_df[period].to_dict()
    for period in report_periods
}
stock_reports = {stock: stock_income_statements}

#Insert object into MongoDB
client.Stocks_db.income_statements.insert_one(stock_reports)
##### Load #####
###############  ANZ   ###############

## Balance Sheets - ETL Process

In [None]:
###############  CBA  ###############
# Scrape the CBA balance-sheet grid from Yahoo Finance, split it by financial
# year and store the result in Stocks_db.balance_sheets.
##### Extract #####
# Launch Chrome through splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)

stock = 'CBA'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/balance-sheet?p={stock}.AX'
browser.visit(url)

# Fixed pause before the HTML is captured
time.sleep(1)

html = browser.html
soup = bs(html, "html.parser")

# Header divs and data-row divs of the grid
balance_sheet_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
balance_sheet_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# The parsed soup is all that is needed from here on
browser.quit()

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    title.text
    for header_div in balance_sheet_headers
    for title in header_div.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[cell.text for cell in row_div] for row_div in balance_sheet_rows]

# Assemble the scraped grid into a DataFrame
balance_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Keep the rows whose '29/06/2021' cell is not empty, swap cells that are
# exactly '-' for 0, then index by the 'Breakdown' column
has_latest_figure = balance_table['29/06/2021'] != ''
balance_table_new = (
    balance_table[has_latest_figure]
    .copy()
    .replace('-', 0)
    .set_index('Breakdown')
)

# Financial-year columns to export, oldest first
financial_years = ('29/06/2018', '29/06/2019', '29/06/2020', '29/06/2021')
##### Transform #####

##### Load #####
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One {breakdown: value} dictionary per financial year
stock_balance_reports = {
    year_end: balance_table_new[year_end].to_dict()
    for year_end in financial_years
}

# Insert into MongoDB as a single document keyed by the ticker
stock_dict = {stock: stock_balance_reports}

client.Stocks_db.balance_sheets.insert_one(stock_dict)
##### Load #####
###############  CBA  ###############

############################################################

###############  ANZ  ###############
# Scrape the ANZ balance-sheet grid from Yahoo Finance, split it by financial
# year and store the result in Stocks_db.balance_sheets.
##### Extract #####
stock = 'ANZ'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/balance-sheet?p={stock}.AX&.tsrc=fin-srch'

# Scrape the Data
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
try:
    browser.visit(url)
    # Fixed pause before the HTML is captured
    time.sleep(1)
    html = browser.html
finally:
    # Release the browser even if the visit fails
    browser.quit()

soup = bs(html, "html.parser")

# Get table headers
balance_sheet_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
# Get table rows
balance_sheet_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    span.text
    for header in balance_sheet_headers
    for span in header.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[field.text for field in row] for row in balance_sheet_rows]

# Convert the result into a DataFrame
balance_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Keep only the rows whose '29/09/2020' cell is not an empty string
balance_table_new = balance_table[balance_table['29/09/2020'] != ''].copy()
# Replace cells that are exactly '-' with 0
balance_table_new = balance_table_new.replace('-', 0)
# Set the index to Breakdown
balance_table_new = balance_table_new.set_index('Breakdown')

# Financial-year columns to export, oldest first
financial_years = ('29/09/2018', '29/09/2019', '29/09/2020')
##### Transform #####

##### Load #####
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One {breakdown: value} dictionary per financial year. Each key is the source
# column label itself; previously the '29/09/YYYY' figures were stored under
# '29/06/YYYY' keys copied from the CBA section.
stock_balance_reports = {
    year_end: balance_table_new[year_end].to_dict()
    for year_end in financial_years
}

# Insert into MongoDB as a single document keyed by the ticker
stock_dict = {stock: stock_balance_reports}

client.Stocks_db.balance_sheets.insert_one(stock_dict)
##### Load #####
###############  ANZ  ###############

## Stocks Cash Flow - ETL Process

In [None]:
###############  CBA  ###############
# Scrape the CBA cash-flow grid from Yahoo Finance, split it by reporting
# period and store the result in Stocks_db.cash_flow.
##### Extract #####
# Launch Chrome through splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)

stock = 'CBA'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/cash-flow?p={stock}.AX'
browser.visit(url)

# Fixed pause before the HTML is captured
time.sleep(1)

html = browser.html
soup = bs(html, "html.parser")

# Header divs and data-row divs of the grid
cashflow_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
cashflow_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# The parsed soup is all that is needed from here on
browser.quit()

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    title.text
    for header_div in cashflow_table_headers
    for title in header_div.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[cell.text for cell in row_div] for row_div in cashflow_table_rows]

# Assemble the scraped grid into a DataFrame
cashflow_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####

##### Transform #####
# Keep the rows whose '29/06/2021' cell is not empty, swap cells that are
# exactly '-' for 0, then index by the 'Breakdown' column
has_latest_figure = cashflow_table['29/06/2021'] != ''
cashflow_table_new = (
    cashflow_table[has_latest_figure]
    .copy()
    .replace('-', 0)
    .set_index('Breakdown')
)

# Reporting-period columns to export, in the order they are stored
report_periods = ('29/06/2018', '29/06/2019', '29/06/2020', '29/06/2021', 'ttm')
##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One {breakdown: value} dictionary per reporting period
stock_cash_flow_reports = {
    period: cashflow_table_new[period].to_dict()
    for period in report_periods
}

# Insert into Mongo db as a single document keyed by the ticker
stock_dict = {stock: stock_cash_flow_reports}

client.Stocks_db.cash_flow.insert_one(stock_dict)
##### Load #####
###############  CBA   ###############

############################################################

###############  ANZ   ###############
# Scrape the ANZ cash-flow grid from Yahoo Finance, split it by reporting
# period and store the result in Stocks_db.cash_flow.
##### Extract #####
# Launch Chrome through splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', headless=False, **executable_path)

stock = 'ANZ'
url = f'https://au.finance.yahoo.com/quote/{stock}.AX/cash-flow?p={stock}.AX'
browser.visit(url)

# Fixed pause before the HTML is captured
time.sleep(1)

html = browser.html
soup = bs(html, "html.parser")

# Header divs and data-row divs of the grid
cashflow_table_headers = soup.find_all('div', class_='D(tbr) C($primaryColor)')
cashflow_table_rows = soup.find_all('div', class_='D(tbr) fi-row Bgc($hoverBgColor):h')

# The parsed soup is all that is needed from here on
browser.quit()

# Column titles: the text of every <span> found inside the header divs
table_headers_list = [
    title.text
    for header_div in cashflow_table_headers
    for title in header_div.find_all('span')
]

# One list of cell texts per scraped row div
table_rows = [[cell.text for cell in row_div] for row_div in cashflow_table_rows]

# Assemble the scraped grid into a DataFrame
cashflow_table = pd.DataFrame(table_rows, columns=table_headers_list)
##### Extract #####
##### Transform #####
# Keep the rows whose '29/09/2020' cell is not empty, swap cells that are
# exactly '-' for 0, then index by the 'Breakdown' column
has_latest_figure = cashflow_table['29/09/2020'] != ''
cashflow_table_new = (
    cashflow_table[has_latest_figure]
    .copy()
    .replace('-', 0)
    .set_index('Breakdown')
)

# Reporting-period columns to export, in the order they are stored
report_periods = ('29/09/2018', '29/09/2019', '29/09/2020', 'ttm')

##### Transform #####

##### Load #####
# Load Into Mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# One {breakdown: value} dictionary per reporting period
stock_cash_flow_reports = {
    period: cashflow_table_new[period].to_dict()
    for period in report_periods
}

# Insert into MongoDB as a single document keyed by the ticker
stock_dict = {stock: stock_cash_flow_reports}

client.Stocks_db.cash_flow.insert_one(stock_dict)
##### Load #####
###############  ANZ   ###############