## Scraping Wealthsimple Dividends with Scrapy

In [None]:
# import Selector from scrapy, plus datetime and pandas
from scrapy import Selector
from datetime import datetime
import pandas as pd

    - Get the HTML data first, using the JavaScript code in the info file.

In [None]:
# Read the saved page into memory. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
# NOTE(review): assumes the scraped file was saved as UTF-8 — confirm.
with open('wealthsimple_scraped.html', 'r', encoding='utf-8') as f:
    html = f.read()

# Wrap the markup in a Selector so it can be queried with XPath below.
sel = Selector(text=html)

In [None]:
# Earlier, looser selector kept for reference (matches on a partial class name):
# div_xpath = '//*[contains(@class,"kFJHYd")]'

# Each expression selects the text nodes of elements whose class attribute is
# exactly the given string.
# NOTE(review): these class names look auto-generated and may change when the
# site is rebuilt — confirm against a fresh copy of the page.
symbol_xpath = '//*[@class="sc-204449cf-0 jUrmPc"]/text()'
date_xpath = '//*[@class="sc-204449cf-0 bIIoGo"]/text()'
payout_xpath = '//*[@class="sc-204449cf-0 bgBEGg"]/text()'

In [None]:
#  Sanity check: show the first match for the symbol, date and payout selectors
for _first_xpath in (symbol_xpath, date_xpath, payout_xpath):
    print(sel.xpath(_first_xpath).extract_first())

In [None]:
#  Show every match for the symbol, date and payout selectors
for _all_xpath in (symbol_xpath, date_xpath, payout_xpath):
    print(sel.xpath(_all_xpath).extract())

In [None]:
#  split each "<account> | <date>" entry into an account label and a date object
def convert_to_datetime(date_list):
    """Separate account labels from dates in '|'-delimited strings.

    Each item is split on '|'. The first field (stripped) is taken as the
    account label and the second field (stripped) is parsed with the format
    '%b %d, %Y' (e.g. "Aug 17, 2023") and reduced to a ``date``. Any fields
    after the second are ignored.

    Args:
        date_list: iterable of strings shaped like "TFSA | Aug 17, 2023".

    Returns:
        A tuple ``(accounts, dates)`` of two parallel lists, in input order.

    Raises:
        IndexError: if an item contains no '|' separator.
        ValueError: if the date field does not match '%b %d, %Y'.
    """
    parsed = []
    for entry in date_list:
        fields = entry.split('|')
        account_label = fields[0].strip()
        day = datetime.strptime(fields[1].strip(), '%b %d, %Y').date()
        parsed.append((account_label, day))

    # unzip the (account, date) pairs back into two parallel lists
    accounts = [label for label, _ in parsed]
    days = [when for _, when in parsed]
    return accounts, days

In [None]:
# pull every matching text node for the symbol and payout fields
symbol_list = sel.xpath(symbol_xpath).extract()
payout_list = sel.xpath(payout_xpath).extract()
# each raw date entry carries "<account> | <date>"; split it into two parallel lists
account_list, date_list = convert_to_datetime(sel.xpath(date_xpath).extract())

# one (symbol, account, date, payout) tuple per dividend
dividends_paid_list = [*zip(symbol_list, account_list, date_list, payout_list)]

In [None]:
#  convert to dataframe
df = pd.DataFrame(dividends_paid_list, columns=['symbol', 'type', 'date', 'payout'])

# Parse the dates once and derive every formatted column from that single
# result, instead of formatting to text and re-parsing the text twice.
parsed_dates = pd.to_datetime(df['date'])
# put the date column into this format: Aug 17, 2023
df['date'] = parsed_dates.dt.strftime('%b %d, %Y')
# create a column for the month (abbreviated name)
df['month'] = parsed_dates.dt.strftime('%b')
# create a column for the year
df['year'] = parsed_dates.dt.strftime('%Y')

#  move the payout column to the end
cols = [col for col in df.columns if col != 'payout']
df = df[cols + ['payout']]
# drop the ' USD' suffix; regex=False makes this a plain-text replacement
df['payout'] = df['payout'].str.replace(' USD', '', regex=False)

df

In [None]:
#  save the dataframe to a csv file; index=False leaves the row index out of the file
df.to_csv('wealthsimple_dividends.csv', index=False)

## Create a new dataframe with only the account type, symbol, and payout so I can add them up and append to the UFX Dashboard

In [99]:
# read the wealthsimple_dividends.csv file written by the earlier to_csv cell
wealthsimple_df = pd.read_csv('wealthsimple_dividends.csv')

In [100]:
# keep only the account type, symbol and payout columns; .copy() yields an
# independent frame so the column assignment below does not write to a slice
# of the original (which triggers pandas' SettingWithCopyWarning)
wealthsimple_df = wealthsimple_df[['type', 'symbol', 'payout']].copy()

# strip the '$' sign and any thousands separators, then convert payout to float
wealthsimple_df['payout'] = (
    wealthsimple_df['payout']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# sort the dataframe by symbol, then rebuild a clean 0..n-1 index
wealthsimple_df = wealthsimple_df.sort_values(by=['symbol'])
wealthsimple_df = wealthsimple_df.reset_index(drop=True)

# NOTE(review): if a (type, symbol) pair occurs more than once here, the left
# merge onto the dashboard later in this notebook will emit one row per payout
# rather than one summed row — confirm whether payouts should be aggregated first.
wealthsimple_df

Unnamed: 0,type,symbol,payout
0,TFSA,ARR,0.64
1,RRSP,ARR,1.12
2,TFSA,AVK,0.82
3,TFSA,CIF,0.85
4,TFSA,CIM,1.26
5,TFSA,CLM,1.23
6,RRSP,CLM,0.86
7,TFSA,DX,0.91
8,TFSA,EHI,0.94
9,RRSP,EMD,0.76


In [107]:
# read in the dividend dashboard export and keep all rows, even duplicates
# NOTE(review): the file read here is 'google_sheets.csv'; the original comment
# said the data comes from OneDrive — confirm the actual source.

dividend_dashboard_df = pd.read_csv('google_sheets.csv')
# change NaN to 0 in the Div. Earned column
# (the same fill is applied again in the next cell, after the '$' is stripped)
dividend_dashboard_df['Div. Earned'] = dividend_dashboard_df['Div. Earned'].fillna(0)


In [108]:
# strip the '$' sign and any thousands separators (e.g. "$1,234.56") from the
# Div. Earned column so the float conversion below cannot fail on them
dividend_dashboard_df['Div. Earned'] = (
    dividend_dashboard_df['Div. Earned']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
# non-string entries (NaN, or a previously filled 0) come back from .str as NaN;
# fill them with 0
dividend_dashboard_df['Div. Earned'] = dividend_dashboard_df['Div. Earned'].fillna(0)
# convert the Div. Earned column to float
dividend_dashboard_df['Div. Earned'] = dividend_dashboard_df['Div. Earned'].astype(float)
dividend_dashboard_df

Unnamed: 0,type,Ticker,Div. Earned
0,TFSA,ARR,20.3
1,RRSP,ARR,30.38
2,TFSA,AVK,13.12
3,TFSA,CIF,1.73
4,TFSA,CIM,18.08
5,TFSA,CLM,24.66
6,RRSP,CLM,23.52
7,TFSA,DX,9.1
8,TFSA,EHI,3.76
9,RRSP,EMD,14.78


In [109]:
# Left-join the scraped payouts onto the dashboard: rows match when the
# dashboard's 'Ticker' equals the payout's 'symbol' and both share the same
# 'type'. Dashboard rows without a match are kept, with NaN in the payout columns.
merged_df = pd.merge(
    dividend_dashboard_df,
    wealthsimple_df,
    how='left',
    left_on=['Ticker', 'type'],
    right_on=['symbol', 'type'],
)

merged_df


Unnamed: 0,type,Ticker,Div. Earned,symbol,payout
0,TFSA,ARR,20.3,ARR,0.64
1,RRSP,ARR,30.38,ARR,1.12
2,TFSA,AVK,13.12,AVK,0.82
3,TFSA,CIF,1.73,CIF,0.85
4,TFSA,CIM,18.08,CIM,1.26
5,TFSA,CLM,24.66,CLM,1.23
6,RRSP,CLM,23.52,CLM,0.86
7,TFSA,DX,9.1,DX,0.91
8,TFSA,EHI,3.76,EHI,0.94
9,RRSP,EMD,14.78,EMD,0.76


In [111]:
# dashboard rows with no matching scraped payout have NaN after the left merge;
# treat those as a payout of 0
merged_df['payout'] = merged_df['payout'].fillna(0)

# create a new column called 'Total Dividends'; round to cents so binary
# floating-point artifacts (e.g. 0.30000000000000004) do not end up in the
# saved CSV
merged_df['Total Dividends'] = (merged_df['Div. Earned'] + merged_df['payout']).round(2)
merged_df

In [112]:
# display the merged dataframe
merged_df

Unnamed: 0,type,Ticker,Div. Earned,symbol,payout,Total Dividends
0,TFSA,ARR,20.3,ARR,0.64,20.94
1,RRSP,ARR,30.38,ARR,1.12,31.5
2,TFSA,AVK,13.12,AVK,0.82,13.94
3,TFSA,CIF,1.73,CIF,0.85,2.58
4,TFSA,CIM,18.08,CIM,1.26,19.34
5,TFSA,CLM,24.66,CLM,1.23,25.89
6,RRSP,CLM,23.52,CLM,0.86,24.38
7,TFSA,DX,9.1,DX,0.91,10.01
8,TFSA,EHI,3.76,EHI,0.94,4.7
9,RRSP,EMD,14.78,EMD,0.76,15.54


In [None]:
# save to dividends_tallied.csv; index=False leaves the row index out of the file
merged_df.to_csv('dividends_tallied.csv', index=False)