In [1]:
#import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Getting links

In [2]:
#create response object
url = 'https://www.canadianletters.ca/collection-search?field_war_ref_target_id=All&title=&keys=&field_document_type_tid=242&field_partial_date_day=&field_partial_date_month=&field_partial_date_year='
response = requests.get(url)
response.status_code

200

In [3]:
response.url

'https://www.canadianletters.ca/collection-search?field_war_ref_target_id=All&title=&keys=&field_document_type_tid=242&field_partial_date_day=&field_partial_date_month=&field_partial_date_year='

In [4]:
soup = BeautifulSoup(response.content, "lxml")

In [5]:
table = soup.tbody

In [6]:
rows = table.find_all('tr')

In [7]:
links = []
collections = []

In [8]:
rows[0].find_all('td')[0].text.strip()

'Korea'

In [9]:
rows[0].find_all('td')[-1].a['href']

'https://www.canadianletters.ca/document-62269'

In [10]:
for row in rows:
    collections.append(row.find_all('td')[0].text.strip())
    links.append(row.find_all('td')[-1].a['href'])

In [11]:

for pagenum in range(1,99):
    next_page_url = url + '&page=' + str(pagenum)
    response = requests.get(next_page_url)
    if response.ok:
        soup = BeautifulSoup(response.content, "lxml")
        table = soup.tbody
        rows = table.find_all('tr')
        for row in rows:
            collections.append(row.find_all('td')[0].text.strip())
            links.append(row.find_all('td')[-1].a['href'])
    else:
        print('Problem with', next_page_url)
        
        

In [12]:
len(links)

9840

In [13]:
links[0]

'https://www.canadianletters.ca/document-62269'

In [14]:
links[-1]

'https://www.canadianletters.ca/document-68286'

In [15]:
len(set(links))

6454

## **Getting Letters**

In [16]:
response = requests.get(links[0])
response.ok

True

In [17]:
soup = BeautifulSoup(response.content, 'lxml')

In [18]:
h1 = soup.find_all('h1')

### Extracts date/sender/letter

In [19]:
name = h1[1].text.split(':')[0][:-7]

In [20]:
date = h1[1].text.split(':')[1][1:]


In [21]:
text = soup.find('div', class_='panel-pane pane-entity-field pane-node-body').find_all('p')

In [22]:
letter =""
for paragraph in text:
    letter += paragraph.text
    letter += '\n'

In [23]:
names = []
dates = []
letters = []

for link in links:
    response = requests.get(link)
    if response.ok:
        soup = BeautifulSoup(response.content, 'lxml')
        h1 = soup.find_all('h1')
        names.append(h1[1].text.split(':')[0][:-7])
        dates.append(h1[1].text.split(':')[1][1:])
        
        text = soup.find('div', class_='panel-pane pane-entity-field pane-node-body').find_all('p')
        letter = ""
        for paragraph in text:
            letter += paragraph.text
            letter += '\n'
        letters.append(letter)
        
dict = {'Name':names, 'Collection': collections, 'Date':dates, 'Letter':letters, 'Link':links}

In [27]:
df = pd.DataFrame(dict)
df.head()

Unnamed: 0,Name,Collection,Date,Letter,Link
0,"McKenzie, Edward Henry",Korea,1951 April 22nd,Dear Mom:\nI received your bottle of cough med...,https://www.canadianletters.ca/document-62269
1,"McKenzie, Edward Henry",Korea,1951 April 1st,April 1/51\nDear Mom:\nThought it was about ti...,https://www.canadianletters.ca/document-62270
2,"McKenzie, Edward Henry",Korea,1951 March 10th,UNITED STATES ARMYSPECIAL SERVICES\nMarch 10/5...,https://www.canadianletters.ca/document-62271
3,"McKenzie, Edward Henry",Korea,1951 February 28th,Feb 28/51\nDear Mom:\nStill going strong and f...,https://www.canadianletters.ca/document-62272
4,"McKenzie, Edward Henry",Korea,1951 January 31st,Jan 31/51\nDear Mom:\nReceived your letter of ...,https://www.canadianletters.ca/document-62273


In [25]:
df.to_csv('letters-raw.csv', index=False)

In [26]:
links

['https://www.canadianletters.ca/document-62269',
 'https://www.canadianletters.ca/document-62270',
 'https://www.canadianletters.ca/document-62271',
 'https://www.canadianletters.ca/document-62272',
 'https://www.canadianletters.ca/document-62273',
 'https://www.canadianletters.ca/document-62274',
 'https://www.canadianletters.ca/document-62275',
 'https://www.canadianletters.ca/document-62276',
 'https://www.canadianletters.ca/document-62277',
 'https://www.canadianletters.ca/document-62278',
 'https://www.canadianletters.ca/document-62241',
 'https://www.canadianletters.ca/document-62279',
 'https://www.canadianletters.ca/document-62242',
 'https://www.canadianletters.ca/document-62280',
 'https://www.canadianletters.ca/document-62243',
 'https://www.canadianletters.ca/document-62281',
 'https://www.canadianletters.ca/document-62244',
 'https://www.canadianletters.ca/document-62282',
 'https://www.canadianletters.ca/document-62268',
 'https://www.canadianletters.ca/document-3612',
 