# File download and extraction for NLP input
## This code will download all the Monetary Policy and Financial Stability reports between 1999 and 2022
## Full and summary text is extracted from these and output in csv format for use in NLP

In [4]:
# Import libraries
!pip install PyPDF2
!pip install requests
!pip install tqdm
!pip install pymupdf

import os
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
from PyPDF2 import PdfReader
from tqdm import tqdm
import fitz



In [5]:
# Read the publication dates of the Monetary Policy, Inflation and
# Financial Stability reports
report_dates = pd.read_csv(filepath_or_buffer='report_dates.csv')

# Show the loaded table as the cell output
report_dates

Unnamed: 0,FSR,MPR_IR,MPR,IR
0,06/12/2023,01/02/2024,01/02/2024,01/08/2019
1,12/07/2023,02/11/2023,02/11/2023,02/05/2019
2,13/12/2022,03/08/2023,03/08/2023,07/02/2019
3,05/07/2022,11/05/2023,11/05/2023,01/11/2018
4,13/12/2021,02/02/2023,02/02/2023,02/08/2018
...,...,...,...,...
120,,01/02/1994,,
121,,01/11/1993,,
122,,01/08/1993,,
123,,01/05/1993,,


# Monetary Policy/Inflation reports

## File download

In [6]:
# Download helper for the Bank of England report PDFs.
#
# The Bank of England has changed the location and file naming of these
# reports several times over the years and, in one case, filed a report under
# a different month, so several candidate URLs are tried for every report date.

def download_pdf(date, start_date_str, end_date_str, save_path, report_name,
                 max_retries=1, retry_delay=5, timeout=30):
    """Download the report PDF for every date that falls inside a date window.

    Args:
        date: Iterable of report dates as 'dd/mm/YYYY' strings.
        start_date_str: First date of the window (inclusive), 'dd/mm/YYYY'.
        end_date_str: Last date of the window (inclusive), 'dd/mm/YYYY'.
        save_path: Folder the PDFs are written to; created on first download.
        report_name: Report slug used both in the URLs and in the saved file
            name, e.g. 'inflation-report'.
        max_retries: Extra attempts per URL after a server error (500) or a
            failed request.
        retry_delay: Seconds to wait before each retry.
        timeout: Seconds to wait for the server before a request is abandoned.

    Returns:
        None. Progress and failures are reported with print().
    """
    start_date = datetime.strptime(start_date_str, '%d/%m/%Y')
    end_date = datetime.strptime(end_date_str, '%d/%m/%Y')
    url_root = "https://www.bankofengland.co.uk/-/media/boe/files"

    # The context manager closes the pooled connections when we are done.
    with requests.Session() as session:
        # Mimic a browser: downloads were unreliable with the default
        # python-requests User-Agent.
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        })

        for date_str in date:
            report_date = datetime.strptime(date_str, '%d/%m/%Y')

            # Ignore reports published outside the requested window
            if report_date < start_date or report_date > end_date:
                continue

            year = report_date.year
            month_str = report_date.strftime('%B').lower()
            short_month_str = report_date.strftime('%b').lower()

            file_path = os.path.join(save_path, f"{report_name}-{month_str}-{year}.pdf")

            # Never re-download a report that is already on disk
            if os.path.exists(file_path):
                print(f"Report for {month_str.capitalize()} {year} already exists. Skipping...")
                continue

            # Candidate locations, tried in this order
            url_formats = [
                f"{url_root}/{report_name}/{year}/{month_str}-{year}.pdf",
                f"{url_root}/{report_name}/{year}/{month_str}/{report_name}-{month_str}-{year}.pdf",
                f"{url_root}/{report_name}/{year}/{month_str}/{report_name}-{short_month_str}-{year}.pdf",
                f"{url_root}/{report_name}/{year}/{report_name}-{month_str}-{year}.pdf",
                f"{url_root}/{report_name}/{year}/{short_month_str}.pdf",
                f"{url_root}/{report_name}/{year}/{short_month_str}-{year}.pdf",
            ]

            success = False
            for url in url_formats:
                for retry_count in range(max_retries + 1):  # Include initial attempt in the count
                    print(f"Attempting to download from {url}")
                    try:
                        response = session.get(url, timeout=timeout)
                    except requests.RequestException as exc:
                        # Timeouts and connection problems are treated like a
                        # server error: retry, then fall through to the next URL.
                        if retry_count < max_retries:
                            print(f"Request failed ({exc}). Retrying after {retry_delay} seconds...")
                            time.sleep(retry_delay)
                            continue
                        print(f"Request failed ({exc}), but max retries reached. Moving to next URL format...")
                        break

                    status = response.status_code
                    if status == 200:
                        # A 200 can still carry an HTML page; saving that as
                        # .pdf would make every later run skip this report.
                        # The PDF header marker sits within the first 1024 bytes.
                        if b'%PDF' not in response.content[:1024]:
                            print(f"Response from {url} is not a PDF. Trying next URL format...")
                            break
                        os.makedirs(save_path, exist_ok=True)
                        with open(file_path, "wb") as f:
                            f.write(response.content)
                        print(f"Downloaded {month_str.capitalize()} {year} report.")
                        success = True
                        break

                    if status == 404:
                        # File not found, no need to retry this URL
                        print(f"File not found (404) at {url}. Trying next URL format...")
                        break

                    if status == 500:
                        if retry_count < max_retries:
                            print(f"Server error (500). Retrying after {retry_delay} seconds...")
                            time.sleep(retry_delay)
                            continue
                        print(f"Server error (500), but max retries reached. Moving to next URL format...")
                        break

                    # Any other status: log and move on without retrying
                    print(f"Failed with response code {status}. Moving to next URL format...")
                    break

                if success:
                    break  # No need to try more URLs for this date

            if not success:
                print(f"Failed to download report for {month_str.capitalize()} {year} after trying all URL formats.")


## File download
## Warning: if executed, these sections will download more than 360 MB of .pdf files in total.
## Existing files will be skipped.

In [4]:
# Between the start of our period of concern in 1999 and May 2015,
# Inflation Reports had a summary section that started with "Overview".
# Download the reports inside that window with download_pdf.

# Dates of the Inflation Reports to iterate over
ir_dates = report_dates['IR'].dropna().unique()

# Fetch one report per date
for ir_date in ir_dates:
    download_pdf(
        date=[ir_date],
        start_date_str='01/02/1999',
        end_date_str='31/05/2015',
        save_path='inflation_reports_02_99-05_15',
        report_name='inflation-report',
    )

Report for May 2015 already exists. Skipping...
Report for February 2015 already exists. Skipping...
Report for November 2014 already exists. Skipping...
Report for August 2014 already exists. Skipping...
Report for May 2014 already exists. Skipping...
Report for February 2014 already exists. Skipping...
Report for November 2013 already exists. Skipping...
Report for August 2013 already exists. Skipping...
Report for May 2013 already exists. Skipping...
Report for February 2013 already exists. Skipping...
Report for November 2012 already exists. Skipping...
Report for August 2012 already exists. Skipping...
Report for May 2012 already exists. Skipping...
Report for February 2012 already exists. Skipping...
Report for November 2011 already exists. Skipping...
Report for August 2011 already exists. Skipping...
Report for May 2011 already exists. Skipping...
Report for February 2011 already exists. Skipping...
Report for November 2010 already exists. Skipping...
Report for August 2010 alr

In [5]:
# From August 2015 until the last Inflation Report (August 2019),
# the summary section was headed "Monetary Policy Summary".

# Download the reports inside that window with download_pdf
for ir_date in ir_dates:
    download_pdf(
        date=[ir_date],
        start_date_str='01/08/2015',
        end_date_str='31/08/2019',
        save_path='inflation_reports_08_15-08_19',
        report_name='inflation-report',
    )

Report for August 2019 already exists. Skipping...
Report for May 2019 already exists. Skipping...
Report for February 2019 already exists. Skipping...
Report for November 2018 already exists. Skipping...
Report for August 2018 already exists. Skipping...
Report for May 2018 already exists. Skipping...
Report for February 2018 already exists. Skipping...
Report for November 2017 already exists. Skipping...
Report for August 2017 already exists. Skipping...
Report for May 2017 already exists. Skipping...
Report for February 2017 already exists. Skipping...
Report for November 2016 already exists. Skipping...
Report for August 2016 already exists. Skipping...
Report for May 2016 already exists. Skipping...
Report for February 2016 already exists. Skipping...
Report for November 2015 already exists. Skipping...
Report for August 2015 already exists. Skipping...


In [6]:
# Dates of the Monetary Policy Reports to iterate over
mpr_dates = report_dates['MPR'].dropna().unique()

# Download every Monetary Policy Report from the first one in 11/19
# to the end of our period of concern
for mpr_date in mpr_dates:
    download_pdf(
        date=[mpr_date],
        start_date_str='01/11/2019',
        end_date_str='30/11/2022',
        save_path='monetary_policy_reports_11_19-11_22',
        report_name='monetary-policy-report',
    )

Report for November 2022 already exists. Skipping...
Report for August 2022 already exists. Skipping...
Report for May 2022 already exists. Skipping...
Report for February 2022 already exists. Skipping...
Report for November 2021 already exists. Skipping...
Report for August 2021 already exists. Skipping...
Report for May 2021 already exists. Skipping...
Report for February 2021 already exists. Skipping...
Report for November 2020 already exists. Skipping...
Report for August 2020 already exists. Skipping...
Report for May 2020 already exists. Skipping...
Report for January 2020 already exists. Skipping...
Report for November 2019 already exists. Skipping...


## Text extraction

In [7]:
# Summary extraction with PdfReader from PyPDF2: collect the text between
# a start phrase and the start of the main body, indicated by "1"

def summary_pypdf(pdf_file_path, start_text):
    """Return the summary section of a PDF as a single string.

    Collection starts after the first line that equals ``start_text``
    (ignoring case and surrounding whitespace) and stops at the first
    following line that begins with "1". Pages containing "Contents" are
    skipped entirely. If no terminating line is found, everything collected
    up to the end of the document is returned.
    """
    heading = start_text.lower()
    collected = []
    in_summary = False

    with open(pdf_file_path, 'rb') as pdf_handle:
        for page in PdfReader(pdf_handle).pages:
            page_text = page.extract_text()

            # Table-of-contents pages would otherwise trigger a false start
            if "Contents" in page_text:
                continue

            for raw_line in page_text.split('\n'):
                stripped = raw_line.strip()

                if stripped.lower() == heading:
                    # Heading line itself is not part of the summary
                    in_summary = True
                elif in_summary:
                    if stripped.startswith("1"):
                        # Start of the main body: the summary is complete
                        return ' '.join(collected).strip()
                    collected.append(raw_line)

    # Reached the end of the document without meeting a line starting with "1"
    return ' '.join(collected).strip()


In [8]:
# Summary extraction with fitz from PyMuPDF: collect the text between
# a start phrase and the start of the main body, indicated by "1"

def summary_pymupdf(pdf_file_path, start_text):
    """Return the summary section of a PDF as a single string.

    Collection starts after the first line that equals ``start_text``
    (ignoring case and surrounding whitespace) and stops at the first
    following line that begins with "1". Pages containing "Contents" are
    skipped entirely. If no terminating line is found, everything collected
    up to the end of the document is returned.

    Args:
        pdf_file_path: Path of the PDF to read.
        start_text: Heading that marks the start of the summary.

    Returns:
        The summary text with lines joined by single spaces, stripped of
        leading/trailing whitespace ('' when the heading is never found).
    """
    heading = start_text.lower()
    summary_lines = []
    found_summary = False

    # The context manager closes the document on every exit path,
    # including when text extraction raises.
    with fitz.open(pdf_file_path) as doc:
        for page in doc:
            text = page.get_text()

            # Skip pages that contain "Contents" (e.g., table of contents)
            if "Contents" in text:
                continue

            for line in text.split('\n'):
                stripped = line.strip()

                # The heading line itself is not part of the summary
                if stripped.lower() == heading:
                    found_summary = True
                    continue

                if not found_summary:
                    continue

                # A line starting with "1" marks the start of the main body
                if stripped.startswith("1"):
                    return ' '.join(summary_lines).strip()

                summary_lines.append(line)

    return ' '.join(summary_lines).strip()


In [9]:
# Extract the summary of every PDF in a folder and save the results,
# calling one of the summary functions above.
# As this can be a long process, a progress bar is shown.

def extract_summary(folder_path, start_text):
    """Write '<folder_path>_summary.csv' with one Date/Summary row per PDF.

    The date is taken from the file name, whose last two dash-separated
    parts are the month and the year (e.g. '...-may-2015.pdf').
    """
    rows = []

    pdf_names = [name for name in os.listdir(folder_path) if name.endswith('.pdf')]

    for pdf_name in tqdm(pdf_names, desc="Processing PDFs"):
        # Month is the 2nd-to-last dash-separated part, year is the last
        name_parts = pdf_name.rsplit('-', 2)
        month_label = name_parts[-2].capitalize()
        year_label = name_parts[-1].partition('.')[0]
        report_label = f"{month_label} {year_label}"

        # To use PyPDF2 instead, call summary_pypdf with the same arguments
        pdf_path = os.path.join(folder_path, pdf_name)
        rows.append({'Date': report_label, 'Summary': summary_pymupdf(pdf_path, start_text)})

    # One row per report, written next to the source folder
    pd.DataFrame(rows).to_csv(f'{folder_path}_summary.csv', index=False)

    print("DataFrame saved to csv")


In [10]:
# Extract the summaries of the Monetary Policy Reports, which start
# with the phrase "monetary policy summary"
extract_summary(folder_path='monetary_policy_reports_11_19-11_22', start_text='monetary policy summary')

Processing PDFs: 100%|██████████| 13/13 [00:00<00:00, 17.66it/s]

DataFrame saved to csv





In [11]:
# Extract the summaries of the later Inflation Reports, which start
# with the phrase "monetary policy summary"
extract_summary(folder_path='inflation_reports_08_15-08_19', start_text='monetary policy summary')

Processing PDFs: 100%|██████████| 17/17 [00:00<00:00, 42.78it/s]

DataFrame saved to csv





In [12]:
# Extract the summaries of the earlier Inflation Reports, which start
# with the phrase "overview"

extract_summary(folder_path='inflation_reports_02_99-05_15', start_text='overview')

Processing PDFs: 100%|██████████| 66/66 [00:00<00:00, 74.29it/s]

DataFrame saved to csv





In [13]:
# Full-text extraction with fitz from PyMuPDF: return the entire text
# of a provided file

def full_pymupdf(pdf_file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file_path: Path of the PDF to read.

    Returns:
        The page texts separated by a newline, stripped of leading/trailing
        whitespace.
    """
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_file_path) as doc:
        page_texts = [page.get_text() for page in doc]

    # Pages are separated by a newline; use ''.join(...) instead to
    # concatenate them without a separator.
    return '\n'.join(page_texts).strip()

In [14]:
# Extract the full text of every PDF in a folder and save the results,
# calling the function above.
# As this can be a long process, a progress bar is shown.

def extract_full(folder_path):
    """Write '<folder_path>_full.csv' with one Date/Full_Text row per PDF.

    The date is taken from the file name, whose last two dash-separated
    parts are the month and the year (e.g. '...-may-2015.pdf').
    """
    rows = []

    pdf_names = [name for name in os.listdir(folder_path) if name.endswith('.pdf')]

    for pdf_name in tqdm(pdf_names, desc="Processing PDFs"):
        # Month is the 2nd-to-last dash-separated part, year is the last
        name_parts = pdf_name.rsplit('-', 2)
        month_label = name_parts[-2].capitalize()
        year_label = name_parts[-1].partition('.')[0]
        report_label = f"{month_label} {year_label}"

        pdf_path = os.path.join(folder_path, pdf_name)
        rows.append({'Date': report_label, 'Full_Text': full_pymupdf(pdf_path)})

    # One row per report, written next to the source folder
    pd.DataFrame(rows).to_csv(f'{folder_path}_full.csv', index=False)
    print("DataFrame saved to csv")


In [15]:
# Extract the full text of the Monetary Policy Reports
extract_full(folder_path='monetary_policy_reports_11_19-11_22')

Processing PDFs: 100%|██████████| 13/13 [00:02<00:00,  5.51it/s]

DataFrame saved to csv





In [16]:
# Extract the full text of the later Inflation Reports
# (August 2015 to August 2019)
extract_full(folder_path='inflation_reports_08_15-08_19')

Processing PDFs: 100%|██████████| 17/17 [00:03<00:00,  5.66it/s]


DataFrame saved to csv


In [17]:
# Extract the full text of the earlier Inflation Reports
# (February 1999 to May 2015)
extract_full(folder_path='inflation_reports_02_99-05_15')

Processing PDFs: 100%|██████████| 66/66 [00:08<00:00,  8.12it/s]


DataFrame saved to csv


## Clean up and export

In [18]:
# Load the three full-text output files

ir1, ir2, mpr = map(pd.read_csv, (
    'inflation_reports_02_99-05_15_full.csv',
    'inflation_reports_08_15-08_19_full.csv',
    'monetary_policy_reports_11_19-11_22_full.csv',
))

# Stack them into a single dataframe with a fresh index
monetary_policy_reports = pd.concat(objs=[ir1, ir2, mpr], ignore_index=True)

# Show the combined dataframe as the cell output
monetary_policy_reports

Unnamed: 0,Date,Full_Text
0,August 1999,Inflation Report\nAugust 1999\nThe Inflation R...
1,August 2000,Inflation Report\nAugust 2000\nThe Inflation R...
2,August 2001,Inflation Report\nAugust 2001\nThe Inflation R...
3,August 2002,Inflation Report\nAugust 2002\nThe Inflation R...
4,August 2003,Inflation Report\nAugust 2003\nThe Inflation R...
...,...,...
91,May 2022,Bank of England\nMonetary Policy Report \nMone...
92,November 2019,Monetary Policy Report\nNovember 2019\nMonetar...
93,November 2020,Monetary Policy Committee\nMonetary Policy Rep...
94,November 2021,Monetary Policy Committee\nMonetary Policy Rep...


In [19]:
# Original publication dates of all Monetary Policy / Inflation reports
mpr_dates = pd.unique(report_dates['MPR_IR'].dropna())
mpr_dates

array(['01/02/2024', '02/11/2023', '03/08/2023', '11/05/2023',
       '02/02/2023', '03/11/2022', '04/08/2022', '05/05/2022',
       '03/02/2022', '04/11/2021', '05/08/2021', '06/05/2021',
       '04/02/2021', '05/11/2020', '06/08/2020', '07/05/2020',
       '30/01/2020', '07/11/2019', '01/08/2019', '02/05/2019',
       '07/02/2019', '01/11/2018', '02/08/2018', '10/05/2018',
       '08/02/2018', '02/11/2017', '03/08/2017', '11/05/2017',
       '02/02/2017', '03/11/2016', '04/08/2016', '12/05/2016',
       '04/02/2016', '05/11/2015', '06/08/2015', '13/05/2015',
       '12/02/2015', '12/11/2014', '13/08/2014', '14/05/2014',
       '12/02/2014', '13/11/2013', '07/08/2013', '15/05/2013',
       '13/02/2013', '14/11/2012', '08/08/2012', '16/05/2012',
       '15/02/2012', '16/11/2011', '10/08/2011', '11/05/2011',
       '16/02/2011', '10/11/2010', '11/08/2010', '12/05/2010',
       '10/02/2010', '11/11/2009', '12/08/2009', '13/05/2009',
       '11/02/2009', '12/11/2008', '13/08/2008', '14/05

In [20]:
# Parse mpr_dates so a 'Month Year' label can be derived from each date
mpr_dates_dt = pd.to_datetime(mpr_dates, format='%d/%m/%Y')

# 'Month Year' label for every original date
month_year = mpr_dates_dt.strftime('%B %Y')

# Lookup from 'Month Year' to the original date string (kept as a string so
# it can replace the label directly)
date_mapping = {label: original for label, original in zip(month_year, mpr_dates)}

# Swap the 'Month Year' labels in monetary_policy_reports for the original dates
monetary_policy_reports['Date'] = monetary_policy_reports['Date'].map(date_mapping)

# Show the dataframe as the cell output
monetary_policy_reports


Unnamed: 0,Date,Full_Text
0,11/08/1999,Inflation Report\nAugust 1999\nThe Inflation R...
1,09/08/2000,Inflation Report\nAugust 2000\nThe Inflation R...
2,08/08/2001,Inflation Report\nAugust 2001\nThe Inflation R...
3,07/08/2002,Inflation Report\nAugust 2002\nThe Inflation R...
4,13/08/2003,Inflation Report\nAugust 2003\nThe Inflation R...
...,...,...
91,05/05/2022,Bank of England\nMonetary Policy Report \nMone...
92,07/11/2019,Monetary Policy Report\nNovember 2019\nMonetar...
93,05/11/2020,Monetary Policy Committee\nMonetary Policy Rep...
94,04/11/2021,Monetary Policy Committee\nMonetary Policy Rep...


In [21]:
# Replace every newline character "\n" in the report text with a space
monetary_policy_reports['Full_Text'] = monetary_policy_reports['Full_Text'].str.replace(pat='\n', repl=' ')

# Show the dataframe as the cell output
monetary_policy_reports

Unnamed: 0,Date,Full_Text
0,11/08/1999,Inflation Report August 1999 The Inflation Rep...
1,09/08/2000,Inflation Report August 2000 The Inflation Rep...
2,08/08/2001,Inflation Report August 2001 The Inflation Rep...
3,07/08/2002,Inflation Report August 2002 The Inflation Rep...
4,13/08/2003,Inflation Report August 2003 The Inflation Rep...
...,...,...
91,05/05/2022,Bank of England Monetary Policy Report Moneta...
92,07/11/2019,Monetary Policy Report November 2019 Monetary ...
93,05/11/2020,Monetary Policy Committee Monetary Policy Repo...
94,04/11/2021,Monetary Policy Committee Monetary Policy Repo...


In [22]:
# Sort the reports chronologically.
# The dates are parsed to datetime so the sort is by time rather than by
# text, then written back in the dd/mm/YYYY string format and the index is
# renumbered.
monetary_policy_reports = (
    monetary_policy_reports
    .assign(Date=lambda df: pd.to_datetime(df['Date'], format='%d/%m/%Y'))
    .sort_values(by='Date')
    .assign(Date=lambda df: df['Date'].dt.strftime('%d/%m/%Y'))
    .reset_index(drop=True)
)

# Show the dataframe as the cell output
monetary_policy_reports

Unnamed: 0,Date,Full_Text
0,10/02/1999,Inflation Report February 1999 The Inflation R...
1,12/05/1999,Inflation Report May 1999 The Inflation Report...
2,11/08/1999,Inflation Report August 1999 The Inflation Rep...
3,10/11/1999,Inflation Report November 1999 The Inflation R...
4,17/02/2000,Inflation Report February 2000 The Inflation R...
...,...,...
91,04/11/2021,Monetary Policy Committee Monetary Policy Repo...
92,03/02/2022,Monetary Policy Committee Monetary Policy Repo...
93,05/05/2022,Bank of England Monetary Policy Report Moneta...
94,04/08/2022,Bank of England Monetary Policy Report Moneta...


In [23]:
# Write the cleaned full-text reports to disk, without the index column
monetary_policy_reports.to_csv(path_or_buf='monetary_policy_reports.csv', index=False)

In [24]:
# Condensed repeat of the full-text clean-up above, applied to the summaries.
# NOTE: relies on date_mapping as built from the MPR/IR dates in an earlier
# cell; the financial stability section below rebinds that name.

# Load the three summary output files
ir1, ir2, mpr = map(pd.read_csv, (
    'inflation_reports_02_99-05_15_summary.csv',
    'inflation_reports_08_15-08_19_summary.csv',
    'monetary_policy_reports_11_19-11_22_summary.csv',
))

# Stack them into a single dataframe with a fresh index
monetary_policy_summary = pd.concat(objs=[ir1, ir2, mpr], ignore_index=True)

# Swap the 'Month Year' labels for the original dates
monetary_policy_summary['Date'] = monetary_policy_summary['Date'].map(date_mapping)

# Replace every newline character "\n" in the summary text with a space
monetary_policy_summary['Summary'] = monetary_policy_summary['Summary'].str.replace(pat='\n', repl=' ')

# Sort chronologically: parse the dates, sort, write them back as
# dd/mm/YYYY strings and renumber the index
monetary_policy_summary = (
    monetary_policy_summary
    .assign(Date=lambda df: pd.to_datetime(df['Date'], format='%d/%m/%Y'))
    .sort_values(by='Date')
    .assign(Date=lambda df: df['Date'].dt.strftime('%d/%m/%Y'))
    .reset_index(drop=True)
)

# Write the cleaned summaries to disk, without the index column
monetary_policy_summary.to_csv(path_or_buf='monetary_policy_summary.csv', index=False)

# FINANCIAL STABILITY
## Same process as above, but no summary

In [25]:
# Dates of the Financial Stability Reports to iterate over
fsr_dates = pd.unique(report_dates['FSR'].dropna())
fsr_dates

array(['06/12/2023', '12/07/2023', '13/12/2022', '05/07/2022',
       '13/12/2021', '13/07/2021', '11/12/2020', '06/08/2020',
       '07/05/2020', '16/12/2019', '11/07/2019', '28/11/2018',
       '27/06/2018', '28/11/2017', '27/06/2017', '30/11/2016',
       '05/07/2016', '01/12/2015', '01/07/2015', '16/12/2014',
       '26/06/2014', '28/11/2013', '26/06/2013', '29/11/2012',
       '29/06/2012', '01/12/2011', '24/06/2011', '17/12/2010',
       '25/06/2010', '18/12/2009', '26/06/2009', '28/10/2008',
       '01/05/2008', '25/10/2007', '26/04/2007', '12/07/2006',
       '13/12/2005', '22/06/2005', '13/12/2004', '28/06/2004',
       '11/12/2003', '26/06/2003', '12/12/2002', '27/06/2002',
       '13/12/2001', '28/06/2001', '14/12/2000', '29/06/2000',
       '26/11/1999', '18/06/1999', '16/11/1998', '18/05/1998',
       '16/10/1997', '03/03/1997', '31/10/1996'], dtype=object)

In [7]:
# The FSR for July 2006 is actually filed under June 2006, so it is
# downloaded separately before the rest are fetched automatically.

# Present a browser User-Agent, otherwise the download fails

session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'

In [8]:
# URL of the report published on 12th July 2006, which is filed as June
url = 'https://www.bankofengland.co.uk/-/media/boe/files/financial-stability-report/2006/june-2006.pdf'

folder_name = 'financial_stability_reports'
max_retries = 1
retry_delay = 5
retry_count = 0

# Create the folder if it doesn't exist
os.makedirs(folder_name, exist_ok=True)

# Save the file under the correct month so that it lines up with the
# report dates and is skipped by the bulk download
file_name = os.path.join(folder_name, 'financial-stability-report-july-2006.pdf')

# Check if the file already exists in the folder
if os.path.exists(file_name):
    print('File already exists. Skipping download.')
else:
    while True:
        # Use the browser-like session created above (a bare requests.get
        # would ignore its User-Agent) and never wait indefinitely
        response = session.get(url, timeout=30)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Write the PDF content to disk in binary mode
            with open(file_name, 'wb') as f:
                f.write(response.content)
            print('File downloaded successfully.')
            break
        elif response.status_code == 500:
            # Server error, retrying after a delay, up to the retry limit
            if retry_count < max_retries:
                print(f"Server error (500). Retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_count += 1
            else:
                # Only one URL is tried here, so there is nothing to fall back to
                print("Server error (500), but max retries reached. Giving up.")
                break
        else:
            print(f'Failed to download the file. Status code: {response.status_code}')
            break

File downloaded successfully.


In [28]:
# Download all the Financial Stability Reports with download_pdf
for fsr_date in fsr_dates:
    download_pdf(
        date=[fsr_date],
        start_date_str='01/06/1999',
        end_date_str='31/12/2022',
        save_path='financial_stability_reports',
        report_name='financial-stability-report',
    )

Report for December 2022 already exists. Skipping...
Report for July 2022 already exists. Skipping...
Report for December 2021 already exists. Skipping...
Report for July 2021 already exists. Skipping...
Report for December 2020 already exists. Skipping...
Report for August 2020 already exists. Skipping...
Report for May 2020 already exists. Skipping...
Report for December 2019 already exists. Skipping...
Report for July 2019 already exists. Skipping...
Report for November 2018 already exists. Skipping...
Report for June 2018 already exists. Skipping...
Report for November 2017 already exists. Skipping...
Report for June 2017 already exists. Skipping...
Report for November 2016 already exists. Skipping...
Report for July 2016 already exists. Skipping...
Report for December 2015 already exists. Skipping...
Report for July 2015 already exists. Skipping...
Report for December 2014 already exists. Skipping...
Report for June 2014 already exists. Skipping...
Report for November 2013 already

In [29]:
# Extract the full text of the Financial Stability Reports
extract_full(folder_path='financial_stability_reports')

Processing PDFs: 100%|██████████| 48/48 [00:11<00:00,  4.17it/s]


DataFrame saved to csv


In [30]:
# Read the saved csv back in and show it as a check
fsr = pd.read_csv(filepath_or_buffer='financial_stability_reports_full.csv')
fsr

Unnamed: 0,Date,Full_Text
0,April 2007,Financial Stability Report\nApril 2007 | Issue...
1,August 2020,Financial Policy Committee\nFinancial Stabilit...
2,December 2000,Financial Stability Review\nDecember 2000\nBan...
3,December 2001,Financial Stability Review\nDecember 2001\nBan...
4,December 2002,Financial Stability Review\nDecember 2002\nBan...
5,December 2003,Financial Stability Review\nDecember 2003\nBan...
6,December 2004,Financial Stability Review\nDecember 2004\nBan...
7,December 2005,Financial Stability Review\nDecember 2005\nBan...
8,December 2009,Financial Stability Report\nDecember 2009 | Is...
9,December 2010,Financial Stability Report\nDecember 2010 | Is...


In [31]:
# Replace every newline character "\n" in the report text with a space
fsr['Full_Text'] = fsr['Full_Text'].str.replace(pat='\n', repl=' ')

# Show the dataframe after removing the newline characters
fsr

Unnamed: 0,Date,Full_Text
0,April 2007,Financial Stability Report April 2007 | Issue ...
1,August 2020,Financial Policy Committee Financial Stability...
2,December 2000,Financial Stability Review December 2000 Bank ...
3,December 2001,Financial Stability Review December 2001 Bank ...
4,December 2002,Financial Stability Review December 2002 Bank ...
5,December 2003,Financial Stability Review December 2003 Bank ...
6,December 2004,Financial Stability Review December 2004 Bank ...
7,December 2005,Financial Stability Review December 2005 Bank ...
8,December 2009,Financial Stability Report December 2009 | Iss...
9,December 2010,Financial Stability Report December 2010 | Iss...


In [32]:
# Put the original dates back in

#Convert fsr_dates to datetime for easier manipulation
fsr_dates_dt = pd.to_datetime(fsr_dates, format='%d/%m/%Y')

# Create a mapping from month-year to the original date
# Ceate a month-year representation of mpr_dates
month_year = fsr_dates_dt.strftime('%B %Y')
# Create a dictionary where key is 'Month Year' and value is the original date (as string for direct replacement)
date_mapping = dict(zip(month_year, fsr_dates))

# Replace the dates in monetary_policy_reports using the mapping
fsr['Date'] = fsr['Date'].map(date_mapping)

fsr

Unnamed: 0,Date,Full_Text
0,26/04/2007,Financial Stability Report April 2007 | Issue ...
1,06/08/2020,Financial Policy Committee Financial Stability...
2,14/12/2000,Financial Stability Review December 2000 Bank ...
3,13/12/2001,Financial Stability Review December 2001 Bank ...
4,12/12/2002,Financial Stability Review December 2002 Bank ...
5,11/12/2003,Financial Stability Review December 2003 Bank ...
6,13/12/2004,Financial Stability Review December 2004 Bank ...
7,13/12/2005,Financial Stability Review December 2005 Bank ...
8,18/12/2009,Financial Stability Report December 2009 | Iss...
9,17/12/2010,Financial Stability Report December 2010 | Iss...


In [33]:
# Sort by date

# Parse 'Date' into datetimes so the rows order chronologically
# rather than alphabetically on the dd/mm/yyyy strings
fsr['Date'] = pd.to_datetime(fsr['Date'], format='%d/%m/%Y')

# Order the rows by 'Date' and renumber the index from 0 in a single chain
fsr = fsr.sort_values(by='Date').reset_index(drop=True)

# Turn 'Date' back into dd/mm/yyyy strings
fsr['Date'] = fsr['Date'].dt.strftime('%d/%m/%Y')

# Display the dataframe
fsr

Unnamed: 0,Date,Full_Text
0,18/06/1999,Financial Stability Review June 1999 Bank of E...
1,26/11/1999,Financial Stability Review November 1999 Bank ...
2,29/06/2000,Financial Stability Review June 2000 Bank of E...
3,14/12/2000,Financial Stability Review December 2000 Bank ...
4,28/06/2001,Financial Stability Review June 2001 Bank of E...
5,13/12/2001,Financial Stability Review December 2001 Bank ...
6,27/06/2002,Financial Stability Review June 2002 Bank of E...
7,12/12/2002,Financial Stability Review December 2002 Bank ...
8,26/06/2003,Financial Stability Review June 2003 Bank of E...
9,11/12/2003,Financial Stability Review December 2003 Bank ...


In [34]:
# Save the file
# index=False keeps the DataFrame's row index out of the csv
fsr.to_csv('financial_stability_reports.csv', index=False)

In [35]:
# Load the files back to check
checkfsr =  pd.read_csv('financial_stability_reports.csv')
checkmpr = pd.read_csv('monetary_policy_reports.csv')

# Set display width to 400
pd.set_option('display.width', 400)

In [36]:
# View the FSR data reloaded from 'financial_stability_reports.csv'
checkfsr

Unnamed: 0,Date,Full_Text
0,18/06/1999,Financial Stability Review June 1999 Bank of E...
1,26/11/1999,Financial Stability Review November 1999 Bank ...
2,29/06/2000,Financial Stability Review June 2000 Bank of E...
3,14/12/2000,Financial Stability Review December 2000 Bank ...
4,28/06/2001,Financial Stability Review June 2001 Bank of E...
5,13/12/2001,Financial Stability Review December 2001 Bank ...
6,27/06/2002,Financial Stability Review June 2002 Bank of E...
7,12/12/2002,Financial Stability Review December 2002 Bank ...
8,26/06/2003,Financial Stability Review June 2003 Bank of E...
9,11/12/2003,Financial Stability Review December 2003 Bank ...


In [37]:
# View the MPR data reloaded from 'monetary_policy_reports.csv'
checkmpr

Unnamed: 0,Date,Full_Text
0,10/02/1999,Inflation Report February 1999 The Inflation R...
1,12/05/1999,Inflation Report May 1999 The Inflation Report...
2,11/08/1999,Inflation Report August 1999 The Inflation Rep...
3,10/11/1999,Inflation Report November 1999 The Inflation R...
4,17/02/2000,Inflation Report February 2000 The Inflation R...
...,...,...
91,04/11/2021,Monetary Policy Committee Monetary Policy Repo...
92,03/02/2022,Monetary Policy Committee Monetary Policy Repo...
93,05/05/2022,Bank of England Monetary Policy Report Moneta...
94,04/08/2022,Bank of England Monetary Policy Report Moneta...
