https://www.geeksforgeeks.org/working-with-pdf-files-in-python/

In [138]:
import re
import pandas as pd
from pypdf import PdfReader

In [139]:
# Statement to parse; point this at a different file to process another month.
STATEMENT_PATH = "asda_statements/February 2024.pdf"
reader = PdfReader(STATEMENT_PATH)

# Regex building blocks. One transaction row is expected to read:
#   <transaction date> <posting date> <description> £<amount>
# Each piece carries one capture group, so re.findall(row_pattern, ...) yields
# 4-tuples in that order.

# Day of month followed by a three-letter month abbreviation, e.g. "22 Jan".
date_pattern            = r"(\d{1,2}\s*[a-zA-Z]{3})"

# Anything up to the next "£" or "-". The quantifier is lazy (+?) so that when
# several whitespace characters precede the amount they are consumed by the
# \s+ separator instead of being captured as trailing whitespace in the
# description. The set of rows matched is unchanged.
# NOTE(review): descriptions containing "-" cannot match; presumably this is
# meant to skip negative/credit rows -- confirm against a real statement.
description_pattern     = r"([^£-]+?)"

# Pounds and pence without the currency sign, e.g. "5.50".
# NOTE(review): amounts printed with a thousands separator ("£1,234.56") do not
# match, so such rows would be dropped -- confirm whether statements use one.
money_pattern           = r"£(\d+\.\d{2})"

# Full row: the four pieces separated by runs of whitespace.
row_pattern             = r"\s+".join([date_pattern, date_pattern, description_pattern, money_pattern])

# Four-digit year taken from the "Statement date: <day> <month> <year>" line.
year_pattern            = r"Statement\s*date:\s*\d{1,2}\s*[a-zA-Z]+\s*(\d{4})"

# Stated total that follows the "Total for new transactions" label.
total_amount_pattern    = r"Total for new transactions\s+£(\d+\.\d{2})"

# Scratch preview of the first page's extracted text. Both `page` and `text`
# are rebound by the page loop in the next cell, so the visible cells below do
# not depend on the values assigned here.
page = reader.pages[0] 
text = page.extract_text()

In [140]:
# Walk every page once, gathering transaction rows together with the statement
# year and the stated total. For the year and the total, the first match on a
# page is used and a later page that also matches overrides an earlier one.
row_re = re.compile(row_pattern)
year_re = re.compile(year_pattern)
total_re = re.compile(total_amount_pattern)

rows, year, total_amount = [], None, None
for page in reader.pages:
    # Newlines become spaces, so captured descriptions and the literal spaces
    # in the total pattern are not affected by where the extracted text wraps.
    text = page.extract_text().replace("\n", " ")
    rows.extend(row_re.findall(text))

    year_matches = year_re.findall(text)
    if year_matches:
        year = year_matches[0]

    total_matches = total_re.findall(text)
    if total_matches:
        total_amount = total_matches[0]

In [141]:
# Build the transactions table from the regex matches and give each column a
# proper dtype (datetimes for the two dates, float for the amount).
if year is None:
    # Without this guard the f-strings below would append the literal text
    # " None" to every date and fail later with a confusing parse error.
    raise ValueError("Statement year was not found in the PDF text; cannot build full dates")

col_names = ("transaction_date", "posting_date", "description", "amount")
DATE_FORMAT = "%d %b %Y"  # e.g. "22 Jan 2024"

# NOTE(review): every row receives the statement's year. A statement dated in
# January that lists December transactions would have them dated one year too
# late -- confirm whether a statement can span a year boundary.
df = pd.DataFrame(rows, columns=col_names).assign(
    transaction_date=lambda d: pd.to_datetime(d["transaction_date"] + f" {year}", format=DATE_FORMAT),
    posting_date=lambda d: pd.to_datetime(d["posting_date"] + f" {year}", format=DATE_FORMAT),
    amount=lambda d: pd.to_numeric(d["amount"]),
)

In [142]:
# Show the column dtypes to confirm the date and numeric conversions above.
df.dtypes

transaction_date    datetime64[ns]
posting_date        datetime64[ns]
description                 object
amount                     float64
dtype: object

In [144]:
# Sanity check: the parsed rows must add up to the total printed on the statement.
if total_amount is None:
    # Without this guard float(None) below fails with an unhelpful TypeError.
    raise ValueError("Statement total ('Total for new transactions') was not found in the PDF text")

calculated_total_amount = df.amount.sum()

# Compare in whole pence. Summing binary floats can leave a tiny representation
# error (0.1 + 0.2 != 0.3), so exact float equality could raise even when the
# totals agree to the penny.
calculated_pence = round(float(calculated_total_amount) * 100)
stated_pence = round(float(total_amount) * 100)

if calculated_pence != stated_pence:
    raise ValueError(f"Calculated total {calculated_total_amount} does not equal given total {total_amount}")

In [145]:
# Display the parsed transactions table.
df

Unnamed: 0,transaction_date,posting_date,description,amount
0,2024-01-22,2024-01-23,TFL TRAVEL CH TFL.GOV.UK/CP,5.5
1,2024-01-24,2024-01-25,ALDI 71 775 READING,3.55
2,2024-01-27,2024-01-29,ALDI 71 775 READING,1.58
3,2024-01-27,2024-01-29,ALDI 71 775 READING,11.94
4,2024-01-29,2024-01-30,SUMUP *SOUTH READING JUDREADING,6.5
5,2024-01-30,2024-01-31,ALDI 71 775 READING,12.95
6,2024-02-01,2024-02-02,TFL TRAVEL CH TFL.GOV.UK/CP,25.75
7,2024-02-02,2024-02-05,ALDI 71 775 READING,12.95
8,2024-02-05,2024-02-06,SUMUP *SOUTH READING JUDREADING,6.5
9,2024-02-06,2024-02-07,ALDI 71 775 READING,5.73
