In [59]:
import camelot
import pandas as pd

# Extract tables from all pages using the 'stream' flavor
tables = camelot.read_pdf("Bank-of-Scotland-classic-vantage-3.pdf", pages="all", flavor="stream")

# Combine all tables into a single DataFrame
dfs = [table.df for table in tables]
df = pd.concat(dfs, ignore_index=True)

# Drop any empty rows or columns if needed
df_cleaned = df.dropna(how='all', axis=0)
df_cleaned = df_cleaned.dropna(how='all', axis=1)  # Optionally drop empty columns

# Reset index and print the cleaned DataFrame
df_cleaned.reset_index(drop=True, inplace=True)
print(df_cleaned)

                                                     0                   1  \
0                                               Column              Column   
1                                                Date.        Description.   
2                                                 Date         Description   
3                                                                            
4                                           02 Jun 21.  REPORTCURVE.CO.UK.   
..                                                 ...                 ...   
135  If you think something is incorrect, please co...                       
136  Bank of Scotland plc is registered in Scotland...                       
137  Prudential Regulation Authority and regulated ...                       
138  registration no. 169628. We adhere to The Stan...                       
139                   www.lendingstandardsboard.org.uk                       

          2                     3               4             5

In [60]:
df_cleaned = df_cleaned[df_cleaned[1].notna() & (df_cleaned[1] != "")]
df = df_cleaned.drop(0).reset_index(drop=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,Date.,Description.,Type.,Money In (£).,Money Out (£).,Balance (£).,,
1,Date,Description,Type,,Money Out (£),Balance (£),,
2,02 Jun 21.,REPORTCURVE.CO.UK.,DEB.,,0.11.,-42.91.,,
3,Date,Description,Type,Money In (£),,Balance (£),,
4,16 Jun 21.,DUMFRIES HIGH ST (.,DEP.,727.28.,,684.37.,,
...,...,...,...,...,...,...,...,...
82,30 Jun 21.,CONNOR GRIFFIN.,FPO.,,17.00.,0.19.,,
83,BGC.,Bank Giro Credit.,BP.,Bill Payments.,CHG. Charge.,,CHQ. Cheque.,
84,DEP.,Deposit.,FEE.,Fixed Service,FPI.\nFaster Payment In.,FPO.,,Faster Payment Out.
85,MPI.,Mobile Payment In.,,MPO. Mobile Payment Out.,PAY.\nPayment.,SO.,Standing Order.,


In [61]:
# Drop columns with index 6 and 7
df = df.drop([6,7], axis=1)

# Reset index if needed
df.reset_index(drop=True, inplace=True)

In [62]:
df = df[df[0].str.strip() != "Date"]
# Set the first row as column names
df.columns = df.iloc[0]

# Drop the first row
df = df[1:]

# Reset index
df.reset_index(drop=True, inplace=True)

# Clean any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

In [63]:
df

Unnamed: 0,Date.,Description.,Type.,Money In (£).,Money Out (£).,Balance (£).
0,02 Jun 21.,REPORTCURVE.CO.UK.,DEB.,,0.11.,-42.91.
1,16 Jun 21.,DUMFRIES HIGH ST (.,DEP.,727.28.,,684.37.
2,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,674.37.
3,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,664.37.
4,18 Jun 21.,LEE CURRIE.,FPO.,,15.00.,649.37.
5,18 Jun 21.,Sweet Sadie s.,DEB.,,10.80.,638.57.
6,18 Jun 21.,NEW ORIENTAL.,DEB.,,11.30.,627.27.
7,18 Jun 21.,ST MICHAELS SERVIC.,DEB.,,52.94.,574.33.
8,18 Jun 21.,LEE CURRIE.,FPO.,,409.00.,165.33.
9,18 Jun 21.,LEE CURRIE.,FPI.,50.00.,,215.33.


In [64]:
# Specify new column names
new_column_names = ["Date", "Description", "Type", "Money In", "Money Out", "Balance"]

# Assign new column names to the DataFrame
df.columns = new_column_names

# Optionally, drop any rows that are not needed (e.g., headers or footers)
df = df.drop(0).reset_index(drop=True)
df.head(10)

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,16 Jun 21.,DUMFRIES HIGH ST (.,DEP.,727.28.,,684.37.
1,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,674.37.
2,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,664.37.
3,18 Jun 21.,LEE CURRIE.,FPO.,,15.00.,649.37.
4,18 Jun 21.,Sweet Sadie s.,DEB.,,10.80.,638.57.
5,18 Jun 21.,NEW ORIENTAL.,DEB.,,11.30.,627.27.
6,18 Jun 21.,ST MICHAELS SERVIC.,DEB.,,52.94.,574.33.
7,18 Jun 21.,LEE CURRIE.,FPO.,,409.00.,165.33.
8,18 Jun 21.,LEE CURRIE.,FPI.,50.00.,,215.33.
9,18 Jun 21.,LNK BGL LTD.,CPT.,,50.00.,165.33.


In [65]:
# Remove rows where column 0 has the value "Date" (with inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Column"].index, inplace=True)
df.drop(df[df['Money In'].str.strip() == "Money In (£)\nblank."].index, inplace=True)
df.drop(df[df['Money Out'].str.strip() == "Money Out (£)\nblank."].index, inplace=True)
# Reset the index to maintain continuous indexing after dropping rows
df.reset_index(drop=True, inplace=True)

In [66]:
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,16 Jun 21.,DUMFRIES HIGH ST (.,DEP.,727.28.,,684.37.
1,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,674.37.
2,17 Jun 21.,CONNOR GRIFFIN.,FPO.,,10.00.,664.37.
3,18 Jun 21.,LEE CURRIE.,FPO.,,15.00.,649.37.
4,18 Jun 21.,Sweet Sadie s.,DEB.,,10.80.,638.57.
5,18 Jun 21.,NEW ORIENTAL.,DEB.,,11.30.,627.27.
6,18 Jun 21.,ST MICHAELS SERVIC.,DEB.,,52.94.,574.33.
7,18 Jun 21.,LEE CURRIE.,FPO.,,409.00.,165.33.
8,18 Jun 21.,LEE CURRIE.,FPI.,50.00.,,215.33.
9,18 Jun 21.,LNK BGL LTD.,CPT.,,50.00.,165.33.


In [67]:
# Function to validate and convert dates
def is_valid_date(date_str):
    try:
        pd.to_datetime(date_str, format='%d %b %y')
        return True
    except ValueError:
        return False

# Remove trailing periods and check date validity
df['Date'] = df['Date'].str.replace('.', '', regex=False)
df['Date_valid'] = df['Date'].apply(is_valid_date)

# Filter out rows with invalid dates
df = df[df['Date_valid']].drop(columns='Date_valid')
df.tail(10)

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
29,21 Jun 21,LNK BGL LTD.,CPT.,,30.00.,210.42.
30,22 Jun 21,TESCO PAYAT PUMP 4.,DEB.,,37.18.,173.24.
31,22 Jun 21,CARLISLE.,DEB.,,40.00.,133.24.
32,22 Jun 21,CARLISLE.,DEB.,,94.00.,39.24.
33,23 Jun 21,MARCHBANK BAKERS.,DEB.,,4.80.,34.44.
34,24 Jun 21,REPORTCURVE.CO.UK.,DEB.,,29.25.,5.19.
35,28 Jun 21,TUI UK LTD.,DD.,,149.45.,-144.26.
36,28 Jun 21,RETURNED DD.,,149.45.,,5.19.
37,30 Jun 21,LEE CURRIE.,FPI.,12.00.,,17.19.
38,30 Jun 21,CONNOR GRIFFIN.,FPO.,,17.00.,0.19.


In [68]:
# Function to clean trailing periods from numerical values
def clean_numeric_values(value):
    if isinstance(value, str):
        return value.strip('.')
    return value

# Apply the cleaning function to relevant columns
columns_to_clean = ["Money In", "Money Out", "Balance"]
for column in columns_to_clean:
    df[column] = df[column].apply(clean_numeric_values)

# Convert columns to numeric types if needed
df["Money In"] = pd.to_numeric(df["Money In"], errors='coerce')
df["Money Out"] = pd.to_numeric(df["Money Out"], errors='coerce')
df["Balance"] = pd.to_numeric(df["Balance"], errors='coerce')

In [69]:
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,16 Jun 21,DUMFRIES HIGH ST (.,DEP.,727.28,,684.37
1,17 Jun 21,CONNOR GRIFFIN.,FPO.,,10.0,674.37
2,17 Jun 21,CONNOR GRIFFIN.,FPO.,,10.0,664.37
3,18 Jun 21,LEE CURRIE.,FPO.,,15.0,649.37
4,18 Jun 21,Sweet Sadie s.,DEB.,,10.8,638.57
5,18 Jun 21,NEW ORIENTAL.,DEB.,,11.3,627.27
6,18 Jun 21,ST MICHAELS SERVIC.,DEB.,,52.94,574.33
7,18 Jun 21,LEE CURRIE.,FPO.,,409.0,165.33
8,18 Jun 21,LEE CURRIE.,FPI.,50.0,,215.33
9,18 Jun 21,LNK BGL LTD.,CPT.,,50.0,165.33


In [70]:
df = df.fillna('')

In [71]:
file_path = "F:/python project/Bank-of-Scotland-classic-vantage-3.csv"

In [72]:
df.to_csv(file_path, index=False)