In [1]:
import camelot
import pandas as pd

# Extract tables from all pages using the 'stream' flavor
tables = camelot.read_pdf("Bank of Scotland 1.pdf", pages="all", flavor="stream")

# Combine all tables into a single DataFrame
dfs = [table.df for table in tables]
df = pd.concat(dfs, ignore_index=True)

# Drop any empty rows or columns if needed
df_cleaned = df.dropna(how='all', axis=0)
df_cleaned = df_cleaned.dropna(how='all', axis=1)  # Optionally drop empty columns

# Reset index and print the cleaned DataFrame
df_cleaned.reset_index(drop=True, inplace=True)
print(df_cleaned)


              0                    1       2                     3  \
0        Column               Column  Column                Column   
1         Date.         Description.   Type.         Money In (£).   
2          Date          Description    Type                         
3                                             Money In (£)\nblank.   
4    01 Jun 21.         THE WAY INN.    DEB.                         
..          ...                  ...     ...                   ...   
353                                           Money In (£)\nblank.   
354  29 Jun 21.  SAVETHECHANGE-0160.     BP.                         
355        Date          Description    Type                         
356                                           Money In (£)\nblank.   
357  30 Jun 21.      WWW.SKYBET.COM.    DEB.                         

                  4             5  
0            Column        Column  
1    Money Out (£).  Balance (£).  
2     Money Out (£)   Balance (£)  
3              

In [3]:
df_cleaned = df_cleaned[df_cleaned[1].notna() & (df_cleaned[1] != "")]
df = df_cleaned.drop(0).reset_index(drop=True)
df

Unnamed: 0,0,1,2,3,4,5
0,Date.,Description.,Type.,Money In (£).,Money Out (£).,Balance (£).
1,Date,Description,Type,,Money Out (£),Balance (£)
2,01 Jun 21.,THE WAY INN.,DEB.,,21.07.,945.81.
3,Date,Description,Type,,Money Out (£),Balance (£)
4,01 Jun 21.,AMAZON.CO.UK*2T4MA.,DEB.,,24.00.,921.81.
...,...,...,...,...,...,...
236,28 Jun 21.,THE KIDS STOP.,FPO.,,786.27.,904.80.
237,Date,Description,Type,,Money Out (£),Balance (£)
238,29 Jun 21.,SAVETHECHANGE-0160.,BP.,,1.29.,903.51.
239,Date,Description,Type,,Money Out (£),Balance (£)


In [4]:
df = df[df[0].str.strip() != "Date"]
# Set the first row as column names
df.columns = df.iloc[0]

# Drop the first row
df = df[1:]

# Reset index
df.reset_index(drop=True, inplace=True)

# Clean any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

In [5]:
# Specify new column names
new_column_names = ["Date", "Description", "Type", "Money In", "Money Out", "Balance"]

# Assign new column names to the DataFrame
df.columns = new_column_names

# Optionally, drop any rows that are not needed (e.g., headers or footers)
df = df.drop(0).reset_index(drop=True)
df.head(10)

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,01 Jun 21.,AMAZON.CO.UK*2T4MA.,DEB.,,24.00.,921.81.
1,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,26.86.,894.95.
2,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,31.60.,863.35.
3,01 Jun 21.,SAVETHECHANGE-0160.,BP.,,3.32.,860.03.
4,01 Jun 21.,DVLA-WL09BNE.,DD.,,17.93.,842.10.
5,01 Jun 21.,WEST LOTHIAN LEISU.,DD.,,41.00.,801.10.
6,01 Jun 21.,LETTING SOLUTIONS.,FPO.,,795.00.,6.10.
7,01 Jun 21.,CORBIE INN.,DEB.,,3.60.,2.50.
8,01 Jun 21.,CORBIE INN.,DEB.,,3.60.,-1.10.
9,01 Jun 21.,CORBIE INN.,DEB.,,3.60.,-4.70.


In [6]:
# Remove rows where column 0 has the value "Date" (with inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Column"].index, inplace=True)
df.drop(df[df['Money In'].str.strip() == "Money In (£)\nblank."].index, inplace=True)
df.drop(df[df['Money Out'].str.strip() == "Money Out (£)\nblank."].index, inplace=True)
# Reset the index to maintain continuous indexing after dropping rows
df.reset_index(drop=True, inplace=True)

In [7]:
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,01 Jun 21.,AMAZON.CO.UK*2T4MA.,DEB.,,24.00.,921.81.
1,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,26.86.,894.95.
2,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,31.60.,863.35.
3,01 Jun 21.,SAVETHECHANGE-0160.,BP.,,3.32.,860.03.
4,01 Jun 21.,DVLA-WL09BNE.,DD.,,17.93.,842.10.
...,...,...,...,...,...,...
109,28 Jun 21.,MFG CROSSROADS.,DEB.,,38.51.,"2,486.07."
110,28 Jun 21.,LETTING SOLUTIONS.,FPO.,,795.00.,"1,691.07."
111,28 Jun 21.,THE KIDS STOP.,FPO.,,786.27.,904.80.
112,29 Jun 21.,SAVETHECHANGE-0160.,BP.,,1.29.,903.51.


In [8]:
# Function to clean trailing periods from numerical values
def clean_numeric_values(value):
    if isinstance(value, str):
        return value.strip('.')
    return value

# Apply the cleaning function to relevant columns
columns_to_clean = ["Money In", "Money Out", "Balance"]
for column in columns_to_clean:
    df[column] = df[column].apply(clean_numeric_values)

# Convert columns to numeric types if needed
df["Money In"] = pd.to_numeric(df["Money In"], errors='coerce')
df["Money Out"] = pd.to_numeric(df["Money Out"], errors='coerce')
df["Balance"] = pd.to_numeric(df["Balance"], errors='coerce')

In [10]:
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,01 Jun 21.,AMAZON.CO.UK*2T4MA.,DEB.,,24.00,921.81
1,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,26.86,894.95
2,01 Jun 21.,CO-OP GROUP FOOD.,DEB.,,31.60,863.35
3,01 Jun 21.,SAVETHECHANGE-0160.,BP.,,3.32,860.03
4,01 Jun 21.,DVLA-WL09BNE.,DD.,,17.93,842.10
...,...,...,...,...,...,...
109,28 Jun 21.,MFG CROSSROADS.,DEB.,,38.51,
110,28 Jun 21.,LETTING SOLUTIONS.,FPO.,,795.00,
111,28 Jun 21.,THE KIDS STOP.,FPO.,,786.27,904.80
112,29 Jun 21.,SAVETHECHANGE-0160.,BP.,,1.29,903.51
