In [56]:
import camelot
import pandas as pd

# Extract tables from all pages using the 'stream' flavor
tables = camelot.read_pdf("monese-statement-5.pdf", pages="all", flavor="stream")

# Combine all tables into a single DataFrame
dfs = [table.df for table in tables]
df = pd.concat(dfs, ignore_index=True)

# Drop any empty rows or columns if needed
df_cleaned = df.dropna(how='all', axis=0)
df_cleaned = df_cleaned.dropna(how='all', axis=1)  # Optionally drop empty columns

# Reset index and print the cleaned DataFrame
df_cleaned.reset_index(drop=True, inplace=True)
print(df_cleaned)

                        0                     1    2                3      4
0    Stuart George Mcavoy        Account number  NaN              NaN    NaN
1           43 leigh road              06163525  NaN              NaN    NaN
2                 ME3 8NU                        NaN              NaN    NaN
3               Rochester             Sort code  NaN              NaN    NaN
4          United Kingdom              23-69-72  NaN              NaN    NaN
..                    ...                   ...  ...              ...    ...
514                                                   Closing balance  £1.63
515            Monese LTD  We are here for you!                             
516         1 King Street        www.monese.com                             
517                London  +44 (0) 1706 304 001                             
518              EC2V 8AU    support@monese.com                             

[519 rows x 5 columns]


In [57]:
df_cleaned = df_cleaned[df_cleaned[1].notna() & (df_cleaned[1] != "")]
df_cleaned.head(10)

Unnamed: 0,0,1,2,3,4
0,Stuart George Mcavoy,Account number,,,
1,43 leigh road,06163525,,,
3,Rochester,Sort code,,,
4,United Kingdom,23-69-72,,,
5,,Monese ID,,,
6,,M3259007,,,
7,Processed date,Payment made,Description,Amount,Balance
8,01/08/2021,01/08/2021,LUNNISS LE,+£30.00,£85.42
10,02/08/2021,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
11,,31/07/2021,Asda Stores 4472,-£15.92,£64.00


In [58]:
# Drop the top 6 rows
df_cleaned = df.drop(df.index[:8])

In [59]:
df_cleaned.head(10)

Unnamed: 0,0,1,2,3,4
8,01/08/2021,01/08/2021,LUNNISS LE,+£30.00,£85.42
9,,,20-29-90 43623408 | ETHERNET,,
10,02/08/2021,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
11,,31/07/2021,Asda Stores 4472,-£15.92,£64.00
12,,01/08/2021,Tesco Pfs 5238,-£10.65,£53.35
13,03/08/2021,03/08/2021,Julia Mc Avoy,+£100.00,£153.35
14,,,87-70-08 06021573 | Stewart McAvoy,,
15,04/08/2021,03/08/2021,Cardtronicsuk,-£30.00,£123.35
16,,03/08/2021,Tesco Stores 5354,-£9.62,£113.73
17,,03/08/2021,Tesco Stores 6215,-£2.85,£110.88


In [416]:
# # Drop columns with index 6 and 7
# df_cleaned = df_cleaned.drop([6], axis=1)

# # Reset index if needed
# df_cleaned.reset_index(drop=True, inplace=True)

In [60]:
df_cleaned

Unnamed: 0,0,1,2,3,4
8,01/08/2021,01/08/2021,LUNNISS LE,+£30.00,£85.42
9,,,20-29-90 43623408 | ETHERNET,,
10,02/08/2021,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
11,,31/07/2021,Asda Stores 4472,-£15.92,£64.00
12,,01/08/2021,Tesco Pfs 5238,-£10.65,£53.35
...,...,...,...,...,...
514,,,,Closing balance,£1.63
515,Monese LTD,We are here for you!,,,
516,1 King Street,www.monese.com,,,
517,London,+44 (0) 1706 304 001,,,


In [35]:
df_cleaned.tail(40)

Unnamed: 0,0,1,2,3,4
365,,14/01/2022,B&m 707,-£6.52,£98.01
366,,15/01/2022,MCNEILL BG,+£20.00,£118.01
367,,,83-26-10 00656997 | JAXX JOEY,,
368,16/01/2022,15/01/2022,B&m 707,-£12.72,£105.29
369,,15/01/2022,Just Eat,-£17.95,£87.34
370,17/01/2022,17/01/2022,HMRC CHILD BENEFIT,+£49.15,£136.49
371,,,20-26-48 13031802 | WILSON0KIMBE980216,,
372,,17/01/2022,AA INSURANCE,-£30.69,£105.80
373,,,20-05-26 23796957 | AA 130900725 2021,,
374,18/01/2022,17/01/2022,Bulb Energy,-£20.00,£85.80


In [61]:
df = df_cleaned
df = df[df[0].str.strip() != "Date"]
df

Unnamed: 0,0,1,2,3,4
8,01/08/2021,01/08/2021,LUNNISS LE,+£30.00,£85.42
9,,,20-29-90 43623408 | ETHERNET,,
10,02/08/2021,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
11,,31/07/2021,Asda Stores 4472,-£15.92,£64.00
12,,01/08/2021,Tesco Pfs 5238,-£10.65,£53.35
...,...,...,...,...,...
514,,,,Closing balance,£1.63
515,Monese LTD,We are here for you!,,,
516,1 King Street,www.monese.com,,,
517,London,+44 (0) 1706 304 001,,,


In [62]:
df = df[df[0].str.strip() != "Date"]
# Set the first row as column names
df.columns = df.iloc[0]

# Drop the first row
#df = df[1:]

# Reset index
df.reset_index(drop=True, inplace=True)

# Clean any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

In [63]:
# Specify new column names
new_column_names = ["index","Date", "Description", "Amount",  "Balance"]

# Assign new column names to the DataFrame
df.columns = new_column_names

# Optionally, drop any rows that are not needed (e.g., headers or footers)
# df = df.drop(0).reset_index(drop=True)
df.head(10)

Unnamed: 0,index,Date,Description,Amount,Balance
0,01/08/2021,01/08/2021,LUNNISS LE,+£30.00,£85.42
1,,,20-29-90 43623408 | ETHERNET,,
2,02/08/2021,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
3,,31/07/2021,Asda Stores 4472,-£15.92,£64.00
4,,01/08/2021,Tesco Pfs 5238,-£10.65,£53.35
5,03/08/2021,03/08/2021,Julia Mc Avoy,+£100.00,£153.35
6,,,87-70-08 06021573 | Stewart McAvoy,,
7,04/08/2021,03/08/2021,Cardtronicsuk,-£30.00,£123.35
8,,03/08/2021,Tesco Stores 5354,-£9.62,£113.73
9,,03/08/2021,Tesco Stores 6215,-£2.85,£110.88


In [408]:
df['Balance'] = df['Balance'].str.rstrip('.')
df['Money Out'] = df['Money Out'].str.rstrip('.')
df['Money In'] = df['Money In'].str.rstrip('.')

KeyError: 'Money Out'

In [None]:
# # Function to clean trailing periods from numerical values
# def clean_numeric_values(value):
#     if isinstance(value, str):
#         return value.strip('.')
#     return value

# # Apply the cleaning function to relevant columns
# columns_to_clean = ["Money In", "Money Out", "Balance"]
# for column in columns_to_clean:
#     df[column] = df[column].apply(clean_numeric_values)

# # Convert columns to numeric types if needed
# df["Money In"] = pd.to_numeric(df["Money In"], errors='coerce')
# df["Money Out"] = pd.to_numeric(df["Money Out"], errors='coerce')
# df["Balance"] = pd.to_numeric(df["Balance"], errors='coerce')

In [64]:
df.tail(20)

Unnamed: 0,index,Date,Description,Amount,Balance
491,,,16-31-30 27591645 | TAXI,,
492,,13/11/2021,Cardtronicsuk,-£50.00,£75.53
493,,14/11/2021,Uber* Trip,-£8.22,£67.31
494,15/11/2021,14/11/2021,Amazon Prime*jv0343vz5,-£7.99,£59.32
495,,14/11/2021,Uber *trip,-£8.39,£50.93
496,,14/11/2021,Uber *trip,-£7.63,£43.30
497,,14/11/2021,Mfg Gibraltar,-£2.34,£40.96
498,16/11/2021,15/11/2021,Tesco Store 3078,-£4.57,£36.39
499,,15/11/2021,Tesco Store 3078,-£2.25,£34.14
500,,15/11/2021,Tesco Pfs 5238,-£6.50,£27.64


In [65]:
df = df[:-4]
df.tail(10)

Unnamed: 0,index,Date,Description,Amount,Balance
497,,14/11/2021,Mfg Gibraltar,-£2.34,£40.96
498,16/11/2021,15/11/2021,Tesco Store 3078,-£4.57,£36.39
499,,15/11/2021,Tesco Store 3078,-£2.25,£34.14
500,,15/11/2021,Tesco Pfs 5238,-£6.50,£27.64
501,,15/11/2021,Uber *trip,-£5.72,£21.92
502,,15/11/2021,Uber *trip,-£5.71,£16.21
503,,15/11/2021,Uber *trip,-£5.78,£10.43
504,,15/11/2021,Se Strood Sst,-£5.50,£4.93
505,,15/11/2021,Greggs Plc,-£3.30,£1.63
506,,,,Closing balance,£1.63


In [381]:
# Function to validate and convert dates
def is_valid_date(date_str):
    try:
        pd.to_datetime(date_str, format='%d %b %y')
        return True
    except ValueError:
        return False

# Remove trailing periods and check date validity
df['Date'] = df['Date'].str.replace('.', '', regex=False)
df['Date_valid'] = df['Date'].apply(is_valid_date)

# Filter out rows with invalid dates
df = df[df['Date_valid']].drop(columns='Date_valid')
df.tail(20)

Unnamed: 0,index,Date,Description,Amount,Balance
162,,,40-05-22 31588214 | A POOLE RESERVED,,
164,,,04-06-05 14761918 | SELF FUNDING,,
174,,,11-00-01 06899126,,
178,,,30-91-56 00465509 | 4110589889 A POOLE,,
180,,,11-00-01 06899126 | OWED,,
206,,,20-49-29 33334562 | BBQ,,
240,,,40-05-22 31588214 | A POOLE FULL PAYME,,
252,,,20-80-45 23709310 | 000000000044704744,,
283,,,60-15-28 66688736 | RENT AND BILLS,,
289,,,LIMITED,,


In [41]:
df.tail()

Unnamed: 0,Date,Description,Amount,Balance
400,24/01/2022,Nya*air serv,-£0.50,£66.33
401,We are here for you!,,,
402,www.monese.com,,,
403,+44 (0) 1706 304 001,,,
404,support@monese.com,,,


In [66]:
# Drop the 'Processed date' column
df = df.drop(columns=["index"])

In [67]:
df

Unnamed: 0,Date,Description,Amount,Balance
0,01/08/2021,LUNNISS LE,+£30.00,£85.42
1,,20-29-90 43623408 | ETHERNET,,
2,31/07/2021,Wilko Retail Limited,-£5.50,£79.92
3,31/07/2021,Asda Stores 4472,-£15.92,£64.00
4,01/08/2021,Tesco Pfs 5238,-£10.65,£53.35
...,...,...,...,...
502,15/11/2021,Uber *trip,-£5.71,£16.21
503,15/11/2021,Uber *trip,-£5.78,£10.43
504,15/11/2021,Se Strood Sst,-£5.50,£4.93
505,15/11/2021,Greggs Plc,-£3.30,£1.63


In [68]:
file_path = "F:/python project/monese-statement-5.csv"

In [69]:
df.to_csv(file_path, index=False)