In [144]:
import camelot
import pandas as pd

# Column labels found in the statement's transaction table -> standardized names.
_COLUMN_RENAMES = {
    'Date': 'Date',
    'Details': 'Description',
    'Money Out (£)': 'Withdraw',
    'Money In (£)': 'Credit',
    'Balance (£)': 'Balance',
}


def _extract_transactions(pdf_path, header_row):
    """Shared pipeline behind every ``process_bank_statement_*`` function.

    Reads all pages of ``pdf_path`` with camelot's 'stream' flavor, stacks the
    extracted tables, treats row ``header_row`` of the stacked frame as the
    column header, keeps only the rows whose "Date" cell matches ``%d %b %y``
    (e.g. ``03 Dec 21``), drops the 'Pmnt' column when present and renames the
    remaining columns to Date / Description / Withdraw / Credit / Balance.

    Parameters:
        pdf_path (str): Path to the PDF file.
        header_row (int): 0-based position, in the stacked frame, of the row
            holding the column names.

    Returns:
        pd.DataFrame: Transaction rows with a fresh 0..n-1 index.

    Raises:
        ValueError: camelot extracted no tables.
        IndexError: ``header_row`` lies outside the stacked frame.
        KeyError: the chosen header row has no 'Date' cell.
    """
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
    frames = [table.df for table in tables]
    if not frames:
        # pd.concat([]) raises the same exception type with an opaque message.
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")

    raw = (
        pd.concat(frames, ignore_index=True)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
        .reset_index(drop=True)
    )
    if not 0 <= header_row < len(raw):
        raise IndexError(
            f"header_row={header_row} is out of range: "
            f"{len(raw)} rows were extracted from {pdf_path!r}"
        )

    transactions = raw.iloc[header_row + 1:].reset_index(drop=True)
    # Assign a plain list: assigning the row Series itself would make the
    # columns index inherit the row label (e.g. 111) as its name.
    transactions.columns = raw.iloc[header_row].tolist()
    transactions = transactions.drop(columns=['Pmnt'], errors='ignore')
    if 'Date' not in transactions.columns:
        raise KeyError(
            f"Row {header_row} of {pdf_path!r} has no 'Date' cell; "
            "header_row probably points at the wrong row"
        )

    # Blank cells, repeated page headers and free-text rows all coerce to NaT.
    has_date = pd.to_datetime(
        transactions['Date'], format="%d %b %y", errors="coerce"
    ).notna()
    return (
        transactions[has_date]
        .reset_index(drop=True)
        .rename(columns=_COLUMN_RENAMES)
    )


def process_bank_statement_one(pdf_path, header_row=111):
    """Process a "basic" statement PDF; the header is expected at row 111.

    See ``_extract_transactions`` for the returned frame and the exceptions.
    """
    return _extract_transactions(pdf_path, header_row)


def process_bank_statement_classic(pdf_path, header_row=112):
    """Process a "classic" statement PDF; the header is expected at row 112.

    See ``_extract_transactions`` for the returned frame and the exceptions.
    """
    return _extract_transactions(pdf_path, header_row)


def process_bank_statement_club_lloyds(pdf_path, header_row=127):
    """Process a "club lloyds" statement PDF; the header is expected at row 127.

    See ``_extract_transactions`` for the returned frame and the exceptions.
    """
    return _extract_transactions(pdf_path, header_row)


def process_bank_statement_standard_saver(pdf_path, header_row=138):
    """Process a "standard saver" statement PDF; the header is expected at row 138.

    See ``_extract_transactions`` for the returned frame and the exceptions.
    """
    return _extract_transactions(pdf_path, header_row)


def process_bank_statement_standard_saver_2(pdf_path, header_row=138):
    """Process the second "standard saver" statement PDF; header expected at row 138.

    See ``_extract_transactions`` for the returned frame and the exceptions.
    """
    return _extract_transactions(pdf_path, header_row)


# 
def process_bank_statement(pdf_path, header_row=112):
    """
    Processes a bank statement PDF and returns a cleaned DataFrame.

    All pages are read with camelot's 'stream' flavor and stacked into one
    frame. Row ``header_row`` of that frame becomes the column header, only
    rows whose "Date" cell matches ``%d %b %y`` (e.g. ``03 Dec 21``) are kept,
    the last column is dropped and the remaining columns are renamed.

    Parameters:
        pdf_path (str): Path to the PDF file.
        header_row (int): 0-based position of the row holding the column
            names in the stacked frame. Defaults to 112.

    Returns:
        pd.DataFrame: Cleaned and structured DataFrame with relevant columns.

    Raises:
        ValueError: camelot extracted no tables.
        IndexError: ``header_row`` lies outside the stacked frame.
        KeyError: the chosen header row has no 'Date' cell.
    """
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
    frames = [table.df for table in tables]
    if not frames:
        # pd.concat([]) raises the same exception type with an opaque message.
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")

    raw = (
        pd.concat(frames, ignore_index=True)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
        .reset_index(drop=True)
    )
    if not 0 <= header_row < len(raw):
        raise IndexError(
            f"header_row={header_row} is out of range: "
            f"{len(raw)} rows were extracted from {pdf_path!r}"
        )

    df = raw.iloc[header_row + 1:].reset_index(drop=True)
    # Assign a plain list: assigning the row Series itself would make the
    # columns index inherit the row label (e.g. 112) as its name.
    df.columns = raw.iloc[header_row].tolist()
    if 'Date' not in df.columns:
        raise KeyError(
            f"Row {header_row} of {pdf_path!r} has no 'Date' cell; "
            "header_row probably points at the wrong row"
        )

    # Blank cells, repeated page headers and free-text rows all coerce to NaT.
    has_date = pd.to_datetime(df['Date'], format="%d %b %y", errors="coerce").notna()
    df = df[has_date].reset_index(drop=True)

    # Drop the last column. This is a drop by label, so any other column that
    # shares the last column's label is removed as well.
    df = df.drop(df.columns[-1], axis=1)

    # NOTE(review): the '[' in these keys is presumably how the currency
    # symbol comes out of this particular PDF - confirm against a real file.
    return df.rename(columns={
        'Date': 'Date',
        'Payment type\nDetails': 'Description',
        'Money Out ([)': 'Withdraw',
        'Money In ([)': 'Credit',
        'Balance ([)': 'Balance',
    })




# Example usage: run each statement-specific processor on its sample PDF.
# Every pass rebinds `pdf_path` and `cleaned_df`, so only the last result
# (the "basic" statement) remains for the preview at the end of the cell.
for pdf_path, process in [
    ("left/Lloyds-bank-statement-standard-saver-2.pdf", process_bank_statement_standard_saver_2),
    ("left/Lloyds-bank-statement-standard-saver-1.pdf", process_bank_statement_standard_saver),
    ("left/Lloyds-bank-statement-club-lloyds-1.pdf", process_bank_statement_club_lloyds),
    ("left/Lloyds-bank-statement-classic-1.pdf", process_bank_statement_classic),
    ("left/Lloyds-bank-statement-basic-1.pdf", process_bank_statement_one),
]:
    cleaned_df = process(pdf_path)

cleaned_df.head()


111,Date,Description,Withdraw,Credit,Balance
0,03 Dec 21,STATEMENT OPENING BALANCE,,,485.56
1,06 Dec 21,TAPPILY\n04DEC21 SN3386516923022E2E,,20.0,505.56
2,06 Dec 21,MARTIN MCCOLL CD 2817\n04DEC21,13.94,,491.62
3,06 Dec 21,NETFLIX.COM CD 2817\n04DEC21,13.99,,477.63
4,06 Dec 21,ASDA STORES LTD 58 CD 2817\n04DEC21,19.65,,457.98


In [23]:
# Lloyds-bank-statement-basic-1

In [71]:
# Lloyds-bank-statement-basic-1

import camelot
import pandas as pd

# Parse every page of the statement with camelot's 'stream' flavor
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-basic-1.pdf",
    flavor="stream",
    pages="all",
)

# Stack the per-table frames into one frame with a fresh 0..n-1 index
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Discard all-NaN rows and columns, renumber the rows, and show the result
df_cleaned = (
    df.dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
)
print(df_cleaned)

# Skip the first 111 rows so the frame starts at the header row
df = df_cleaned.drop(df_cleaned.index[:111])

# Promote that row to column labels, remove it from the data,
# then drop the 'Pmnt' column
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True).drop(columns=['Pmnt'])

# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Tell whether *value* is a non-empty cell holding a ``DD Mon YY`` date.

    Returns False for missing values (per ``pd.isna``), for the empty string,
    and for anything ``pd.to_datetime`` rejects with ValueError under the
    ``"%d %b %y"`` format; returns True otherwise (e.g. ``"03 Dec 21"``).
    """
    is_blank = pd.isna(value) or value == ""
    if not is_blank:
        try:
            pd.to_datetime(value, format="%d %b %y", errors="raise")
        except ValueError:
            # Not in the expected format: fall through to the False result.
            pass
        else:
            return True
    return False

# Keep only the rows whose "Date" cell passes is_date, renumber them from 0,
# and give the columns their standardized names
df = (
    df[df['Date'].apply(is_date)]
    .reset_index(drop=True)
    .rename(columns={
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    })
)

                       0    1                                        2  \
0                                                                        
1                                                  Statement number 11   
2                                                           Issue date   
3             MS H S WYE                                Write to us at   
4    16 WILTSHIRE BARTON                                                 
..                   ...  ...                                      ...   
224            31 Dec 21  FPO      MISS M NEWTON-WYE MUM 31DEC21 12:16   
225            31 Dec 21  FPO  MR STEFAN J CROXAL HAYLEY 31DEC21 13:14   
226            31 Dec 21  CPT          LNK ASDA FROME2 CD 2817 31DEC21   
227            31 Dec 21  FPI               TAPPILY DF2136501974620700   
228            31 Dec 21                     STATEMENT CLOSING BALANCE   

                    3         4       5  
0         Page 1 of 5       NaN     NaN  
1                          

In [None]:
# Lloyds-bank-statement-basic-1

import camelot
import pandas as pd

# Parse every page of the statement with camelot's 'stream' flavor
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-basic-1.pdf",
    flavor="stream",
    pages="all",
)

# Stack the per-table frames into one frame with a fresh 0..n-1 index
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Discard all-NaN rows and columns, renumber the rows, and show the result
df_cleaned = (
    df.dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
)
print(df_cleaned)

In [72]:
# Alias, not a copy: `df` now refers to the same DataFrame object as `df_cleaned`.
df = df_cleaned

In [73]:
# Start again from the cleaned frame, skip its first 111 rows, and preview
# the next 20 (the header row comes first)
df = df_cleaned.drop(df_cleaned.index[:111])
df.head(20)

Unnamed: 0,0,1,2,3,4,5
111,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
112,,Type,,,,
113,03 Dec 21,,STATEMENT OPENING BALANCE,,,485.56
114,06 Dec 21,FPI,TAPPILY\n04DEC21 SN3386516923022E2E,,20.00,505.56
115,,,LXZ1J23Y08M89VP9R4,,,
116,06 Dec 21,DEB,MARTIN MCCOLL CD 2817\n04DEC21,13.94,,491.62
117,,,,"Soon, you'll notice differences when you're sh...",,
118,K,Be ready for,,,,
119,,,,"haven't got your latest phone numbers, or you ...",,
120,,changes when you,,,,


In [74]:
# Set the first row as the header (column names).
# `df.iloc[0]` is a Series named after the row's index label, so the columns
# index ends up carrying that label as its name.
df.columns = df.iloc[0]
# Remove the first row from the DataFrame and renumber the rest from 0.
# NOTE(review): `df` is rebound here, so running this cell a second time would
# promote the first data row to the header.
df = df[1:].reset_index(drop=True)



In [75]:
# Bare expression: the notebook renders `df` as this cell's output.
df

111,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
0,,Type,,,,
1,03 Dec 21,,STATEMENT OPENING BALANCE,,,485.56
2,06 Dec 21,FPI,TAPPILY\n04DEC21 SN3386516923022E2E,,20.00,505.56
3,,,LXZ1J23Y08M89VP9R4,,,
4,06 Dec 21,DEB,MARTIN MCCOLL CD 2817\n04DEC21,13.94,,491.62
...,...,...,...,...,...,...
112,31 Dec 21,FPO,MISS M NEWTON-WYE MUM 31DEC21 12:16,100.00,,556.24
113,31 Dec 21,FPO,MR STEFAN J CROXAL HAYLEY 31DEC21 13:14,90.00,,466.24
114,31 Dec 21,CPT,LNK ASDA FROME2 CD 2817 31DEC21,30.00,,436.24
115,31 Dec 21,FPI,TAPPILY DF2136501974620700,,100.00,536.24


In [76]:
# Drop the 'Pmnt' column. The membership check keeps the cell safe to re-run:
# an unguarded drop raises KeyError once the column has already been removed.
if 'Pmnt' in df.columns:
    df = df.drop(columns=['Pmnt'])

In [77]:
# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Tell whether *value* is a non-empty cell holding a ``DD Mon YY`` date.

    Returns False for missing values (per ``pd.isna``), for the empty string,
    and for anything ``pd.to_datetime`` rejects with ValueError under the
    ``"%d %b %y"`` format; returns True otherwise (e.g. ``"03 Dec 21"``).
    """
    is_blank = pd.isna(value) or value == ""
    if not is_blank:
        try:
            pd.to_datetime(value, format="%d %b %y", errors="raise")
        except ValueError:
            # Not in the expected format: fall through to the False result.
            pass
        else:
            return True
    return False

# Keep only the rows whose "Date" cell is a non-empty, valid date, then
# renumber the surviving rows from 0
df = df[df['Date'].apply(is_date)].reset_index(drop=True)

In [78]:
# Bare expression: the notebook renders `df` as this cell's output.
df

111,Date,Details,Money Out (£),Money In (£),Balance (£)
0,03 Dec 21,STATEMENT OPENING BALANCE,,,485.56
1,06 Dec 21,TAPPILY\n04DEC21 SN3386516923022E2E,,20.00,505.56
2,06 Dec 21,MARTIN MCCOLL CD 2817\n04DEC21,13.94,,491.62
3,06 Dec 21,NETFLIX.COM CD 2817\n04DEC21,13.99,,477.63
4,06 Dec 21,ASDA STORES LTD 58 CD 2817\n04DEC21,19.65,,457.98
...,...,...,...,...,...
81,31 Dec 21,MISS M NEWTON-WYE MUM 31DEC21 12:16,100.00,,556.24
82,31 Dec 21,MR STEFAN J CROXAL HAYLEY 31DEC21 13:14,90.00,,466.24
83,31 Dec 21,LNK ASDA FROME2 CD 2817 31DEC21,30.00,,436.24
84,31 Dec 21,TAPPILY DF2136501974620700,,100.00,536.24


In [79]:
# Map the statement's column labels onto the standardized names
df = df.rename(
    {
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    },
    axis="columns",
)

In [80]:
# Write the current `df` to CSV without the index column.
# NOTE(review): absolute, machine-specific path - presumably the author's
# local drive; confirm it exists before running this elsewhere.
file_path = "F:/python project/Lloyds-bank-statement-basic-1.csv"
df.to_csv(file_path, index=False)

In [146]:
import camelot
import pandas as pd

def process_bank_statement_classic(pdf_path, header_row=112):
    """Extract the transaction table from a "classic" statement PDF.

    All pages are read with camelot's 'stream' flavor and stacked into one
    frame. Row ``header_row`` of that frame becomes the column header, the
    'Pmnt' column is dropped when present, only rows whose "Date" cell matches
    ``%d %b %y`` (e.g. ``01 Oct 21``) are kept, and the columns are renamed to
    Date / Description / Withdraw / Credit / Balance.

    Parameters:
        pdf_path (str): Path to the PDF file.
        header_row (int): 0-based position of the row holding the column
            names in the stacked frame. Defaults to 112.

    Returns:
        pd.DataFrame: Transaction rows with a fresh 0..n-1 index.

    Raises:
        ValueError: camelot extracted no tables.
        IndexError: ``header_row`` lies outside the stacked frame.
        KeyError: the chosen header row has no 'Date' cell.
    """
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
    frames = [table.df for table in tables]
    if not frames:
        # pd.concat([]) raises the same exception type with an opaque message.
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")

    raw = (
        pd.concat(frames, ignore_index=True)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
        .reset_index(drop=True)
    )
    if not 0 <= header_row < len(raw):
        raise IndexError(
            f"header_row={header_row} is out of range: "
            f"{len(raw)} rows were extracted from {pdf_path!r}"
        )

    transactions = raw.iloc[header_row + 1:].reset_index(drop=True)
    # Assign a plain list: assigning the row Series itself would make the
    # columns index inherit the row label (e.g. 112) as its name.
    transactions.columns = raw.iloc[header_row].tolist()
    transactions = transactions.drop(columns=['Pmnt'], errors='ignore')
    if 'Date' not in transactions.columns:
        raise KeyError(
            f"Row {header_row} of {pdf_path!r} has no 'Date' cell; "
            "header_row probably points at the wrong row"
        )

    # Blank cells, repeated page headers and free-text rows all coerce to NaT.
    has_date = pd.to_datetime(
        transactions['Date'], format="%d %b %y", errors="coerce"
    ).notna()
    return (
        transactions[has_date]
        .reset_index(drop=True)
        .rename(columns={
            'Date': 'Date',
            'Details': 'Description',
            'Money Out (£)': 'Withdraw',
            'Money In (£)': 'Credit',
            'Balance (£)': 'Balance',
        })
    )

# Example usage: process the sample "classic" statement and preview the
# first rows of the result.
pdf_path = "left/Lloyds-bank-statement-classic-1.pdf"
cleaned_df = process_bank_statement_classic(pdf_path)
cleaned_df.head()


112,Date,Description,Withdraw,Credit,Balance
0,01 Oct 21,STATEMENT OPENING BALANCE,,,8.31
1,04 Oct 21,TOWNSEND T N TANIA\n02OCT21,,50.0,58.31
2,04 Oct 21,K MORRIS\n02OCT21 309089 51561060,50.0,,8.31
3,04 Oct 21,K MORRIS\n02OCT21 309089 51561060,,35.0,43.31
4,04 Oct 21,PPOINT_*CLIFTON FO CD 5659\n02OCT21,2.29,,41.02


In [81]:


# Lloyds-bank-statement-classic-1

import camelot
import pandas as pd

# Parse every page of the statement with camelot's 'stream' flavor
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-classic-1.pdf",
    flavor="stream",
    pages="all",
)

# Stack the per-table frames into one frame with a fresh 0..n-1 index
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Discard all-NaN rows and columns, renumber the rows, and show the result
df_cleaned = (
    df.dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
)
print(df_cleaned)

                   0    1                                  2                3  \
0                                                                 Page 1 of 7   
1                                        Statement number 28                    
2                                                 Issue date  1 November 2021   
3     MRS K N MORRIS                          Write to us at    Box 3 BX1 1LT   
4    94 CLIFTON ROAD                                                            
..               ...  ...                                ...              ...   
359        01 Nov 21  TFR  K MORRIS\n31OCT21 309089 51561060             2.00   
360        01 Nov 21  TFR           K MORRIS 309089 51561060                    
361        01 Nov 21  DEB          SAINSBURYS S/MKTS CD 5659             0.21   
362        01 Nov 21               STATEMENT CLOSING BALANCE         3,926.48   
363   Payment types:                                                            

            4     5  
0    

In [83]:
# Start again from the cleaned frame, skip its first 112 rows, and preview
# the next 20 (the header row comes first)
df = df_cleaned.drop(df_cleaned.index[:112])
df.head(20)

Unnamed: 0,0,1,2,3,4,5
112,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
113,,Type,,,,
114,01 Oct 21,,STATEMENT OPENING BALANCE,,,8.31
115,04 Oct 21,FPI,TOWNSEND T N TANIA\n02OCT21,,50.00,58.31
116,,,RP4679961877514300,,,
117,04 Oct 21,TFR,K MORRIS\n02OCT21 309089 51561060,50.00,,8.31
118,04 Oct 21,TFR,K MORRIS\n02OCT21 309089 51561060,,35.00,43.31
119,04 Oct 21,DEB,PPOINT_*CLIFTON FO CD 5659\n02OCT21,2.29,,41.02
120,,,,"Soon, you'll notice differences when you're sh...",,
121,K,Be ready for,,,,


In [84]:
# Set the first row as the header (column names).
# `df.iloc[0]` is a Series named after the row's index label, so the columns
# index ends up carrying that label as its name.
df.columns = df.iloc[0]
# Remove the first row from the DataFrame and renumber the rest from 0.
# NOTE(review): `df` is rebound here, so running this cell a second time would
# promote the first data row to the header.
df = df[1:].reset_index(drop=True)

In [85]:
# Drop the 'Pmnt' column. The membership check keeps the cell safe to re-run:
# an unguarded drop raises KeyError once the column has already been removed.
if 'Pmnt' in df.columns:
    df = df.drop(columns=['Pmnt'])

In [86]:
# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Tell whether *value* is a non-empty cell holding a ``DD Mon YY`` date.

    Returns False for missing values (per ``pd.isna``), for the empty string,
    and for anything ``pd.to_datetime`` rejects with ValueError under the
    ``"%d %b %y"`` format; returns True otherwise (e.g. ``"01 Oct 21"``).
    """
    is_blank = pd.isna(value) or value == ""
    if not is_blank:
        try:
            pd.to_datetime(value, format="%d %b %y", errors="raise")
        except ValueError:
            # Not in the expected format: fall through to the False result.
            pass
        else:
            return True
    return False

# Keep only the rows whose "Date" cell is a non-empty, valid date, then
# renumber the surviving rows from 0
df = df[df['Date'].apply(is_date)].reset_index(drop=True)

In [87]:
# Map the statement's column labels onto the standardized names
df = df.rename(
    {
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    },
    axis="columns",
)

In [88]:
# Bare expression: the notebook renders `df` as this cell's output.
df

112,Date,Description,Withdraw,Credit,Balance
0,01 Oct 21,STATEMENT OPENING BALANCE,,,8.31
1,04 Oct 21,TOWNSEND T N TANIA\n02OCT21,,50.00,58.31
2,04 Oct 21,K MORRIS\n02OCT21 309089 51561060,50.00,,8.31
3,04 Oct 21,K MORRIS\n02OCT21 309089 51561060,,35.00,43.31
4,04 Oct 21,PPOINT_*CLIFTON FO CD 5659\n02OCT21,2.29,,41.02
...,...,...,...,...,...
199,01 Nov 21,SAINSBURYS S/MKTS CD 5659\n31OCT21,1.00,,2.23
200,01 Nov 21,K MORRIS\n31OCT21 309089 51561060,2.00,,0.23
201,01 Nov 21,K MORRIS 309089 51561060,,1.00,1.23
202,01 Nov 21,SAINSBURYS S/MKTS CD 5659,0.21,,1.02


In [89]:
# Write the current `df` to CSV without the index column.
# NOTE(review): absolute, machine-specific path - presumably the author's
# local drive; confirm it exists before running this elsewhere.
file_path = "F:/python project/Lloyds-bank-statement-classic-1.csv"
df.to_csv(file_path, index=False)

In [148]:
import camelot
import pandas as pd

def process_bank_statement_club_lloyds(pdf_path, header_row=127):
    """Extract the transaction table from a "club lloyds" statement PDF.

    All pages are read with camelot's 'stream' flavor and stacked into one
    frame. Row ``header_row`` of that frame becomes the column header, the
    'Pmnt' column is dropped when present, only rows whose "Date" cell matches
    ``%d %b %y`` (e.g. ``19 Aug 21``) are kept, and the columns are renamed to
    Date / Description / Withdraw / Credit / Balance.

    Parameters:
        pdf_path (str): Path to the PDF file.
        header_row (int): 0-based position of the row holding the column
            names in the stacked frame. Defaults to 127.

    Returns:
        pd.DataFrame: Transaction rows with a fresh 0..n-1 index.

    Raises:
        ValueError: camelot extracted no tables.
        IndexError: ``header_row`` lies outside the stacked frame.
        KeyError: the chosen header row has no 'Date' cell.
    """
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
    frames = [table.df for table in tables]
    if not frames:
        # pd.concat([]) raises the same exception type with an opaque message.
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")

    raw = (
        pd.concat(frames, ignore_index=True)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
        .reset_index(drop=True)
    )
    if not 0 <= header_row < len(raw):
        raise IndexError(
            f"header_row={header_row} is out of range: "
            f"{len(raw)} rows were extracted from {pdf_path!r}"
        )

    transactions = raw.iloc[header_row + 1:].reset_index(drop=True)
    # Assign a plain list: assigning the row Series itself would make the
    # columns index inherit the row label (e.g. 127) as its name.
    transactions.columns = raw.iloc[header_row].tolist()
    transactions = transactions.drop(columns=['Pmnt'], errors='ignore')
    if 'Date' not in transactions.columns:
        raise KeyError(
            f"Row {header_row} of {pdf_path!r} has no 'Date' cell; "
            "header_row probably points at the wrong row"
        )

    # Blank cells, repeated page headers and free-text rows all coerce to NaT.
    has_date = pd.to_datetime(
        transactions['Date'], format="%d %b %y", errors="coerce"
    ).notna()
    return (
        transactions[has_date]
        .reset_index(drop=True)
        .rename(columns={
            'Date': 'Date',
            'Details': 'Description',
            'Money Out (£)': 'Withdraw',
            'Money In (£)': 'Credit',
            'Balance (£)': 'Balance',
        })
    )

# Example usage: process the sample "club lloyds" statement and preview the
# first rows of the result.
pdf_path = "left/Lloyds-bank-statement-club-lloyds-1.pdf"
cleaned_df = process_bank_statement_club_lloyds(pdf_path)
cleaned_df.head()


127,Date,Description,Withdraw,Credit,Balance
0,19 Aug 21,STATEMENT OPENING BALANCE,,,5.72
1,20 Aug 21,100K84T0Z DWP UC,,1377.27,1382.99
2,20 Aug 21,D HUNTER-REED FROM DEC,,20.0,1402.99
3,20 Aug 21,R PICKFORD FROM MUM 20AUG21 21:33,20.0,,1382.99
4,20 Aug 21,TOMBOLA CD 1211,,20.0,1402.99


In [135]:


# Lloyds-bank-statement-club-lloyds-1

import camelot
import pandas as pd

# Parse every page of the statement with camelot's 'stream' flavor
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-club-lloyds-1.pdf",
    flavor="stream",
    pages="all",
)

# Stack the per-table frames into one frame with a fresh 0..n-1 index
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Discard all-NaN rows and columns, renumber the rows, and show the result
df_cleaned = (
    df.dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
)
print(df_cleaned)




                    0    1                          2                  3  \
0                                                            Page 1 of 5   
1                                 Statement number 11                      
2                                          Issue date  17 September 2021   
3    MRS H E PICKFORD                  Write to us at      Box 3 BX1 1LT   
4          CREST BANK                                                      
..                ...  ...                        ...                ...   
274         15 Sep 21  DEB     TESCO PFS 3424 CD 1211               0.70   
275         15 Sep 21  DEB     TESCO PFS 3424 CD 1211              23.35   
276         16 Sep 21  DEB     TESCO PFS 3424 CD 1211               2.95   
277         17 Sep 21  DEB     TESCO PFS 3424 CD 1211              12.60   
278         17 Sep 21       STATEMENT CLOSING BALANCE           2,424.84   

            4      5  
0         NaN    NaN  
1         NaN    NaN  
2         NaN    N

In [136]:
# Start again from the cleaned frame, skip its first 127 rows, and preview
# the next 20 (the header row comes first)
df = df_cleaned.drop(df_cleaned.index[:127])
df.head(20)

Unnamed: 0,0,1,2,3,4,5
127,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
128,,Type,,,,
129,19 Aug 21,,STATEMENT OPENING BALANCE,,,5.72
130,20 Aug 21,BGC,100K84T0Z DWP UC,,1377.27,1382.99
131,20 Aug 21,FPI,D HUNTER-REED FROM DEC,,20.00,1402.99
132,,,300000000805867786,,,
133,20 Aug 21,FPO,R PICKFORD FROM MUM 20AUG21 21:33,20.00,,1382.99
134,20 Aug 21,DEB,TOMBOLA CD 1211,,20.00,1402.99
135,,,,"Soon, you'll notice differences when you're sh...",,
136,K,Be ready for,,,,


In [137]:
# Set the first row as the header (column names)
df.columns = df.iloc[0]
# Remove the first row from the DataFrame and renumber the rest from 0
df = df[1:].reset_index(drop=True)

# Drop the 'Pmnt' column. The membership check avoids the KeyError an
# unguarded drop raises when the column is not present.
if 'Pmnt' in df.columns:
    df = df.drop(columns=['Pmnt'])

In [138]:
# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Tell whether *value* is a non-empty cell holding a ``DD Mon YY`` date.

    Returns False for missing values (per ``pd.isna``), for the empty string,
    and for anything ``pd.to_datetime`` rejects with ValueError under the
    ``"%d %b %y"`` format; returns True otherwise (e.g. ``"19 Aug 21"``).
    """
    is_blank = pd.isna(value) or value == ""
    if not is_blank:
        try:
            pd.to_datetime(value, format="%d %b %y", errors="raise")
        except ValueError:
            # Not in the expected format: fall through to the False result.
            pass
        else:
            return True
    return False

# Keep only the rows whose "Date" cell is a non-empty, valid date, then
# renumber the surviving rows from 0
df = df[df['Date'].apply(is_date)].reset_index(drop=True)

In [139]:
# Map the statement's column labels onto the standardized names
df = df.rename(
    {
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    },
    axis="columns",
)

In [140]:
# Bare expression: the notebook renders `df` as this cell's output.
df

127,Date,Description,Withdraw,Credit,Balance
0,19 Aug 21,STATEMENT OPENING BALANCE,,,5.72
1,20 Aug 21,100K84T0Z DWP UC,,1377.27,1382.99
2,20 Aug 21,D HUNTER-REED FROM DEC,,20.00,1402.99
3,20 Aug 21,R PICKFORD FROM MUM 20AUG21 21:33,20.00,,1382.99
4,20 Aug 21,TOMBOLA CD 1211,,20.00,1402.99
...,...,...,...,...,...
113,15 Sep 21,TESCO PFS 3424 CD 1211,0.70,,72.21
114,15 Sep 21,TESCO PFS 3424 CD 1211,23.35,,48.86
115,16 Sep 21,TESCO PFS 3424 CD 1211,2.95,,45.91
116,17 Sep 21,TESCO PFS 3424 CD 1211,12.60,,33.31


In [98]:
# Write the current `df` to CSV without the index column.
# NOTE(review): absolute, machine-specific path - presumably the author's
# local drive; confirm it exists before running this elsewhere.
file_path = "F:/python project/Lloyds-bank-statement-club-lloyds-1.csv"
df.to_csv(file_path, index=False)

In [150]:
import camelot
import pandas as pd

def process_bank_statement_standard_saver(pdf_path, skip_rows=138):
    """Extract the transactions table from a standard-saver statement PDF.

    Every table camelot finds (``stream`` flavor, all pages) is stacked into
    one frame. The first ``skip_rows`` rows are discarded, the next row is
    used as the header, the 'Pmnt' column is dropped if present, and only
    rows whose 'Date' cell parses as ``%d %b %y`` (e.g. "01 Oct 21") are kept.

    Parameters
    ----------
    pdf_path : str
        Path of the statement PDF, passed straight to ``camelot.read_pdf``.
    skip_rows : int, default 138
        Number of leading rows to discard before the header row. The default
        of 138 is the offset used for the sample statement in this notebook.

    Returns
    -------
    pandas.DataFrame
        Transaction rows with the statement headers renamed to 'Description',
        'Withdraw', 'Credit' and 'Balance' ('Date' is kept). Cell values are
        returned as extracted; no numeric conversion is done.

    Raises
    ------
    ValueError
        If camelot extracts no tables from the PDF.
    IndexError
        If no row is left to serve as the header after skipping ``skip_rows``.
    KeyError
        If the header row has no 'Date' column.
    """
    # Step 1: Extract tables from all pages using the 'stream' flavor
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")

    # Step 2: Combine all tables into a single DataFrame
    frames = [table.df for table in tables]
    if not frames:
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")
    combined = pd.concat(frames, ignore_index=True)

    # Step 3: Drop rows/columns that are entirely missing and renumber the rows
    cleaned = (
        combined.dropna(how="all", axis=0)
        .dropna(how="all", axis=1)
        .reset_index(drop=True)
    )

    # Step 4: Skip the leading rows that sit above the header row
    body = cleaned.iloc[skip_rows:]
    if len(body) == 0:
        raise IndexError(
            f"Only {len(cleaned)} rows were extracted; skip_rows={skip_rows} "
            "leaves no header row"
        )

    # Step 5: Promote the first remaining row to the header. A plain list is
    # used so the row's index label does not become the columns-axis name.
    header = body.iloc[0].tolist()
    transactions = body.iloc[1:].reset_index(drop=True)
    transactions.columns = header

    # Step 6: Drop the 'Pmnt' column (if it exists)
    transactions = transactions.drop(columns=["Pmnt"], errors="ignore")

    # Step 7: Keep rows whose "Date" parses as a date. Coercing and testing
    # for NaT also rejects empty cells and null-like text such as "nan".
    parsed_dates = pd.to_datetime(
        transactions["Date"], format="%d %b %y", errors="coerce"
    )
    transactions = transactions[parsed_dates.notna()].reset_index(drop=True)

    # Step 8: Rename columns to standardize ('Date' already has its final name)
    return transactions.rename(columns={
        "Details": "Description",
        "Money Out (£)": "Withdraw",
        "Money In (£)": "Credit",
        "Balance (£)": "Balance",
    })

# Example usage: parse the sample standard-saver statement and preview the
# first rows of the cleaned result.
pdf_path = "left/Lloyds-bank-statement-standard-saver-1.pdf"
cleaned_df = process_bank_statement_standard_saver(pdf_path)
cleaned_df.head()


138,Date,Description,Withdraw,Credit,Balance
0,01 Oct 21,STATEMENT OPENING BALANCE,,,20.0
1,04 Oct 21,D BUTLER 02OCT21 773308 33679260,5.0,,15.0
2,04 Oct 21,D BUTLER 02OCT21 773308 33679260,15.0,,0.0
3,04 Oct 21,D BUTLER 03OCT21 773308 33679260,,13.9,13.9
4,04 Oct 21,D BUTLER 03OCT21 773308 33679260,13.0,,0.9


In [121]:


# Lloyds-bank-statement-standard-saver-1

import camelot
import pandas as pd

# Read every page of the statement with camelot's 'stream' flavor.
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-standard-saver-1.pdf",
    pages="all",
    flavor="stream",
)

# Stack the per-table DataFrames into one frame with a fresh 0..n-1 index.
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Remove rows, then columns, that hold nothing but missing values, renumber
# the remaining rows and print the result.
df_cleaned = (
    df.dropna(how='all', axis=0)
    .dropna(how='all', axis=1)
    .reset_index(drop=True)
)
print(df_cleaned)

                   0    1                                 2                3  \
0                                                                Page 1 of 5   
1                                       Statement number 30                    
2                                                Issue date  1 November 2021   
3      MR D B BUTLER                         Write to us at    Box 3 BX1 1LT   
4    7 COSGROVE WALK                                                           
..               ...  ...                               ...              ...   
292        01 Nov 21  TFR  D BUTLER 30OCT21 773308 33679260            20.00   
293        01 Nov 21  TFR  D BUTLER 30OCT21 773308 33679260            10.00   
294        01 Nov 21  TFR          D BUTLER 773308 33679260                    
295        01 Nov 21  TFR          D BUTLER 773308 33679260            12.00   
296        01 Nov 21              STATEMENT CLOSING BALANCE         2,115.30   

            4      5  
0         NaN   

In [122]:
# Discard the first 138 rows so the frame starts at the transactions header
# row, then preview the next 20 rows.
df = df_cleaned.drop(index=df_cleaned.index[:138])
df.head(20)

Unnamed: 0,0,1,2,3,4,5
138,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
139,,Type,,,,
140,01 Oct 21,,STATEMENT OPENING BALANCE,,,20.00
141,04 Oct 21,TFR,D BUTLER 02OCT21 773308 33679260,5.00,,15.00
142,04 Oct 21,TFR,D BUTLER 02OCT21 773308 33679260,15.00,,0.00
143,04 Oct 21,TFR,D BUTLER 03OCT21 773308 33679260,,13.90,13.90
144,04 Oct 21,TFR,D BUTLER 03OCT21 773308 33679260,13.00,,0.90
145,11 Oct 21,TFR,D BUTLER 773308 33679260,,264.00,264.90
146,11 Oct 21,TFR,D BUTLER 773308 33679260,24.00,,240.90
147,11 Oct 21,TFR,D BUTLER 773308 33679260,20.00,,220.90


In [105]:
# Promote the first remaining row (the statement's printed column titles) to
# the header. A plain list is assigned so that the row's own index label does
# not become the name of the columns axis (it showed up as a stray number in
# the top-left corner of the displayed table).
df.columns = df.iloc[0].tolist()
# The header row is no longer data: drop it and renumber the rows from 0.
df = df[1:].reset_index(drop=True)

# Drop the payment-type column. errors='ignore' avoids a KeyError when the
# column is absent, matching the guarded drop in the process_* functions.
df = df.drop(columns=['Pmnt'], errors='ignore')

In [106]:
# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Return True only if ``value`` parses as a ``%d %b %y`` date (e.g. "01 Oct 21").

    Missing values and empty strings are rejected up front. Parsing uses
    ``errors="coerce"`` and the result is checked for NaT, so null-like text
    such as "nan" or "NaT" (which pandas turns into NaT rather than raising
    an error) is rejected as well.
    """
    if pd.isna(value) or value == "":
        return False
    parsed = pd.to_datetime(value, format="%d %b %y", errors="coerce")
    return bool(pd.notna(parsed))

# Keep only the rows whose "Date" cell passes is_date, then renumber the
# surviving rows from 0.
df = df[df['Date'].apply(is_date)].reset_index(drop=True)

In [107]:
# Translate the statement's printed column titles into the standardised
# names; 'Date' maps to itself and is therefore unchanged.
df = df.rename(
    {
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    },
    axis='columns',
)

In [108]:
# Show the cleaned transactions (the notebook renders the cell's last expression).
df

138,Date,Description,Withdraw,Credit,Balance
0,01 Oct 21,STATEMENT OPENING BALANCE,,,20.00
1,04 Oct 21,D BUTLER 02OCT21 773308 33679260,5.00,,15.00
2,04 Oct 21,D BUTLER 02OCT21 773308 33679260,15.00,,0.00
3,04 Oct 21,D BUTLER 03OCT21 773308 33679260,,13.90,13.90
4,04 Oct 21,D BUTLER 03OCT21 773308 33679260,13.00,,0.90
...,...,...,...,...,...
139,01 Nov 21,D BUTLER 30OCT21 773308 33679260,20.00,,10.00
140,01 Nov 21,D BUTLER 30OCT21 773308 33679260,10.00,,0.00
141,01 Nov 21,D BUTLER 773308 33679260,,12.00,12.00
142,01 Nov 21,D BUTLER 773308 33679260,12.00,,0.00


In [109]:
# Write the cleaned transactions to CSV, leaving out the DataFrame index.
file_path = "F:/python project/Lloyds-bank-statement-standard-saver-1.csv"
df.to_csv(path_or_buf=file_path, index=False)

In [151]:
import camelot
import pandas as pd

def process_bank_statement_standard_saver_2(pdf_path, skip_rows=138):
    """Extract the transactions table from a standard-saver statement PDF.

    Every table camelot finds (``stream`` flavor, all pages) is stacked into
    one frame. The first ``skip_rows`` rows are discarded, the next row is
    used as the header, the 'Pmnt' column is dropped if present, and only
    rows whose 'Date' cell parses as ``%d %b %y`` (e.g. "01 Sep 21") are kept.

    Parameters
    ----------
    pdf_path : str
        Path of the statement PDF, passed straight to ``camelot.read_pdf``.
    skip_rows : int, default 138
        Number of leading rows to discard before the header row. The default
        of 138 is the offset used for the sample statement in this notebook.

    Returns
    -------
    pandas.DataFrame
        Transaction rows with the statement headers renamed to 'Description',
        'Withdraw', 'Credit' and 'Balance' ('Date' is kept). Cell values are
        returned as extracted; no numeric conversion is done.

    Raises
    ------
    ValueError
        If camelot extracts no tables from the PDF.
    IndexError
        If no row is left to serve as the header after skipping ``skip_rows``.
    KeyError
        If the header row has no 'Date' column.
    """
    # Step 1: Extract tables from all pages using the 'stream' flavor
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")

    # Step 2: Combine all tables into a single DataFrame
    frames = [table.df for table in tables]
    if not frames:
        raise ValueError(f"No tables could be extracted from {pdf_path!r}")
    combined = pd.concat(frames, ignore_index=True)

    # Step 3: Drop rows/columns that are entirely missing and renumber the rows
    cleaned = (
        combined.dropna(how="all", axis=0)
        .dropna(how="all", axis=1)
        .reset_index(drop=True)
    )

    # Step 4: Skip the leading rows that sit above the header row
    body = cleaned.iloc[skip_rows:]
    if len(body) == 0:
        raise IndexError(
            f"Only {len(cleaned)} rows were extracted; skip_rows={skip_rows} "
            "leaves no header row"
        )

    # Step 5: Promote the first remaining row to the header. A plain list is
    # used so the row's index label does not become the columns-axis name.
    header = body.iloc[0].tolist()
    transactions = body.iloc[1:].reset_index(drop=True)
    transactions.columns = header

    # Step 6: Drop the 'Pmnt' column (if it exists)
    transactions = transactions.drop(columns=["Pmnt"], errors="ignore")

    # Step 7: Keep rows whose "Date" parses as a date. Coercing and testing
    # for NaT also rejects empty cells and null-like text such as "nan".
    parsed_dates = pd.to_datetime(
        transactions["Date"], format="%d %b %y", errors="coerce"
    )
    transactions = transactions[parsed_dates.notna()].reset_index(drop=True)

    # Step 8: Rename columns to standardize ('Date' already has its final name)
    return transactions.rename(columns={
        "Details": "Description",
        "Money Out (£)": "Withdraw",
        "Money In (£)": "Credit",
        "Balance (£)": "Balance",
    })

# Example usage: parse the second sample standard-saver statement and preview
# the first rows of the cleaned result.
pdf_path = "left/Lloyds-bank-statement-standard-saver-2.pdf"
cleaned_df = process_bank_statement_standard_saver_2(pdf_path)
cleaned_df.head()


138,Date,Description,Withdraw,Credit,Balance
0,01 Sep 21,STATEMENT OPENING BALANCE,,,10.47
1,02 Sep 21,D BUTLER 773308 33679260,,24.0,34.47
2,02 Sep 21,D BUTLER 773308 33679260,,30.0,64.47
3,02 Sep 21,D BUTLER 773308 33679260,14.0,,50.47
4,02 Sep 21,D BUTLER 773308 33679260,,7.0,57.47


In [110]:


# Lloyds-bank-statement-standard-saver-2

import camelot
import pandas as pd

# Read every page of the statement with camelot's 'stream' flavor.
tables = camelot.read_pdf(
    "left/Lloyds-bank-statement-standard-saver-2.pdf",
    pages="all",
    flavor="stream",
)

# Stack the per-table DataFrames into one frame with a fresh 0..n-1 index.
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs, ignore_index=True)

# Remove rows, then columns, that hold nothing but missing values, renumber
# the remaining rows and print the result.
df_cleaned = (
    df.dropna(how='all', axis=0)
    .dropna(how='all', axis=1)
    .reset_index(drop=True)
)
print(df_cleaned)

                   0    1                          2               3  \
0                                                        Page 1 of 6   
1                                Statement number 29                   
2                                         Issue date  1 October 2021   
3      MR D B BUTLER                  Write to us at   Box 3 BX1 1LT   
4    7 COSGROVE WALK                                                   
..               ...  ...                        ...             ...   
341        01 Oct 21  TFR   D BUTLER 773308 33679260          180.00   
342        01 Oct 21  TFR   D BUTLER 773308 33679260                   
343        01 Oct 21  TFR   D BUTLER 773308 33679260                   
344        01 Oct 21       STATEMENT CLOSING BALANCE        1,884.35   
345   Payment types:                                                   

            4      5  
0         NaN    NaN  
1         NaN    NaN  
2         NaN    NaN  
3         NaN    NaN  
4         NaN    NaN

In [111]:
# Discard the first 138 rows so the frame starts at the transactions header
# row, then preview the next 20 rows.
df = df_cleaned.drop(index=df_cleaned.index[:138])
df.head(20)

Unnamed: 0,0,1,2,3,4,5
138,Date,Pmnt,Details,Money Out (£),Money In (£),Balance (£)
139,,Type,,,,
140,01 Sep 21,,STATEMENT OPENING BALANCE,,,10.47
141,02 Sep 21,TFR,D BUTLER 773308 33679260,,24.00,34.47
142,02 Sep 21,TFR,D BUTLER 773308 33679260,,30.00,64.47
143,02 Sep 21,TFR,D BUTLER 773308 33679260,14.00,,50.47
144,02 Sep 21,TFR,D BUTLER 773308 33679260,,7.00,57.47
145,02 Sep 21,TFR,D BUTLER 773308 33679260,2.00,,55.47
146,02 Sep 21,TFR,D BUTLER 773308 33679260,8.00,,47.47
147,03 Sep 21,TFR,D BUTLER 773308 33679260,20.00,,27.47


In [112]:
# Promote the first remaining row (the statement's printed column titles) to
# the header. A plain list is assigned so that the row's own index label does
# not become the name of the columns axis (it showed up as a stray number in
# the top-left corner of the displayed table).
df.columns = df.iloc[0].tolist()
# The header row is no longer data: drop it and renumber the rows from 0.
df = df[1:].reset_index(drop=True)

# Drop the payment-type column. errors='ignore' avoids a KeyError when the
# column is absent, matching the guarded drop in the process_* functions.
df = df.drop(columns=['Pmnt'], errors='ignore')

In [113]:
# Define a function to check if the value in the "Date" column is a valid date
def is_date(value):
    """Return True only if ``value`` parses as a ``%d %b %y`` date (e.g. "01 Sep 21").

    Missing values and empty strings are rejected up front. Parsing uses
    ``errors="coerce"`` and the result is checked for NaT, so null-like text
    such as "nan" or "NaT" (which pandas turns into NaT rather than raising
    an error) is rejected as well.
    """
    if pd.isna(value) or value == "":
        return False
    parsed = pd.to_datetime(value, format="%d %b %y", errors="coerce")
    return bool(pd.notna(parsed))

# Keep only the rows whose "Date" cell passes is_date, then renumber the
# surviving rows from 0.
df = df[df['Date'].apply(is_date)].reset_index(drop=True)

In [114]:
# Translate the statement's printed column titles into the standardised
# names; 'Date' maps to itself and is therefore unchanged.
df = df.rename(
    {
        'Date': 'Date',
        'Details': 'Description',
        'Money Out (£)': 'Withdraw',
        'Money In (£)': 'Credit',
        'Balance (£)': 'Balance',
    },
    axis='columns',
)

In [115]:
# Show the cleaned transactions (the notebook renders the cell's last expression).
df

138,Date,Description,Withdraw,Credit,Balance
0,01 Sep 21,STATEMENT OPENING BALANCE,,,10.47
1,02 Sep 21,D BUTLER 773308 33679260,,24.00,34.47
2,02 Sep 21,D BUTLER 773308 33679260,,30.00,64.47
3,02 Sep 21,D BUTLER 773308 33679260,14.00,,50.47
4,02 Sep 21,D BUTLER 773308 33679260,,7.00,57.47
...,...,...,...,...,...
180,01 Oct 21,D BUTLER 773308 33679260,,4.00,186.00
181,01 Oct 21,D BUTLER 773308 33679260,180.00,,6.00
182,01 Oct 21,D BUTLER 773308 33679260,,4.00,10.00
183,01 Oct 21,D BUTLER 773308 33679260,,10.00,20.00


In [116]:
# Write the cleaned transactions to CSV, leaving out the DataFrame index.
file_path = "F:/python project/Lloyds-bank-statement-standard-saver-2.csv"
df.to_csv(path_or_buf=file_path, index=False)