# Total pages

In [33]:
import camelot
import pandas as pd

# Extract tables from all pages using the 'stream' flavor
tables = camelot.read_pdf("Barclays-1.pdf", pages="all", flavor="stream")

# Combine all tables into a single DataFrame
dfs = [table.df for table in tables]
df = pd.concat(dfs, ignore_index=True)

# Drop any empty rows or columns if needed
df_cleaned = df.dropna(how='all', axis=0)
df_cleaned = df_cleaned.dropna(how='all', axis=1)  # Optionally drop empty columns

# Reset index and print the cleaned DataFrame
df_cleaned.reset_index(drop=True, inplace=True)
print(df_cleaned)


                                                     0  \
0                                                        
1                                                        
2                                                    u   
3                                                        
4                                                        
..                                                 ...   
446  transactions the conversion may happen on the day   
447  the transaction is processed. As this may be a...   
448  two later, the exchange rate may be different ...   
449                                               day.   
450  You'll find a comparison of our exchange rate for   

                                                     1                 2    3  \
0                                              DL1 5YF                          
1                            Your accounts at a glance                          
2                                                           

In [34]:
df_cleaned = df_cleaned[df_cleaned[1].notna() & (df_cleaned[1] != "")]
df_cleaned.shape

(361, 5)

In [42]:
# Drop the top 10 rows and update the DataFrame in place
df_cleaned.drop(index=df_cleaned.index[:8], inplace=True)

In [43]:
df_cleaned.head(50)

Unnamed: 0,0,1,2,3,4
36,Date\nDescription,Money out,Money in,Balance,
38,05 May\nCard Payment to Uber *Eats On 04,17.15,,,
40,Card Payment to Just Eat On 04,20.31,,,
42,Card Payment to Fragrance Direct U,52.94,,,
44,Card Payment to Co-Op Group,8.35,,,
46,Bill Payment to Mrs CL REA,550.00,,,
53,06 May\nCard Payment to Amznmktplace On,16.27,,,
55,Card Payment to W M Morrison,21.42,,,
57,Card Payment to Domino S Pizza On,31.88,,,
60,Date,Description,Money out,Money in,Balance


In [49]:
# Drop columns with index 6 and 7
df_cleaned = df_cleaned.drop([4], axis=1)

# Reset index if needed
df_cleaned.reset_index(drop=True, inplace=True)

In [50]:
df_cleaned.head(10)
df = df_cleaned

In [51]:
df = df[df[0].str.strip() != "Date"]
df

Unnamed: 0,0,1,2,3
0,Date\nDescription,Money out,Money in,Balance
1,05 May\nCard Payment to Uber *Eats On 04,17.15,,
2,Card Payment to Just Eat On 04,20.31,,
3,Card Payment to Fragrance Direct U,52.94,,
4,Card Payment to Co-Op Group,8.35,,
...,...,...,...,...
338,authorised. However for a small number of,Barclays accounts. If you don’t get these mess...,,
339,transactions the conversion may happen on the day,"and you’d like to, or if you do and you’d rath...",,
340,the transaction is processed. As this may be a...,"didn’t, you can call 0345 7 345 345, go to",,
341,"two later, the exchange rate may be different ...","barclays.co.uk, or come into a branch. And if you",,


In [54]:
# Specify new column names
new_column_names = ["Date/Description",  "Money Out", "Money In","Balance"]

# Assign new column names to the DataFrame
df.columns = new_column_names

# Optionally, drop any rows that are not needed (e.g., headers or footers)
df = df.drop(0).reset_index(drop=True)
df

Unnamed: 0,Date/Description,Money Out,Money In,Balance
0,05 May\nCard Payment to Uber *Eats On 04,17.15,,
1,Card Payment to Just Eat On 04,20.31,,
2,Card Payment to Fragrance Direct U,52.94,,
3,Card Payment to Co-Op Group,8.35,,
4,Bill Payment to Mrs CL REA,550.00,,
...,...,...,...,...
331,authorised. However for a small number of,Barclays accounts. If you don’t get these mess...,,
332,transactions the conversion may happen on the day,"and you’d like to, or if you do and you’d rath...",,
333,the transaction is processed. As this may be a...,"didn’t, you can call 0345 7 345 345, go to",,
334,"two later, the exchange rate may be different ...","barclays.co.uk, or come into a branch. And if you",,


In [56]:
# Split the 'Date_Description' column into 'Date' and 'Description'
df[['Date', 'Description']] = df['Date/Description'].str.split('\n', expand=True)

# Drop the original 'Date_Description' column if needed
df.drop(columns=['Date/Description'], inplace=True)

In [61]:
df.head(20)

Unnamed: 0,Money Out,Money In,Balance,Date,Description
0,17.15,,,05 May,Card Payment to Uber *Eats On 04
1,20.31,,,Card Payment to Just Eat On 04,
2,52.94,,,Card Payment to Fragrance Direct U,
3,8.35,,,Card Payment to Co-Op Group,
4,550.00,,,Bill Payment to Mrs CL REA,
5,16.27,,,06 May,Card Payment to Amznmktplace On
6,21.42,,,Card Payment to W M Morrison,
7,31.88,,,Card Payment to Domino S Pizza On,
8,Card Payment to Birkenstock Digita,125.5,,06 May,
9,Germany On 05 May,,,,


In [16]:
df = df_cleaned.drop(0).reset_index(drop=True)
df

Unnamed: 0,0,1,2,3,4,5
0,Date.,Description.,Type.,Money In (£).,Money Out (£).,Balance (£).
1,Date,Description,Type,,Money Out (£),Balance (£)
2,01 Jul 21.,TESCO PAY AT PUMP.,DEB.,,40.00.,-311.39.
3,Date,Description,Type,,Money Out (£),Balance (£)
4,01 Jul 21.,LIDL GB AYR.,DEB.,,3.65.,-315.04.
...,...,...,...,...,...,...
608,30 Jul 21.,DAILY OD INT.,CHG.,,0.33.,-299.54.
609,BGC.,Bank Giro Credit.,BP.,Bill Payments.,CHG. Charge.,
610,DEP.,Deposit.,FEE.,Fixed Service,FPI.\nFaster Payment In.,FPO.
611,MPI.,Mobile Payment In.,,MPO. Mobile Payment Out.,PAY.\nPayment.,SO.


In [23]:
df = df[df[0].str.strip() != "Date"]
df

KeyError: 0

In [20]:
# Set the first row as column names
df.columns = df.iloc[0]

# Drop the first row
df = df[1:]

# Reset index
df.reset_index(drop=True, inplace=True)

# Clean any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

In [24]:
df

Unnamed: 0,01 Jul 21.,TESCO PAY AT PUMP.,DEB.,Unnamed: 4,40.00.,-311.39.
0,01 Jul 21.,LIDL GB AYR.,DEB.,,3.65.,-315.04.
1,01 Jul 21.,GREGGS PLC.,DEB.,,5.90.,-320.94.
2,01 Jul 21.,PPOINT_*WOODFIELD.,DEB.,,7.36.,-328.30.
3,01 Jul 21.,TESCO PFS 4143.,DEB.,,11.45.,-339.75.
4,01 Jul 21.,TESCO PFS 3701.,DEB.,,13.25.,-353.00.
...,...,...,...,...,...,...
314,30 Jul 21.,DAILY OD INT.,CHG.,,0.33.,-299.54.
315,BGC.,Bank Giro Credit.,BP.,Bill Payments.,CHG. Charge.,
316,DEP.,Deposit.,FEE.,Fixed Service,FPI.\nFaster Payment In.,FPO.
317,MPI.,Mobile Payment In.,,MPO. Mobile Payment Out.,PAY.\nPayment.,SO.


In [25]:
# Specify new column names
new_column_names = ["Date", "Description", "Type", "Money In", "Money Out", "Balance"]

# Assign new column names to the DataFrame
df.columns = new_column_names

# Optionally, drop any rows that are not needed (e.g., headers or footers)
df = df.drop(0).reset_index(drop=True)
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,01 Jul 21.,GREGGS PLC.,DEB.,,5.90.,-320.94.
1,01 Jul 21.,PPOINT_*WOODFIELD.,DEB.,,7.36.,-328.30.
2,01 Jul 21.,TESCO PFS 4143.,DEB.,,11.45.,-339.75.
3,01 Jul 21.,TESCO PFS 3701.,DEB.,,13.25.,-353.00.
4,01 Jul 21.,EE LIMITED.,DEB.,,80.00.,-433.00.
...,...,...,...,...,...,...
313,30 Jul 21.,DAILY OD INT.,CHG.,,0.33.,-299.54.
314,BGC.,Bank Giro Credit.,BP.,Bill Payments.,CHG. Charge.,
315,DEP.,Deposit.,FEE.,Fixed Service,FPI.\nFaster Payment In.,FPO.
316,MPI.,Mobile Payment In.,,MPO. Mobile Payment Out.,PAY.\nPayment.,SO.


In [26]:
# Remove rows where column 0 has the value "Date" (with inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Date."].index, inplace=True)
df.drop(df[df['Date'].str.strip() == "Column"].index, inplace=True)
df.drop(df[df['Money In'].str.strip() == "Money In (£)\nblank."].index, inplace=True)
df.drop(df[df['Money Out'].str.strip() == "Money Out (£)\nblank."].index, inplace=True)
# Reset the index to maintain continuous indexing after dropping rows
df.reset_index(drop=True, inplace=True)

In [28]:
df

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
0,01 Jul 21.,GREGGS PLC.,DEB.,,5.90.,-320.94.
1,01 Jul 21.,PPOINT_*WOODFIELD.,DEB.,,7.36.,-328.30.
2,01 Jul 21.,TESCO PFS 4143.,DEB.,,11.45.,-339.75.
3,01 Jul 21.,TESCO PFS 3701.,DEB.,,13.25.,-353.00.
4,01 Jul 21.,EE LIMITED.,DEB.,,80.00.,-433.00.
...,...,...,...,...,...,...
289,30 Jul 21.,DAILY OD INT.,CHG.,,0.33.,-299.54.
290,BGC.,Bank Giro Credit.,BP.,Bill Payments.,CHG. Charge.,
291,DEP.,Deposit.,FEE.,Fixed Service,FPI.\nFaster Payment In.,FPO.
292,MPI.,Mobile Payment In.,,MPO. Mobile Payment Out.,PAY.\nPayment.,SO.


In [29]:
# Function to clean trailing periods from numerical values
def clean_numeric_values(value):
    if isinstance(value, str):
        return value.strip('.')
    return value

# Apply the cleaning function to relevant columns
columns_to_clean = ["Money In", "Money Out", "Balance"]
for column in columns_to_clean:
    df[column] = df[column].apply(clean_numeric_values)

# Convert columns to numeric types if needed
df["Money In"] = pd.to_numeric(df["Money In"], errors='coerce')
df["Money Out"] = pd.to_numeric(df["Money Out"], errors='coerce')
df["Balance"] = pd.to_numeric(df["Balance"], errors='coerce')

In [30]:
df.tail(5)

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
289,30 Jul 21.,DAILY OD INT.,CHG.,,0.33,-299.54
290,BGC.,Bank Giro Credit.,BP.,,,
291,DEP.,Deposit.,FEE.,,,
292,MPI.,Mobile Payment In.,,,,
293,TFR.,Transfer.,,,,


In [32]:
# Function to validate and convert dates
def is_valid_date(date_str):
    try:
        pd.to_datetime(date_str, format='%d %b %y')
        return True
    except ValueError:
        return False

# Remove trailing periods and check date validity
df['Date'] = df['Date'].str.replace('.', '', regex=False)
df['Date_valid'] = df['Date'].apply(is_valid_date)

# Filter out rows with invalid dates
df = df[df['Date_valid']].drop(columns='Date_valid')
df.tail(10)
#df.shape

Unnamed: 0,Date,Description,Type,Money In,Money Out,Balance
280,29 Jul 21,SUBWAY 32619 Carri.,DEB.,,12.57,-349.65
281,29 Jul 21,DAILY OD INT.,CHG.,,0.38,-350.03
282,30 Jul 21,J THOMSON.,TFR.,80.0,,-270.03
283,30 Jul 21,J THOMSON.,TFR.,,80.0,-350.03
284,30 Jul 21,BROOKSON SOLUTIONS.,FPI.,969.39,,619.36
285,30 Jul 21,J THOMSON.,TFR.,90.0,,709.36
286,30 Jul 21,J THOMSON.,TFR.,,,-369.22
287,30 Jul 21,PAYPAL *APPLE.COM/.,DEB.,,7.99,-377.21
288,30 Jul 21,J THOMSON.,TFR.,78.0,,-299.21
289,30 Jul 21,DAILY OD INT.,CHG.,,0.33,-299.54


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("F:/python project/pdf1.csv")

In [7]:
from openai import OpenAI 
import os 
import requests 
from pdf2image import convert_from_bytes 
from PIL import Image 
import base64 
import json
import pytesseract
from io import BytesIO 
import pandas as pd

In [9]:
# Initialize OpenAI client
client = OpenAI(api_key='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')

# Get the completion from the model
response = client.chat.completions.create(
    model="gpt-4o-mini",  # Ensure you have the correct model name
    messages=[{"role": "user", "content": prompt}],
)
result = response.choices[0].message.content
print(result)

To arrive at a lending decision based on the provided data, we need to first determine the disposable amount for the last 3 months. The disposable amount can be computed using the balance and the money out amounts.

Here is how we would assess the data:

1. **Calculate Disposable Amount:** For each month, the disposable amount can be understood as the money available after expenses. We will need to subtract the total money out for that month from the ending balance for the month.

2. **Check Conditions:** 
   - Approve if the disposable amount is more than 1000 for all three months.
   - Decline if the disposable amount is less than 1000 for all three months.
   - Defer to Underwriter if the disposable amount is more than 1000 for one or more months.

Let’s summarize the last three months from the data:

1. **April 2021** (assuming the data includes all transactions):
   - Total money out = sum of all out transactions for April
   - Ending balance = the last balance number for April.



In [None]:
df = df.fillna('')

In [189]:
file_path = "F:/python project/pdf2.csv"

In [191]:
#df.to_csv(file_path, index=False)