# This project extracts and analyzes Vietnam bank statements



In [1]:
import pdfplumber
import re
from collections import defaultdict
import csv
import gc
import os
import concurrent.futures
import threading
# Shared lock held by worker threads around page.extract_text() in
# transform_into_transactions, so only one thread extracts text at a time.
lock=threading.Lock()

In [2]:
# Source bank-statement PDF that is parsed page by page.
pdf_file_path = "data/Thong tin ung ho qua TSK VCB 0011001932418 tu 01.09 den10.09.2024.pdf"
# Destination of the final merged CSV written by extract_multiple_thread().
csv_file_path = 'export/vcb_multithread.csv'

In [3]:
# Directory where the individual per-page CSVs are saved by export_csv();
# it is created here if it does not exist yet.
output_dir = "export/vcb_csvs"
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Column names (in order) used for the header row of both the per-page CSVs
# and the merged CSV.
csv_headers = ['date', 'transaction_code', 'amount', 'transaction_detail']

In [5]:
def export_csv(page_number, data):
    """Write the transactions of one PDF page to their own CSV file.

    The file is named ``transactions_page_<page_number>.csv`` and is placed
    in ``output_dir``; any existing file with that name is overwritten.

    Args:
        page_number: Page number used to build the file name.
        data: Iterable of dicts keyed by the names in ``csv_headers``.

    Returns:
        The path of the CSV file that was written.
    """
    page_csv_name = "transactions_page_{}.csv".format(page_number)
    output_csv_path = os.path.join(output_dir, page_csv_name)
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as page_csv:
        page_writer = csv.DictWriter(page_csv, fieldnames=csv_headers)
        page_writer.writeheader()
        page_writer.writerows(data)
    return output_csv_path

In [6]:
def _parse_transaction_lines(text_chunks):
    """Group the text lines of one statement page into transaction records.

    Lines are ignored until the header cell 'Số CT/ Doc No' is seen and
    parsing stops at the page footer line. After the header, a line that is
    exactly a ``dd/mm/2024`` date starts a new row; the next line holds
    "<amount> <detail>", the one after it the transaction code, and any
    further lines are appended to the detail text.

    Args:
        text_chunks: Iterable of text lines extracted from the page.

    Returns:
        A list of dict-like records with the keys 'date',
        'transaction_code', 'amount' and 'transaction_detail'.
    """
    transactions = []
    data = defaultdict(str)
    start = False
    order = 0
    date = None
    for text_chunk in text_chunks:
        # The footer line marks the end of the transaction table.
        if text_chunk == 'Postal address: Telex : (0805) 411504 VCB - VT':
            break
        if not start:
            # Skip everything up to and including the table header cell.
            start = text_chunk == 'Số CT/ Doc No'
            continue

        match_date_format = re.fullmatch(r'(\d{2}/\d{2}/2024)', text_chunk)
        if match_date_format:
            # A date line opens a new row: restart the per-row line counter
            # and flush the previous row if it was completed with a code.
            order = 0
            if data['transaction_code']:
                data['date'] = date
                transactions.append(data)
                data = defaultdict(str)
            date = match_date_format.group(0)
        elif order == 0:
            # partition() instead of split(' ', 1): an amount line without
            # any detail text yields an empty detail rather than raising
            # ValueError on unpacking.
            amount, _, transaction_detail = text_chunk.partition(' ')
            # Drop the dots used as thousands separators.
            data['amount'] = amount.replace(".", "")
            data['transaction_detail'] = transaction_detail
            order += 1
        elif order == 1:
            data['transaction_code'] = text_chunk
            order += 1
        else:
            # Remaining lines of the row are continuation of the detail.
            data['transaction_detail'] += ' ' + text_chunk

    # Flush the last row of the page.
    if data['transaction_code']:
        data['date'] = date
        transactions.append(data)
    return transactions


def transform_into_transactions(page, use_multithread=True):
    """Extract the transactions found on one PDF page.

    Args:
        page: Page object exposing ``page_number``, ``extract_text()`` and
            ``close()`` (as used here).
        use_multithread: When True, text extraction is done while holding
            the module-level ``lock``, the transactions are written to a
            per-page CSV through ``export_csv`` and the page is closed.
            When False, the parsed transactions are returned directly and
            the page is left open for the caller to close.

    Returns:
        The path of the per-page CSV when ``use_multithread`` is True,
        otherwise the list of transaction records.
    """
    print('Processing page {}'.format(page.page_number))
    if use_multithread:
        # "with" guarantees the lock is released even if extraction raises,
        # so one failing page cannot block all the other worker threads.
        with lock:
            page_text = page.extract_text()
    else:
        page_text = page.extract_text()

    # A page without text may yield None or '' depending on the extractor;
    # both are treated as a page with no transactions.
    transactions = _parse_transaction_lines((page_text or '').split('\n'))

    if use_multithread:
        output_csv_path = export_csv(page.page_number, transactions)
        page.close()
        return output_csv_path
    return transactions
    
    

In [7]:
def write_to_csv(pdf_file, csv_file, headers):
    """Parse a statement PDF sequentially and write every row to one CSV.

    Pages are processed one at a time in the calling thread (no per-page
    CSV files are produced).

    Args:
        pdf_file: Path of the PDF to read.
        csv_file: Path of the CSV to create; an existing file is overwritten.
        headers: Column names used for the header row and the field order.
    """
    # Both resources are context-managed so the PDF and the CSV are closed
    # even when parsing a page fails. The CSV is written as UTF-8, like the
    # other CSV writers in this file, so Vietnamese text does not depend on
    # the platform's default encoding.
    with pdfplumber.open(pdf_file) as pdf, \
            open(csv_file, 'w', newline='', encoding='utf-8') as csv_handle:
        writer = csv.DictWriter(csv_handle, fieldnames=headers)
        writer.writeheader()
        for page in pdf.pages:
            transactions = transform_into_transactions(page, use_multithread=False)
            writer.writerows(transactions)
            print('Processed page {}'.format(page.page_number))
            # Close the page once its rows are written.
            page.close()

In [8]:
def process_pdf_in_threads(pdf_file_path):
    """Process every page of a PDF on a thread pool.

    Each page is handed to ``transform_into_transactions`` (multithread
    mode), which writes that page's transactions to its own CSV file.

    Args:
        pdf_file_path: Path of the PDF to process.

    Returns:
        The per-page CSV paths, in page order. Results are gathered in
        submission order rather than completion order, so a merge of the
        returned files keeps the pages in sequence regardless of thread
        scheduling.

    Raises:
        Any exception raised while processing a page is re-raised here.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(transform_into_transactions, page)
                for page in pdf.pages
            ]
            # result() blocks until that page is done; iterating the futures
            # in the order they were submitted preserves page order.
            return [future.result() for future in futures]

In [9]:
def merge_csvs(output_csv_paths, final_csv_path):
    """Concatenate several per-page CSV files into one CSV.

    The merged file gets a single header row built from ``csv_headers``;
    the rows of each input file follow in the order the paths are given.

    Args:
        output_csv_paths: Iterable of paths of the CSV files to merge.
        final_csv_path: Path of the merged CSV; an existing file is
            overwritten.
    """
    with open(final_csv_path, 'w', newline='', encoding='utf-8') as merged_file:
        merged_writer = csv.DictWriter(merged_file, fieldnames=csv_headers)
        # The header is emitted once; each input's own header is consumed
        # by its DictReader.
        merged_writer.writeheader()

        for page_csv_path in output_csv_paths:
            with open(page_csv_path, 'r', newline='', encoding='utf-8') as page_file:
                merged_writer.writerows(csv.DictReader(page_file))

In [10]:
def extract_multiple_thread():
    """Run the threaded pipeline: PDF pages -> per-page CSVs -> merged CSV.

    Reads ``pdf_file_path`` and writes the merged result to
    ``csv_file_path``.
    """
    # Step 1: every page is parsed on the thread pool and saved as a CSV.
    per_page_csv_paths = process_pdf_in_threads(pdf_file_path)

    # Step 2: the per-page files are combined into the final CSV.
    merge_csvs(per_page_csv_paths, csv_file_path)

    # Step 3: explicit garbage collection once all the work is done.
    gc.collect()

    print("Processing and merging complete!")

In [11]:
#write_to_csv(pdf_file_path, csv_file_path, csv_headers)

In [12]:
# Run the multithreaded extraction: parse the PDF page by page, then merge
# the per-page CSVs into csv_file_path.
extract_multiple_thread()

Processing page 1Processing page 2

Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
Processing page 19
Processing page 20
Processing page 21
Processing page 22
Processing page 23
Processing page 24
Processing page 25
Processing page 26
Processing page 27
Processing page 28
Processing page 29
Processing page 30
Processing page 31
Processing page 32
Processing page 33
Processing page 34
Processing page 35
Processing page 36
Processing page 37
Processing page 38
Processing page 39
Processing page 40
Processing page 41
Processing page 42
Processing page 43
Processing page 44
Processing page 45
Processing page 46
Processing page 47
Processing page 48
Processing page 49
Processing page 50
Processing page 51
Processing page 52
Processing page 53
Pr