In [None]:
import pdfplumber
import pandas as pd
import re
import os

In [None]:
# Relative path of the PDF that the following cells check for and parse.
pdf_file_path = "../amex/data/unlocked/2025-02-27.pdf"

In [None]:
# Fail early, before pdfplumber is asked to open the path, when the PDF is
# missing. isfile() also rejects a directory that happens to have this name,
# and the resolved absolute path is reported because the configured path is
# relative to the notebook's working directory.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(
        f"The file {pdf_file_path} was not found "
        f"(resolved to {os.path.abspath(pdf_file_path)})."
    )

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at *pdf_path* as one string.

    Each page's text is wrapped as "\\n--- Page N ---\\n<text>\\n", where N is
    the page's 1-based position in the document. Pages for which
    ``extract_text()`` returns nothing are left out entirely.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for number, page in enumerate(document.pages, start=1):
            content = page.extract_text()
            if not content:
                continue
            chunks.append(f"\n--- Page {number} ---\n{content}\n")
    return "".join(chunks)

# Pull the full text out of the PDF and echo it for inspection
pdf_text = extract_text_from_pdf(pdf_file_path)
print("Raw Extracted Text:")
print(pdf_text)
print(f"\n{'=' * 50}\n")

# Cut the text at the page markers; the piece before the first marker is
# discarded and the marker prefix is put back on every remaining piece
page_marker = "--- Page "
pages = [page_marker + chunk for chunk in pdf_text.split("\n" + page_marker)[1:]]

# Extract metadata from page 1
metadata = {}
# `pages` is empty when no page yielded any text; scan nothing in that case
# instead of failing on pages[0].
first_page_lines = pages[0].splitlines() if pages else []
for line in first_page_lines:
    if "Fakturans period" in line:
        # partition() never raises when the colon is missing and keeps
        # everything after the first colon, even if the value itself
        # contains further colons.
        _, colon, period = line.partition(":")
        if colon:
            metadata["Period"] = period.strip()
print("Metadata:")
print(metadata)
print("\n" + "="*50 + "\n")

# Initialize lists for transactions
roshan_transactions = []
stephanie_transactions = []

# Transaction lines start with a dd.mm.yy date; compiled once, outside the loops.
date_pattern = re.compile(r"\d{2}\.\d{2}\.\d{2}")


def _parse_amount(raw):
    """Convert an amount token written with a decimal comma (e.g. "123,45") to a float.

    Raises ValueError when the token is not numeric.
    """
    cleaned = raw
    # When a decimal comma follows the last dot (e.g. "1.234,56"), the dots can
    # only be grouping separators, so drop them before converting.
    if cleaned.rfind(",") > cleaned.rfind("."):
        cleaned = cleaned.replace(".", "")
    return float(cleaned.replace(",", "."))


# Process pages for transactions (skip page 1)
for page in pages[1:]:
    lines = page.splitlines()
    print(f"Processing {lines[0]}")
    # NOTE(review): the section is reset on every page, so rows of a section
    # that continues onto the next page are ignored until a header line is
    # seen again - confirm against the statement layout.
    current_section = None

    for line in lines:
        line = line.strip()
        print(f"Line: '{line}'")

        # Identify section headers
        if "Nya köp för Roshan Talimi" in line:
            current_section = "Roshan"
            print("Entered Roshan section")
            continue
        elif "Nya köp för Stephanie Maria Gardner Extrakort som slutar på 31017" in line:
            current_section = "Stephanie"
            print("Entered Stephanie section")
            continue
        elif "Inbetalningar" in line:
            current_section = None
            print("Skipping Inbetalningar section")
            continue

        # Only dated lines inside a known section are transactions
        if not (current_section and date_pattern.match(line)):
            continue

        # First token is the date, last token is the amount, and everything
        # in between is the description. Splitting with a maximum of two cuts
        # would leave the description glued to the amount, so every row with a
        # multi-word description would fail the float conversion.
        # NOTE(review): assumes the amount contains no internal whitespace.
        fields = line.split()
        if len(fields) < 3:
            continue

        date = fields[0]
        description = " ".join(fields[1:-1])
        try:
            amount = _parse_amount(fields[-1])
        except ValueError:
            print(f"Skipping invalid amount in line: {line}")
            continue

        transaction = {"Date": date, "Description": description, "Amount": amount}
        print(f"Parsed transaction: {transaction}")

        if current_section == "Roshan":
            roshan_transactions.append(transaction)
        elif current_section == "Stephanie":
            stephanie_transactions.append(transaction)
    print("\n" + "-"*50 + "\n")

# Convert to pandas DataFrames; explicit columns keep the schema even when a
# section produced no rows.
transaction_columns = ["Date", "Description", "Amount"]
roshan_df = pd.DataFrame(roshan_transactions, columns=transaction_columns)
stephanie_df = pd.DataFrame(stephanie_transactions, columns=transaction_columns)

In [None]:
# Show the heading followed by the Roshan transactions table
print("Nya köp för Roshan Talimi:", roshan_df, sep="\n")

In [None]:
# Show the heading (preceded by a blank line) followed by the Stephanie transactions table
print("\nNya köp för Stephanie Maria Gardner Extrakort som slutar på 31017:", stephanie_df, sep="\n")

In [None]:
# Fail early, before pdfplumber is asked to open the path, when the PDF is
# missing. isfile() also rejects a directory that happens to have this name,
# and the resolved absolute path is reported because the configured path is
# relative to the notebook's working directory.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(
        f"The file {pdf_file_path} was not found "
        f"(resolved to {os.path.abspath(pdf_file_path)})."
    )

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file_path):
    """Concatenate the text of all pages of the PDF at *pdf_file_path*.

    Every page that yields text contributes "\\n--- Page N ---\\n<text>\\n",
    with N being its 1-based position in the document; pages whose
    ``extract_text()`` result is empty contribute nothing.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        # Extraction has to happen while the document is still open.
        numbered_texts = [
            (number, page.extract_text())
            for number, page in enumerate(pdf.pages, start=1)
        ]
    return "".join(
        f"\n--- Page {number} ---\n{body}\n"
        for number, body in numbered_texts
        if body
    )

# Read the whole PDF as text and dump it for inspection
pdf_text = extract_text_from_pdf(pdf_file_path)
rule = "=" * 50
print("Raw Extracted Text:", pdf_text, f"\n{rule}\n", sep="\n")

# Break the text apart at the page markers, dropping whatever precedes the
# first marker, then restore the marker prefix on each page
pages = []
for body in pdf_text.split("\n--- Page ")[1:]:
    pages.append(f"--- Page {body}")

# Extract metadata from page 1
metadata = {}
# `pages` is empty when no page yielded any text; scan nothing in that case
# instead of failing on pages[0].
first_page_lines = pages[0].splitlines() if pages else []
for line in first_page_lines:
    if "Fakturans period" in line:
        # partition() never raises when the colon is missing and keeps
        # everything after the first colon, even if the value itself
        # contains further colons.
        _, colon, period = line.partition(":")
        if colon:
            metadata["Period"] = period.strip()
print("Metadata:")
print(metadata)
print("\n" + "="*50 + "\n")

# Initialize lists for transactions
roshan_transactions = []
stephanie_transactions = []

# Transaction lines start with a dd.mm.yy / dd.mm.yyyy or yyyy-mm-dd date;
# compiled once, outside the loops.
date_pattern = re.compile(r"(\d{2}\.\d{2}\.\d{2,4}|\d{4}-\d{2}-\d{2})")


def _parse_amount(raw):
    """Convert an amount token such as "123,45" or "123,45SEK" to a float.

    Raises ValueError when the token is not numeric.
    """
    cleaned = raw.replace("SEK", "")
    # When a decimal comma follows the last dot (e.g. "1.234,56"), the dots can
    # only be grouping separators, so drop them before converting.
    if cleaned.rfind(",") > cleaned.rfind("."):
        cleaned = cleaned.replace(".", "")
    return float(cleaned.replace(",", "."))


# Process pages for transactions (skip page 1)
for page in pages[1:]:
    lines = page.splitlines()
    print(f"Processing {lines[0]}")
    # NOTE(review): the section is reset on every page, so rows of a section
    # that continues onto the next page are ignored until a header line is
    # seen again - confirm against the statement layout.
    current_section = None

    for line in lines:
        line = line.strip()
        print(f"Line: '{line}'")

        # Flexible section header detection: the shorter substrings already
        # cover the full "Nya köp för ..." headings.
        if "Roshan Talimi" in line:
            current_section = "Roshan"
            print("Entered Roshan section")
            continue
        elif ("Stephanie Maria Gardner" in line or
              "Extrakort som slutar på 31017" in line):
            current_section = "Stephanie"
            print("Entered Stephanie section")
            continue
        elif "Inbetalningar" in line:
            current_section = None
            print("Skipping Inbetalningar section")
            continue

        # Lines outside a known section are ignored silently
        if not current_section:
            continue

        # Broader transaction line detection
        if not date_pattern.match(line):
            print(f"Line in section {current_section} doesn’t match date pattern: {line}")
            continue

        # First token is the date, last token is the amount, and everything
        # in between is the description. Splitting with a maximum of two cuts
        # would leave the description glued to the amount, so every row with a
        # multi-word description would fail the float conversion.
        # NOTE(review): assumes the amount contains no internal whitespace.
        fields = line.split()
        if fields[-1] == "SEK":
            # A currency code printed as its own trailing token is not the amount.
            fields.pop()
        if len(fields) < 3:
            print(f"Line in section {current_section} doesn’t split into 3+ parts: {line}")
            continue

        date = fields[0]
        description = " ".join(fields[1:-1])
        try:
            amount = _parse_amount(fields[-1])
        except ValueError:
            print(f"Skipping invalid amount in line: {line}")
            continue

        transaction = {"Date": date, "Description": description, "Amount": amount}
        print(f"Parsed transaction: {transaction}")

        if current_section == "Roshan":
            roshan_transactions.append(transaction)
        elif current_section == "Stephanie":
            stephanie_transactions.append(transaction)
    print("\n" + "-"*50 + "\n")

# Convert to pandas DataFrames; explicit columns keep the schema even when a
# section produced no rows.
transaction_columns = ["Date", "Description", "Amount"]
roshan_df = pd.DataFrame(roshan_transactions, columns=transaction_columns)
stephanie_df = pd.DataFrame(stephanie_transactions, columns=transaction_columns)

# Display results
print("Nya köp för Roshan Talimi:")
print(roshan_df)
print("\nNya köp för Stephanie Maria Gardner Extrakort som slutar på 31017:")
print(stephanie_df)

In [None]:
# Print the Roshan transactions DataFrame on its own.
print(roshan_df)

In [None]:
# Bare expression: presumably relies on the notebook rendering the value of a
# cell's last expression; it has no effect when run as a plain script.
stephanie_df