# In [None]:
# Hauptverantwortlichkeiten:
# Bearbeitung multimodaler Finanzanfragen (Textfragen, Finanztabellen, PDF-Dateien).
# === Abhängigkeiten installieren (nur Jupyter/Google Colab) ===
# !pip install camelot-py[cv] pdfplumber pandas

import glob
import os
import re

import camelot
import pandas as pd
import pdfplumber

# === Directory settings ===
# Folder that is scanned (recursively) for PDF files.  The PDF_DIR environment
# variable, when set and non-empty, overrides the built-in default so the
# script is not tied to one particular machine's directory layout.
pdf_dir = (
    os.environ.get("PDF_DIR")
    or r"C:\Users\sash-\KI_multi-modal-RAG\project\pdfs"
)
# Root folder for the extracted CSV files; one sub-folder per company is
# created below it while the PDFs are processed.
output_base_dir = "parsed"
os.makedirs(output_base_dir, exist_ok=True)

# === Companies ===
# Lower-case company names looked for inside PDF file names; the list order
# decides which name wins when several of them occur in one file name.
known_companies = ["apple", "microsoft", "google", "nvidia", "meta"]

def detect_company(file_name):
    """Return the company a file belongs to, judged by its file name.

    The comparison is a case-insensitive substring search against
    ``known_companies``.  The first matching entry is returned with its first
    letter upper-cased; ``"Unknown"`` is returned when nothing matches.
    """
    lowered = file_name.lower()
    hit = next((entry for entry in known_companies if entry in lowered), None)
    return "Unknown" if hit is None else hit.capitalize()

# === Table cleaning ===
# A cell that holds exactly one number, optionally decorated in front with
# whitespace, "$", "-" or "(" and behind with whitespace, ")", "%" or "B".
_NUMERIC_CELL = re.compile(
    r'^(?P<prefix>[\s$(\-]*)'
    r'(?P<number>\d[\d,]*(?:\.\d*)?|\.\d+)'
    r'(?P<suffix>[\s)%B]*)$'
)


def _clean_cell(value):
    """Strip one cell and, if it holds a decorated number, normalise it."""
    if pd.isnull(value):
        return value
    text = str(value).strip()
    match = _NUMERIC_CELL.match(text)
    if match is None:
        # Labels and other free text keep their characters (previously every
        # "$", ",", "(", ")", "%" and capital "B" was removed from them too).
        return text
    number = match.group('number').replace(',', '')
    prefix = match.group('prefix')
    # Accounting notation writes negative amounts as "(1,234)"; keep the sign
    # instead of silently turning the value positive.
    negative = '(' in prefix or '-' in prefix
    return f"-{number}" if negative else number


def clean_financial_df(df):
    """Return a cleaned copy of a table extracted from a PDF.

    Steps, in order:

    1. Drop rows and columns that consist only of missing values.
    2. Convert every non-null cell to a stripped string.
    3. Normalise cells that contain a single decorated number: currency sign,
       thousands separators, percent sign and a trailing ``B`` are removed
       (the value is not rescaled), and a leading ``(`` or ``-`` yields a
       negative number.  All other text is left untouched.
    4. Convert each column to a numeric dtype when *all* of its values parse
       as numbers; columns that do not are kept as they are.

    Args:
        df: The raw table as a ``pandas.DataFrame``.

    Returns:
        A new ``DataFrame``; the input is not modified.
    """
    df = df.dropna(how='all')          # remove empty rows
    df = df.dropna(axis=1, how='all')  # remove empty columns

    def _clean_column(column):
        cleaned = column.map(_clean_cell)
        # Explicit try/except replaces the deprecated errors='ignore' option
        # of pd.to_numeric while keeping its "leave the column alone" effect.
        try:
            return pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            return cleaned

    return df.apply(_clean_column)

# === Scan all PDFs below pdf_dir (recursively) ===
def _reduce_to_time_series(df):
    """Reduce a cleaned table to ``Date``/``Close`` columns when possible.

    The first column containing at least one cell that starts with a
    ``YYYY-MM-DD``-style date (``-``, ``/`` or ``.`` as separator) is used as
    the date column, and the first *other* column with a numeric dtype as the
    price column.  When either one is missing, ``df`` is returned unchanged,
    so no column is relabelled unless the reduction really takes place.

    Returns:
        A two-column frame (``Date`` parsed with ``pd.to_datetime`` and
        ``Close``) without rows containing missing values, or ``df`` itself.
    """
    date_col = None
    for candidate in df.columns:
        if df[candidate].astype(str).str.match(r'\d{4}[-/.]\d{2}[-/.]\d{2}').any():
            date_col = candidate
            break
    if date_col is None:
        return df

    price_col = None
    for candidate in df.columns:
        # is_numeric_dtype covers every integer/float width, unlike comparing
        # the dtype with 'float'/'int', whose meaning depends on the platform.
        if candidate != date_col and pd.api.types.is_numeric_dtype(df[candidate]):
            price_col = candidate
            break
    if price_col is None:
        return df

    series = df[[date_col, price_col]].rename(
        columns={date_col: 'Date', price_col: 'Close'}
    )
    series['Date'] = pd.to_datetime(series['Date'], errors='coerce')
    return series.dropna()


pdf_files = glob.glob(os.path.join(pdf_dir, '**', '*.pdf'), recursive=True)

for file_path in pdf_files:
    file_name = os.path.basename(file_path)
    base_name = os.path.splitext(file_name)[0]
    company = detect_company(file_name)

    # Every company gets its own sub-folder below output_base_dir.
    output_dir = os.path.join(output_base_dir, company)
    os.makedirs(output_dir, exist_ok=True)

    print(f"\nüìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞: {file_name} ‚Üí –ö–æ–º–ø–∞–Ω–∏—è: {company}")

    # === Try to read the file header ===
    # Quick readability check; an unreadable file is skipped entirely.
    try:
        with open(file_path, "rb") as f:
            content = f.read(1024)
    except OSError as e:
        print(f"‚ùå Fehler beim Lesen der Datei: {e}")
        continue
    print(f"üîé Erste 100 Bytes des PDF:: {content[:100]}")

    # === Camelot: lattice and stream ===
    for flavor in ['lattice', 'stream']:
        # Broad catch on purpose: a failure of one extraction flavour must
        # not stop the other flavour or the remaining PDFs.
        try:
            tables = camelot.read_pdf(file_path, pages='all', flavor=flavor)
            print(f"üìä Camelot ({flavor}) gefunden {tables.n} Tabellen")

            for i, table in enumerate(tables, start=1):
                df = clean_financial_df(table.df)
                print(df.head())  # debug output
                csv_name = f"{base_name}_camelot_{flavor}_table{i}.csv"
                csv_path = os.path.join(output_dir, csv_name)

                # Reduce to a time series if possible, then save this table.
                # Each table is handled inside the loop so that none is
                # skipped and no stale frame from an earlier table is written.
                df = _reduce_to_time_series(df)
                df.to_csv(csv_path, index=False)
                print(f"‚úÖ Gespeihert (Camelot {flavor}): {csv_path}")
        except Exception as e:
            print(f"‚ö†Ô∏è FEhler Camelot ({flavor}): {e}")

    # === PDFPlumber ===
    # Broad catch on purpose, for the same reason as above.
    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                print(f"üìÑ PDFPlumber: gefunden{len(tables)} Tabellen auf Seite {page_num}")

                for j, table in enumerate(tables, start=1):
                    if not table:
                        continue
                    df = clean_financial_df(pd.DataFrame(table))
                    print(df.head())
                    csv_name = f"{base_name}_plumber_page{page_num}_table{j}.csv"
                    csv_path = os.path.join(output_dir, csv_name)

                    # Reduce to a time series if possible, then save.  Doing
                    # this per table means a page without tables writes
                    # nothing instead of overwriting an earlier CSV.
                    df = _reduce_to_time_series(df)
                    df.to_csv(csv_path, index=False)
                    print(f"‚úÖ Gespeichert (PDFPlumber): {csv_path}")
    except Exception as e:
        print(f"‚ö†Ô∏è Fehler PDFPlumber: {e}")

# === Final check ===
# List every CSV file that now exists anywhere below the output folder.
csv_pattern = os.path.join(output_base_dir, "**", "*.csv")
csv_files = glob.glob(csv_pattern, recursive=True)

print("\nüìÅ –ò—Ç–æ–≥:")
if csv_files:
    for found_csv in csv_files:
        print(f"‚úÖ Datei gefunden: {found_csv}")
else:
    print("‚ùå Dateien nicht gefunden.")
