In [None]:
import os
import pandas as pd
import pdfplumber
import re
from tqdm import tqdm  # Import tqdm for the progress bar
import warnings

# Ignore every warning of category UserWarning from here on; presumably this
# hides noisy warnings raised by the PDF/Excel parsing libraries while files
# are processed -- TODO confirm which warnings this is meant to silence.
warnings.simplefilter("ignore", category=UserWarning)

def extract_from_pdf(file_path):
    """Extract the text of every page of a PDF as one normalised string.

    Args:
        file_path: Path of the PDF file, opened with ``pdfplumber``.

    Returns:
        The text of all pages joined together with every run of whitespace
        collapsed to a single space, or ``None`` when the document yields no
        text or when any error occurs while reading it (the error is printed
        rather than raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page without a text layer
            # (e.g. a scanned image) in some pdfplumber versions. Coalescing
            # to "" keeps one such page from making join() raise TypeError,
            # which would discard the text of the whole document.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        text = "\n".join(page_texts)
        if not text.strip():
            return None
        # Collapse line breaks and repeated spaces into single spaces.
        return re.sub(r'\s+', ' ', text.strip())
    except Exception as e:
        # Best-effort extraction: report the failing file and carry on.
        print(f"Error extracting from PDF {file_path}: {e}")
        return None

def extract_from_excel(file_path):
    """Extract the contents of every sheet of a workbook as one string.

    Args:
        file_path: Path of the Excel workbook, read with ``pandas.read_excel``
            using ``sheet_name=None`` so that all sheets are loaded.

    Returns:
        The textual rendering of all sheets with every run of whitespace
        collapsed to a single space, or ``None`` when the workbook yields no
        text or when any error occurs while reading it (the error is printed
        rather than raised).
    """
    try:
        sheets = pd.read_excel(file_path, sheet_name=None)
        sheet_texts = []
        for sheet_df in sheets.values():
            if sheet_df.empty:
                # to_string() renders a frame without rows as the placeholder
                # "Empty DataFrame / Columns: [...] / Index: []", which would
                # end up in the extracted content. Keep only the header
                # labels (if any) for such sheets.
                sheet_text = " ".join(str(col) for col in sheet_df.columns)
            else:
                # Forward-fill so blank cells (e.g. the remainder of a merged
                # range) inherit the value above them, then render without
                # the row index for cleaner output.
                sheet_text = sheet_df.ffill().to_string(index=False)
            if sheet_text.strip():
                sheet_texts.append(sheet_text)

        all_text = "\n".join(sheet_texts)
        if not all_text.strip():
            return None
        # Collapse line breaks and column padding into single spaces.
        return re.sub(r'\s+', ' ', all_text.strip())
    except Exception as e:
        # Best-effort extraction: report the failing file and carry on.
        print(f"Error extracting from Excel {file_path}: {e}")
        return None

def extract_data_from_folders(folder_paths):
    """Recursively extract text from all PDF and Excel files in the folders.

    Args:
        folder_paths: An iterable of folder paths to walk recursively. A
            single ``str`` or ``os.PathLike`` is also accepted and treated as
            one folder.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``,
        ``source_format`` and ``content``, holding one row per file that
        produced text. ``source_format`` is ``"pdf"`` for PDF files and
        ``"xlsx"`` for both ``.xlsx`` and ``.xls`` files. The columns are
        present even when no file produced any text.
    """
    # A lone path string would otherwise be iterated character by character,
    # and os.walk would be pointed at each individual character.
    if isinstance(folder_paths, (str, os.PathLike)):
        folder_paths = [folder_paths]

    supported_suffixes = (".pdf", ".xlsx", ".xls")

    # Collect files from all specified folders (suffix match is
    # case-insensitive).
    files_to_process = []
    for folder_path in folder_paths:
        for root, _dirs, files in os.walk(folder_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                if file_path.lower().endswith(supported_suffixes):
                    files_to_process.append(file_path)

    data = []
    # tqdm shows a progress bar while the files are processed.
    for file_path in tqdm(files_to_process, desc="Processing Files", unit="file"):
        if file_path.lower().endswith(".pdf"):
            source_format = "pdf"
            text = extract_from_pdf(file_path)
        else:
            # Only supported suffixes were collected, so this is .xlsx/.xls.
            source_format = "xlsx"
            text = extract_from_excel(file_path)
        # Files that failed or produced no text are left out of the result.
        if text:
            data.append({"filename": file_path, "source_format": source_format, "content": text})

    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(data, columns=["filename", "source_format", "content"])

# Root folders that are scanned for PDF and Excel files.
folder_paths = [
    r"C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 - data",
    r"C:\Users\spide\OneDrive\Desktop\Bachlorz\Tenders",
]
df_data = extract_data_from_folders(folder_paths)

if df_data.empty:
    # Nothing could be extracted from any file.
    print("No data extracted.")
else:
    # Report how many files produced text and preview the first rows.
    print(f"Extracted data from {len(df_data)} files.")
    print(df_data.head())

    # Write the extracted rows to a CSV file without the row index.
    output_file = "extracted_data.csv"
    df_data.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")


In [None]:
import os
import pandas as pd
import pdfplumber
import re
from tqdm import tqdm  # Import tqdm for the progress bar
import warnings

# Ignore warnings of category UserWarning (other categories are unaffected)
# for cleaner output; presumably aimed at warnings raised by the PDF/Excel
# parsing libraries -- TODO confirm which warnings this is meant to silence.
warnings.simplefilter("ignore", category=UserWarning)

# Function to extract text from PDF
def extract_from_pdf(file_path):
    """Extract the text of every page of a PDF as one normalised string.

    Args:
        file_path: Path of the PDF file, opened with ``pdfplumber``.

    Returns:
        The text of all pages joined together with every run of whitespace
        collapsed to a single space, or ``None`` when the document yields no
        text or when any error occurs while reading it (the error is printed
        rather than raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page without a text layer
            # (e.g. a scanned image) in some pdfplumber versions. Coalescing
            # to "" keeps one such page from making join() raise TypeError,
            # which would discard the text of the whole document.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        text = "\n".join(page_texts)
        if not text.strip():
            return None
        # Collapse line breaks and repeated spaces into single spaces.
        return re.sub(r'\s+', ' ', text.strip())
    except Exception as e:
        # Best-effort extraction: report the failing file and carry on.
        print(f"Error extracting from PDF {file_path}: {e}")
        return None

# Function to extract text from Excel
def extract_from_excel(file_path):
    """Extract the contents of every sheet of a workbook as one string.

    Args:
        file_path: Path of the Excel workbook, read with ``pandas.read_excel``
            using ``sheet_name=None`` so that all sheets are loaded.

    Returns:
        The textual rendering of all sheets with every run of whitespace
        collapsed to a single space, or ``None`` when the workbook yields no
        text or when any error occurs while reading it (the error is printed
        rather than raised).
    """
    try:
        sheets = pd.read_excel(file_path, sheet_name=None)
        sheet_texts = []
        for sheet_df in sheets.values():
            if sheet_df.empty:
                # to_string() renders a frame without rows as the placeholder
                # "Empty DataFrame / Columns: [...] / Index: []", which would
                # end up in the extracted content. Keep only the header
                # labels (if any) for such sheets.
                sheet_text = " ".join(str(col) for col in sheet_df.columns)
            else:
                # Forward-fill so blank cells (e.g. the remainder of a merged
                # range) inherit the value above them, then render without
                # the row index for cleaner output.
                sheet_text = sheet_df.ffill().to_string(index=False)
            if sheet_text.strip():
                sheet_texts.append(sheet_text)

        all_text = "\n".join(sheet_texts)
        if not all_text.strip():
            return None
        # Collapse line breaks and column padding into single spaces.
        return re.sub(r'\s+', ' ', all_text.strip())
    except Exception as e:
        # Best-effort extraction: report the failing file and carry on.
        print(f"Error extracting from Excel {file_path}: {e}")
        return None

# Function to extract data from multiple folders
def extract_data_from_folders(folder_paths):
    """Recursively extract text from all PDF and Excel files in the folders.

    Progress is reported on stdout: each folder checked, each directory
    scanned, each file found, missing folders, and the total number of files
    queued for processing.

    Args:
        folder_paths: An iterable of folder paths to walk recursively. A
            single ``str`` or ``os.PathLike`` is also accepted and treated as
            one folder. Paths that do not exist are reported and skipped.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``,
        ``source_format`` and ``content``, holding one row per file that
        produced text. ``source_format`` is ``"pdf"`` for PDF files and
        ``"xlsx"`` for both ``.xlsx`` and ``.xls`` files. The columns are
        present even when no file produced any text.
    """
    # A lone path string would otherwise be iterated character by character,
    # and os.walk would be pointed at each individual character.
    if isinstance(folder_paths, (str, os.PathLike)):
        folder_paths = [folder_paths]

    supported_suffixes = (".pdf", ".xlsx", ".xls")
    files_to_process = []

    for folder_path in folder_paths:
        print(f"Checking folder: {folder_path}")  # Debugging line
        if not os.path.exists(folder_path):
            print(f"Folder not found: {folder_path}")  # If folder is missing
            continue

        for root, _dirs, files in os.walk(folder_path):
            print(f"Scanning folder: {root}")  # Debugging line
            for file_name in files:
                file_path = os.path.join(root, file_name)
                print(f"Found file: {file_path}")  # Debugging line

                # Keep only supported file types (suffix match is
                # case-insensitive).
                if file_path.lower().endswith(supported_suffixes):
                    files_to_process.append(file_path)

    print(f"Total files to process: {len(files_to_process)}")

    data = []
    # tqdm shows a progress bar while the files are processed.
    for file_path in tqdm(files_to_process, desc="Processing Files", unit="file"):
        if file_path.lower().endswith(".pdf"):
            source_format = "pdf"
            text = extract_from_pdf(file_path)
        else:
            # Only supported suffixes were collected, so this is .xlsx/.xls.
            source_format = "xlsx"
            text = extract_from_excel(file_path)
        # Files that failed or produced no text are left out of the result.
        if text:
            data.append({"filename": file_path, "source_format": source_format, "content": text})

    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(data, columns=["filename", "source_format", "content"])

# Root folders that are scanned for PDF and Excel files.
folder_paths = [
    r"C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 - data",
    r"C:\Users\spide\OneDrive\Desktop\Bachlorz\Tenders",
]

# Run the extraction over every configured folder.
df_data = extract_data_from_folders(folder_paths)

if df_data.empty:
    # Nothing could be extracted from any file.
    print("No data extracted.")
else:
    # Report how many files produced text and preview the first rows.
    print(f"Extracted data from {len(df_data)} files.")
    print(df_data.head())

    # Write the extracted rows to a CSV file without the row index.
    output_file = "extracted_data.csv"
    df_data.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")
