In [None]:
import re
import pandas as pd

# Read the previously extracted table; header=None keeps every sheet row as data
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Drop the two leading rows and keep everything below them
df_table_remaining = df_table.iloc[2:, :]

# Function to process each row, handling the required merging logic
def process_row(row):
    """Merge the cells that follow the last '$' cell into a single cell.

    The row is split into three parts:
      * head   - everything up to and including the last string cell that
                 contains a dollar sign,
      * middle - the cells between the head and the final two cells; they
                 are joined with "_" into one cell,
      * tail   - the final two cells, kept as separate columns.

    Args:
        row: One table row as any sequence of cells (list, tuple, or a
            NumPy array such as a row of ``DataFrame.values``).

    Returns:
        list: ``head + [merged_middle] + tail``. When no cell contains '$'
        the cells are returned unchanged (as a list).
    """
    # Work on a plain list: with a NumPy row, `list + ndarray` is an
    # element-wise addition rather than a concatenation.
    cells = list(row)

    # Index of the last string cell containing '$' (None when there is none)
    dollar_idx = next(
        (idx for idx in range(len(cells) - 1, -1, -1)
         if isinstance(cells[idx], str) and '$' in cells[idx]),
        None,
    )
    if dollar_idx is None:
        return cells

    # The tail is normally the last two cells, but it must never reach back
    # into the head, otherwise those cells would be emitted twice.
    tail_start = max(dollar_idx + 1, len(cells) - 2)

    # Join the middle cells. Missing values (None / NaN) are skipped and the
    # rest are stringified, because str.join only accepts strings.
    combined_cell = "_".join(
        str(cell)
        for cell in cells[dollar_idx + 1:tail_start]
        if not (cell is None or (isinstance(cell, float) and cell != cell))
    )

    return cells[:dollar_idx + 1] + [combined_cell] + cells[tail_start:]

# Run every remaining row through process_row
processed_data = list(map(process_row, df_table_remaining.values))

# Build a DataFrame from the processed rows
df_cleaned = pd.DataFrame(processed_data)

# Write the result to a new workbook, without index or header
df_cleaned.to_excel(
    "cleaned_extracted_table_remaining_corrected.xlsx",
    index=False,
    header=False,
)

print("Data extraction and splitting complete. Saved to cleaned_extracted_table_remaining_corrected.xlsx")


In [None]:
######working

import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image_path = '3.jpeg'
image = cv2.imread(image_path)

# Check if the image was successfully loaded (a failed read leaves `image`
# as None). Raise instead of calling exit(): `exit` is an interactive-shell
# helper and is not a dependable way to stop a notebook cell, so the steps
# below could otherwise run on a None image.
if image is None:
    raise OSError(f"Error: Unable to load image at path {image_path}")

# Preprocess the image
def preprocess_image(img, scale=3, denoise_strength=30, block_size=11, offset=2):
    """Prepare a BGR image for OCR and return an upscaled, thresholded copy.

    Steps, in order: grayscale conversion, sharpening with a 3x3 kernel,
    non-local-means denoising, upscaling, then adaptive (Gaussian) and Otsu
    thresholding whose results are OR-ed together.

    The keyword parameters default to the values that were previously
    hard-coded, so ``preprocess_image(img)`` behaves as before.

    Args:
        img: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).
        scale: Integer factor applied to both width and height when resizing.
        denoise_strength: Value passed as ``h`` to
            ``cv2.fastNlMeansDenoising``.
        block_size: Neighbourhood size passed to ``cv2.adaptiveThreshold``.
        offset: Constant ``C`` passed to ``cv2.adaptiveThreshold``.

    Returns:
        Single-channel image: the bitwise OR of the adaptive and Otsu
        threshold results.
    """
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Sharpen: centre weight 5 against -1 for the four direct neighbours
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)

    # Denoise the sharpened image
    denoised = cv2.fastNlMeansDenoising(sharp, h=denoise_strength)

    # Upscale both dimensions by the same factor
    height, width = denoised.shape
    resized = cv2.resize(denoised, (width * scale, height * scale))

    # Apply both adaptive and global (Otsu) thresholding
    adaptive_thresh = cv2.adaptiveThreshold(
        resized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, block_size, offset,
    )
    _, global_thresh = cv2.threshold(
        resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # A pixel is white in the result if it is white in either threshold image
    return cv2.bitwise_or(adaptive_thresh, global_thresh)

# Custom Tesseract configuration used for the OCR pass
custom_config = r'--oem 3 --psm 6'

# Run the loaded image through the preprocessing pipeline
thresh = preprocess_image(image)

# Perform OCR on the preprocessed image and collect the result as a dict
data = pytesseract.image_to_data(
    thresh,
    config=custom_config,
    output_type=Output.DICT,
)

# Turn the OCR result dict into a DataFrame
df = pd.DataFrame.from_dict(data)

# Filter out non-table data: keep only entries whose text is not
# empty/whitespace
table_data = df[df['text'].str.strip().astype(bool)]

# Concatenate the text of each OCR line into one string per line.
# line_num on its own does not identify a line: Tesseract restarts it inside
# every paragraph/block/page, so two adjacent one-line paragraphs would both
# carry line_num == 1 and be merged into a single row. Grouping on the full
# (page, block, paragraph, line) key avoids that; sort=False keeps the lines
# in their original OCR order.
rows = (
    table_data
    .groupby(['page_num', 'block_num', 'par_num', 'line_num'], sort=False)['text']
    .agg(' '.join)
    .tolist()
)

# Wrap the collected line strings in a DataFrame
df_table = pd.DataFrame(rows)

# Promote the first row to column labels and keep the rest as data
# (assumes the first entry of `rows` is the header line)
if not df_table.empty:
    header, df_table = df_table.iloc[0], df_table.iloc[1:]
    df_table.columns = header

# Write the table to Excel without the index column
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")
