In [7]:
import cv2
import numpy as np
import pytesseract
import csv
import os

# Folder holding the bar chart images, and the folder that receives one CSV per image.
# The output folder is nested inside the image folder, so creating it also creates
# the image folder if it does not exist yet.
image_folder = "data/output_folder/bar_charts/"
output_folder = "data/output_folder/bar_charts/csv/"
os.makedirs(output_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort them so every run processes
# the images (and prints its progress) in the same order.
# The extension match is case-insensitive.
img_files = sorted(
    f for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

def process_bar_chart(image_path, output_csv_path):
    """Estimate the bar values of one bar chart image and write them to a CSV file.

    Bars are taken to be the bounding boxes of external Canny-edge contours that
    are wider than 10 px and taller than 20 px, ordered left to right. Category
    names are OCR'd from the bottom 40 px of the image and the value range from
    the leftmost 70 px. The top edge of each bar is then mapped linearly onto
    that range, treating the image minus a 60 px bottom margin as the plot area.

    All pixel margins and thresholds are heuristics and may need tuning for a
    given chart layout.

    Args:
        image_path: Path of the chart image to read.
        output_csv_path: Path of the CSV file to write. It receives a
            ``Category, Bar Index, Value`` header and one row per detected bar.

    Returns:
        None. If the image cannot be loaded, or is too short to leave any plot
        area above the bottom margin, a message is printed and no CSV is written.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load {image_path}")
        return

    # Rows left for the plot once the bottom margin is removed. Guard against
    # tiny images, which would otherwise divide by zero (or by a negative
    # height) in the pixel-to-value mapping below.
    chart_height = img.shape[0] - 60
    if chart_height <= 0:
        print(f"Failed to process {image_path}: image is too small")
        return

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # (You may adapt pre-processing for your images below)
    edges = cv2.Canny(gray, 50, 150)
    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contours are second-to-last in both.
    contours = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    boxes = (cv2.boundingRect(cnt) for cnt in contours)
    bars = sorted(
        (box for box in boxes if box[2] > 10 and box[3] > 20),  # box = (x, y, w, h)
        key=lambda box: box[0],  # left-to-right
    )

    # Simple bottom/x OCR for categories (improve as needed for your charts)
    cat_crop = img[img.shape[0] - 40:, :]  # bottom strip for x-labels
    cat_text = pytesseract.image_to_string(cat_crop, config="--psm 7")
    categories = [line.strip() for line in cat_text.split('\n') if line.strip()]

    # Y-axis OCR (leftmost region). Keep every line that parses as a number and
    # drop the rest, so one noisy line does not discard the whole axis.
    y_crop = img[:, :70]  # tweak for your y-axis labels
    y_text = pytesseract.image_to_string(y_crop, config="--psm 6")
    axis_values = []
    for line in y_text.split('\n'):
        try:
            axis_values.append(float(line.strip()))
        except ValueError:
            continue  # blank line or non-numeric OCR output

    # Take the range from min()/max() rather than from label position: OCR reads
    # the axis top to bottom, so the first label is normally the largest value.
    if len(axis_values) >= 2 and min(axis_values) < max(axis_values):
        min_val, max_val = min(axis_values), max(axis_values)
    else:
        print(f"Could not read the y-axis range of {image_path}; assuming 0 to 10")
        min_val, max_val = 0, 10

    # utf-8 so that non-ASCII characters in OCR'd category names cannot fail to
    # encode under a narrower locale default.
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Category", "Bar Index", "Value"])
        for i, (_x, top, _w, _h) in enumerate(bars):
            # The value is encoded by the bar's TOP edge; the bottom edge sits on
            # the shared baseline. Pixel rows grow downward, hence the "1 -".
            frac = 1 - (top / chart_height)
            value = min_val + frac * (max_val - min_val)
            # Cycle through the OCR'd names if there are fewer names than bars.
            cat = categories[i % len(categories)] if categories else f"Bar{i+1}"
            writer.writerow([cat, i, round(value, 2)])

# Run the extraction over every discovered image, writing one CSV per image
# into the output folder.
for fname in img_files:
    img_path = os.path.join(image_folder, fname)
    # Replace everything after the last dot (the image extension) with "csv".
    out_csv = os.path.join(output_folder, f"{fname.rsplit('.', 1)[0]}.csv")
    process_bar_chart(img_path, out_csv)
    # Printed for every file: process_bar_chart returns nothing, so a load
    # failure it reports is not distinguishable here.
    print(f"Processed {fname}")

print("Done extracting bar chart data for all images.")


Processed IMG-2.png
Processed IMG-6.png
Done extracting bar chart data for all images.
