In [2]:
import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image = cv2.imread('sample_image.png')

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5,-1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Resize the image
    height, width = sharp.shape
    sharp = cv2.resize(sharp, (width*2, height*2))
    
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(sharp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                   cv2.THRESH_BINARY, 11, 2)
    
    return thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Ensure DataFrame has columns and rename them if needed
# Assuming the first row of `rows` is the header
if not df_table.empty:
    header = df_table.iloc[0]
    df_table = df_table[1:]
    df_table.columns = header

# Save to Excel
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")


Data extraction complete. Saved to extracted_table.xlsx


In [3]:
import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image = cv2.imread('scrap1.jpg')

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5,-1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Resize the image
    height, width = sharp.shape
    sharp = cv2.resize(sharp, (width*2, height*2))
    
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(sharp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                   cv2.THRESH_BINARY, 11, 2)
    
    return thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Check if the DataFrame is not empty and has more than one row
if not df_table.empty and len(df_table) > 1:
    # Ensure DataFrame has columns and rename them if needed
    # Assuming the first row of `rows` is the header
    header = df_table.iloc[0]
    df_table = df_table[1:]
    df_table.columns = header
else:
    print("No valid data found to form a table.")

# Save to Excel
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")


Data extraction complete. Saved to extracted_table.xlsx


In [7]:
import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image_path = 'sample_image.png'
image = cv2.imread(image_path)

# Check if the image was successfully loaded
if image is None:
    print(f"Error: Unable to load image at path {image_path}")
    exit(1)

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Denoise the image
    denoised = cv2.fastNlMeansDenoising(sharp, h=30)
    
    # Resize the image
    height, width = denoised.shape
    resized = cv2.resize(denoised, (width * 3, height * 3))
    
    # Apply both adaptive and global thresholding
    adaptive_thresh = cv2.adaptiveThreshold(resized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                            cv2.THRESH_BINARY, 11, 2)
    _, global_thresh = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Combine both thresholding results
    combined_thresh = cv2.bitwise_or(adaptive_thresh, global_thresh)
    
    return combined_thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Ensure DataFrame has columns and rename them if needed
# Assuming the first row of `rows` is the header
if not df_table.empty:
    header = df_table.iloc[0]
    df_table = df_table[1:]
    df_table.columns = header

# Save to Excel
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")


Data extraction complete. Saved to extracted_table.xlsx


In [4]:
######working

import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image_path = '3.jpeg'
image = cv2.imread(image_path)

# Check if the image was successfully loaded
if image is None:
    print(f"Error: Unable to load image at path {image_path}")
    exit(1)

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Denoise the image
    denoised = cv2.fastNlMeansDenoising(sharp, h=30)
    
    # Resize the image
    height, width = denoised.shape
    resized = cv2.resize(denoised, (width * 3, height * 3))
    
    # Apply both adaptive and global thresholding
    adaptive_thresh = cv2.adaptiveThreshold(resized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                            cv2.THRESH_BINARY, 11, 2)
    _, global_thresh = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Combine both thresholding results
    combined_thresh = cv2.bitwise_or(adaptive_thresh, global_thresh)
    
    return combined_thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Ensure DataFrame has columns and rename them if needed
# Assuming the first row of `rows` is the header
if not df_table.empty:
    header = df_table.iloc[0]
    df_table = df_table[1:]
    df_table.columns = header

# Save to Excel
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")


Data extraction complete. Saved to extracted_table.xlsx


In [7]:
## working when O separated

import pandas as pd
import re

# Load the previously extracted data from Excel
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Function to clean and split lines where "O " is present
def clean_and_split(line):
    if isinstance(line, str) and line.startswith("O "):
        # Replace multiple spaces with a single space
        line = re.sub(r'\s+', ' ', line)
        # Split the line by spaces
        split_line = line.split(' ')
        # Join the elements with semicolons
        return ';'.join(split_line)
    return line

# Apply the cleaning function to each row in the DataFrame
df_table_cleaned = df_table[0].apply(clean_and_split)

# Split the semicolon-separated values into separate columns
df_table_split = df_table_cleaned.str.split(';', expand=True)

# Save the cleaned and split data to a new Excel file
df_table_split.to_excel("cleaned_extracted_table.xlsx", index=False)

print("Data cleaning and splitting complete. Saved to cleaned_extracted_table.xlsx")


Data cleaning and splitting complete. Saved to cleaned_extracted_table.xlsx


In [8]:
###working 

import pandas as pd
import re

# Load the previously extracted data from Excel
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Function to clean and split lines where "O " is present
def clean_and_split(line):
    if isinstance(line, str) and line.startswith("O "):
        # Replace multiple spaces with a single space
        line = re.sub(r'\s+', ' ', line)
        # Split the line by spaces
        split_line = line.split(' ')
        # Join the elements with semicolons
        return ';'.join(split_line)
    return line

# Function to merge lines if a line might be a continuation of the previous line
def merge_lines(data):
    merged_data = []
    for i, line in enumerate(data):
        if i > 0:
            # Check if the current line is a continuation of the previous line
            if isinstance(line, str) and line.islower():
                # Merge with the previous line
                merged_data[-1] += " " + line.strip()
            else:
                merged_data.append(line)
        else:
            merged_data.append(line)
    return merged_data

# Apply the merge_lines function to combine split lines
merged_table = merge_lines(df_table[0].tolist())

# Apply the cleaning function to each row in the merged data
df_table_cleaned = pd.Series(merged_table).apply(clean_and_split)

# Split the semicolon-separated values into separate columns
df_table_split = df_table_cleaned.str.split(';', expand=True)

# Save the cleaned and split data to a new Excel file
df_table_split.to_excel("cleaned_extracted_table.xlsx", index=False)

print("Data cleaning and splitting complete. Saved to cleaned_extracted_table.xlsx")


Data cleaning and splitting complete. Saved to cleaned_extracted_table.xlsx


In [10]:
import pandas as pd
import re

# Load the previously extracted data from Excel
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Function to clean and split lines where "O " is present
def clean_and_split(line):
    if isinstance(line, str) and line.startswith("O "):
        # Replace multiple spaces with a single space
        line = re.sub(r'\s+', ' ', line)
        # Split the line by spaces
        split_line = line.split(' ')
        # Join the elements with semicolons
        return ';'.join(split_line)
    return line

# Function to merge lines that are likely part of the same header or value
def merge_lines(data):
    merged_data = []
    for i, line in enumerate(data):
        if i > 0:
            # Check if the current line is likely a continuation of the previous line
            if isinstance(line, str) and (line.islower() or line.strip() == ''):
                # Merge with the previous line
                merged_data[-1] += " " + line.strip()
            elif isinstance(data[i-1], str) and data[i-1].islower():
                # Merge with the previous line if the previous line was lowercased
                merged_data[-1] += " " + line.strip()
            else:
                merged_data.append(line)
        else:
            merged_data.append(line)
    return merged_data

# Skip the first two rows and extract the remaining data
df_table_remaining = df_table.iloc[2:]

# Apply the merge_lines function to combine split lines in the remaining data
merged_table = merge_lines(df_table_remaining[0].tolist())

# Apply the cleaning function to each row in the merged data
df_table_cleaned = pd.Series(merged_table).apply(clean_and_split)

# Split the semicolon-separated values into separate columns
df_table_split = df_table_cleaned.str.split(';', expand=True)

# Save the cleaned and split data to a new Excel file
df_table_split.to_excel("cleaned_extracted_table_remaining.xlsx", index=False)

print("Data extraction complete, excluding the first two rows. Saved to cleaned_extracted_table_remaining.xlsx")


Data extraction complete, excluding the first two rows. Saved to cleaned_extracted_table_remaining.xlsx


In [11]:
import pandas as pd
import re

# Load the previously extracted data from Excel
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Skip the first two rows and extract the remaining data
df_table_remaining = df_table.iloc[2:]

# Convert the DataFrame to a single string to process all rows together
all_text = ' '.join(df_table_remaining[0].dropna().astype(str).tolist())

# Split the text by " O " followed by a space, but keep the delimiter in the result
rows = re.split(r'(O )', all_text)
rows = [''.join(pair) for pair in zip(rows[1::2], rows[2::2])]

# Process each row to split by spaces and clean
def process_row(row):
    row = re.sub(r'\s+', ' ', row).strip()  # Replace multiple spaces with one
    return row.split(' ')

# Apply the processing to each row
processed_data = [process_row(row) for row in rows]

# Convert the processed data into a DataFrame
df_cleaned = pd.DataFrame(processed_data)

# Save the cleaned and split data to a new Excel file
df_cleaned.to_excel("cleaned_extracted_table_remaining_corrected.xlsx", index=False)

print("Data extraction and splitting complete. Saved to cleaned_extracted_table_remaining_corrected.xlsx")


Data extraction and splitting complete. Saved to cleaned_extracted_table_remaining_corrected.xlsx


In [13]:
import pandas as pd
import re

# Load the previously extracted data from Excel
df_table = pd.read_excel("extracted_table.xlsx", header=None)

# Skip the first two rows and extract the remaining data
df_table_remaining = df_table.iloc[2:]

# Convert the DataFrame to a single string to process all rows together
all_text = ' '.join(df_table_remaining[0].dropna().astype(str).tolist())

# Split the text by "O " followed by a space, but keep the delimiter in the result
rows = re.split(r'(O )', all_text)
rows = [''.join(pair) for pair in zip(rows[1::2], rows[2::2])]

# Function to process each row, handling sentences as single units
def process_row(row):
    # The pattern below captures groups of text split by spaces, except when the text is inside quotes.
    pattern = re.compile(r'".+?"|[^"\s]+')
    matches = pattern.findall(row)

    # Strip quotes from the matched sentences
    cleaned_matches = [match.strip('"') for match in matches]
    
    return cleaned_matches

# Apply the processing to each row
processed_data = [process_row(row) for row in rows]

# Convert the processed data into a DataFrame
df_cleaned = pd.DataFrame(processed_data)

# Save the cleaned and split data to a new Excel file
df_cleaned.to_excel("cleaned_extracted_table_remaining_corrected.xlsx", index=False, header=False)

print("Data extraction and splitting complete. Saved to cleaned_extracted_table_remaining_corrected.xlsx")


Data extraction and splitting complete. Saved to cleaned_extracted_table_remaining_corrected.xlsx


In [2]:
import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np


# Load the image
image_path = 'sample_image.png'
image = cv2.imread(image_path)

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Resize the image
    height, width = sharp.shape
    sharp = cv2.resize(sharp, (width*2, height*2))
    
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(sharp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                   cv2.THRESH_BINARY, 11, 2)
    
    return thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Manually set the header and split rows into columns
if not df_table.empty:
    df_table.columns = ['row']
    df_table = df_table['row'].str.split(expand=True)
    
    # Setting the headers manually based on expected columns
    headers = ['name', 'age', 'position', 'remote worker']
    df_table.columns = headers[:df_table.shape[1]]

# Save to Excel
df_table.to_excel("extracted_table.xlsx", index=False)

print("Data extraction complete. Saved to extracted_table.xlsx")


ValueError: Length mismatch: Expected axis has 7 elements, new values have 4 elements

In [11]:
import cv2
import pytesseract
from pytesseract import Output
import pandas as pd
import numpy as np

# Load the image
image_path = 'sample_image.png'
image = cv2.imread(image_path)

# Preprocess the image
def preprocess_image(img):
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Sharpen the image
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharp = cv2.filter2D(gray, -1, kernel)
    
    # Resize the image
    height, width = sharp.shape
    sharp = cv2.resize(sharp, (width*2, height*2))
    
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(sharp, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
                                   cv2.THRESH_BINARY, 11, 2)
    
    return thresh

# Preprocess the image
thresh = preprocess_image(image)

# Use Tesseract to perform OCR with custom configurations
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, output_type=Output.DICT, config=custom_config)

# Extract the OCR results into a DataFrame
df = pd.DataFrame(data)

# Filter out non-table data
table_data = df[df['text'].str.strip().astype(bool)]

# Initialize variables to store row data
rows = []
current_row = []
current_line_num = table_data['line_num'].min()

# Iterate through the table data
for i, row in table_data.iterrows():
    if row['line_num'] != current_line_num:
        if current_row:
            rows.append(' '.join(current_row))
        current_row = []
        current_line_num = row['line_num']
    
    # Concatenate text within the same line
    current_row.append(row['text'])

# Append the last row
if current_row:
    rows.append(' '.join(current_row))

# Create a DataFrame from the list of rows
df_table = pd.DataFrame(rows)

# Save to Excel without headers
df_table.to_excel("extracted_table_no_header.xlsx", index=False, header=False)

print("Data extraction complete. Saved to extracted_table_no_header.xlsx")


Data extraction complete. Saved to extracted_table_no_header.xlsx
