<a href="https://colab.research.google.com/github/Venomous000/Numeric-Feature-Based-OCR-Approach/blob/main/PF_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mounting Drive**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can open files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Importing Libraries**

In [None]:
import os
import json
import pandas as pd
import numpy as np

# **Fetching JSON**

In [None]:
# Load the OCR export into `data`.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on the platform default, so non-ASCII characters in the OCR words
# decode the same way on every machine.
with open('/content/drive/MyDrive/PF Assessment/filtered_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# **Feature Calculation**

In [None]:
def calculate_features(ocr_data, doc_size):
    """Compute bounding-box features for every OCR word of one document.

    Args:
        ocr_data: Mapping of word -> sequence of points, where each point is a
            mapping with numeric ``"x"`` and ``"y"`` entries.
        doc_size: Two-item sequence ``(doc_width, doc_height)`` used as the
            divisors for the relative dimensions.

    Returns:
        A list with one dict per word that has at least one point, holding
        the keys ``word``, ``width``, ``height``, ``slope``,
        ``relative_width`` and ``relative_height``. ``slope`` is the angle in
        radians (via ``arctan2``) of the segment from the first point to the
        second point, or ``0.0`` when only one point is available. Words with
        no points are skipped.

    NOTE(review): the sample output printed later in this notebook shows
    widths well below 1, which suggests the coordinates may already be
    normalized; if so, ``relative_*`` are scaled twice. Confirm against the
    data schema before relying on those columns.
    """
    doc_width, doc_height = doc_size
    features = []

    for word, coords in ocr_data.items():
        if not coords:
            # Nothing to measure: max()/min() on an empty list would raise
            # ValueError and abort the whole document.
            continue

        xs = [point["x"] for point in coords]
        ys = [point["y"] for point in coords]

        # Extent of the axis-aligned box around all points of the word.
        width = max(xs) - min(xs)
        height = max(ys) - min(ys)

        # Orientation of the first edge; always a plain float so the column
        # has one consistent type regardless of the branch taken.
        if len(coords) > 1:
            slope = float(np.arctan2(ys[1] - ys[0], xs[1] - xs[0]))
        else:
            slope = 0.0

        features.append({
            "word": word,
            "width": width,
            "height": height,
            "slope": slope,
            "relative_width": width / doc_width,
            "relative_height": height / doc_height,
        })

    return features


# **Extract and Organize Features**

In [None]:
# Flatten the per-document feature lists into one list of records, tagging
# every record with the id and type of the document it came from.
all_features = []

for doc_id, doc_data in data.items():
    doc_features = calculate_features(doc_data["ocr"], doc_data["size"])

    for entry in doc_features:
        entry.update(document_id=doc_id, document_type=doc_data["type"])

    all_features.extend(doc_features)

# **Create DataFrame and Add Labels**

In [None]:
# Turn the collected records into a table and attach a category label:
# purely numeric words get "unknown" (an example placeholder for template
# items), every other word gets "text".
df = pd.DataFrame(all_features)
df["category_label"] = [
    "unknown" if token.isnumeric() else "text"
    for token in df["word"]
]

# **Save Features to CSV**

In [None]:
# Persist the feature table (without the index column) in the working directory.
df.to_csv(path_or_buf="extracted_features.csv", index=False)
print("CSV file 'extracted_features.csv' created successfully.")

CSV file 'extracted_features.csv' created successfully.


# **Load and Evaluate CSV Data**

In [None]:
# Reload the extracted features from the same relative path the export cell
# wrote to, so the round trip does not depend on the working directory
# being /content.
csv_file_path = 'extracted_features.csv'

# Only treat empty fields as missing. With the default NA list, OCR words
# such as "NA", "null" or "None" would silently be turned into NaN.
df = pd.read_csv(csv_file_path, keep_default_na=False, na_values=[""])
print(df.head(50))

              word     width    height     slope  relative_width  \
0            EESTI  0.041992  0.025719 -0.073638        0.000041   
1         VABARIIK  0.141602  0.039334  0.021365        0.000138   
2         Republic  0.098633  0.031770  0.000000        0.000096   
3               of  0.024414  0.031770  0.000000        0.000024   
4          Estonia  0.084961  0.031770  0.000000        0.000083   
5   ISIKUTUNNISTUS  0.248047  0.039334  0.018295        0.000242   
6         Identity  0.084961  0.036309 -0.017805        0.000083   
7             Card  0.052734  0.034796 -0.028681        0.000051   
8    PEREKONNANIME  0.117187  0.021180  0.012909        0.000114   
9                /  0.010742  0.036309  0.000000        0.000010   
10         SURNAME  0.070312  0.021180  0.021513        0.000069   
11          RAGNAR  0.121094  0.034796  0.012493        0.000118   
12         EESNIMI  0.057617  0.019667  0.000000        0.000056   
13           GIVEN  0.042969  0.019667  0.000000

# **Document dimensions as extracted from JSON data**

In [None]:
# Hard-coded maximum document width and height used by the validation cells
# (per the section heading, these were read off the JSON data).
MAX_DOCUMENT_WIDTH, MAX_DOCUMENT_HEIGHT = 1024, 661

print("Maximum document width: ", MAX_DOCUMENT_WIDTH)
print("Maximum document height: ", MAX_DOCUMENT_HEIGHT)

Maximum document width:  1024
Maximum document height:  661


# **1. Check for missing values**

In [None]:
# Per-column count of missing cells; all zeros means nothing is missing.
missing_values = df.isna().sum()
print("Missing values per column:\n", missing_values)

Missing values per column:
 word               0
width              0
height             0
slope              0
relative_width     0
relative_height    0
document_id        0
document_type      0
category_label     0
dtype: int64


# **2. Consistency checks for relative dimensions**

In [None]:
# Relative dimensions are proportions, so any value below 0 or above 1 is
# flagged. Missing values compare as False and are therefore not flagged.
rel_width = df['relative_width']
rel_height = df['relative_height']

invalid_relative_width = df.loc[rel_width.lt(0) | rel_width.gt(1)]
invalid_relative_height = df.loc[rel_height.lt(0) | rel_height.gt(1)]

print("Entries with invalid relative width: ", len(invalid_relative_width))
print("Entries with invalid relative height: ", len(invalid_relative_height))

Entries with invalid relative width:  0
Entries with invalid relative height:  0


# **3. Logical validation of width and height against maximum document dimensions**

In [None]:
# Flag rows whose width or height is larger than the maximum document size.
# NOTE(review): the sample of the table printed earlier shows widths and
# heights well below 1; if the coordinates are normalized this comparison
# against pixel maxima can never fire - confirm the units.
too_wide = df['width'].gt(MAX_DOCUMENT_WIDTH)
too_tall = df['height'].gt(MAX_DOCUMENT_HEIGHT)

invalid_width = df.loc[too_wide]
invalid_height = df.loc[too_tall]

print("\nEntries with width exceeding document width:", len(invalid_width))
print("Entries with height exceeding document height:", len(invalid_height))


Entries with width exceeding document width: 0
Entries with height exceeding document height: 0


# **4. Outlier Detection for Extreme Slopes**

In [None]:
# Rows whose slope magnitude is above 1.5 are treated as outliers; the
# cut-off is only an example value.
is_steep = df['slope'].abs().gt(1.5)
outliers_slope = df.loc[is_steep]
print("\nEntries with extreme slopes:", len(outliers_slope))


Entries with extreme slopes: 26


# **Final Summary of Results**

In [None]:
# Validation passes only when every check above found nothing. The slope
# outlier count is informational and does not affect the verdict.
problem_counts = (
    missing_values.sum(),
    len(invalid_relative_width),
    len(invalid_relative_height),
    len(invalid_width),
    len(invalid_height),
)

if any(problem_counts):
    print("\nData validation completed with some issues. Review the outputs above for details.")
else:
    print("\nData validation passed successfully!")


Data validation passed successfully!
