In [5]:
import json

import numpy as np
import plotly.express as px
import pandas as pd
from nltk.corpus import words
import re

# Ensure the words corpus is downloaded
import nltk
nltk.download('words')

# Create a set of English words for recognition.
# Extracted tokens are lower-cased before they are intersected with this set,
# so the vocabulary is lower-cased as well; otherwise any corpus entry that
# contains an upper-case letter could never be matched.
english_words = {entry.lower() for entry in words.words()}

# Known brand names, written the way they appear after token clean-up:
# lower-case and without punctuation.
brand_names = {
    "adidas",
    "amazon",
    "apple",
    "costa",
    "google",
    "mcdonalds",
    "nike",
    "paddy",
    "sainsburys",
    "starbucks",
    "tesco",
}

# Known UK city names, lower-case to match the cleaned-up tokens.
uk_city_names = {
    "birmingham",
    "brighton",
    "bristol",
    "cambridge",
    "cardiff",
    "coventry",
    "derby",
    "edinburgh",
    "glasgow",
    "leeds",
    "liverpool",
    "london",
    "manchester",
    "newcastle",
    "nottingham",
    "oxford",
    "plymouth",
    "sheffield",
    "southampton",
    "stoke",
}

def _template_entry(extraction_method, *identifiers):
    """Build one template entry of ``config``.

    Every call returns a fresh dict holding a fresh ``identifiers`` list, so
    no two templates share mutable state.
    """
    return {
        "identifiers": list(identifiers),
        "extraction_method": extraction_method,
    }


# Layout: document type -> template name -> entry.
# Each identifier names one sample document; together with the template name
# it forms the directory "<template>_<identifier>" that is read further down.
# "extraction_method" is stored per template but is not read by the code
# visible in this cell.
config = {
    "bank_statements": {
        "barclays_student": _template_entry("pdf_plumber", "march", "april", "may"),
        "barclays": _template_entry("pdf_plumber", "march", "april", "may"),
        "first_direct": _template_entry("pdf_plumber", "march", "april", "may"),
        "halifax": _template_entry("pytesseract", "march", "april", "may"),
        "lloyds": _template_entry("pytesseract", "september"),
        "monzo": _template_entry("pdf_plumber", "november", "3_months"),
    },
    "payslips": {
        "payslip": _template_entry("pdf_plumber", "jake"),
    },
}

# One row per sample document: six ratios followed by a label, in the order
# [pdf_plumber, pytesseract, pdf_plumber brand, pytesseract brand,
#  pdf_plumber city, pytesseract city, "<document type> <template> <identifier>"].
results = []

# Compiled once up front instead of being looked up for every token.
_NON_WORD_RE = re.compile(r'\W+')


def _extract_words(texts):
    """Return the distinct lower-cased alphabetic tokens found in *texts*.

    Each whitespace-separated token first has its non-word characters
    (punctuation, symbols) removed and is kept only if what remains is purely
    alphabetic. The filter must run after the clean-up: testing the raw token
    would discard every word that merely touches punctuation (e.g. "Tesco,"),
    which also makes the clean-up itself a no-op.

    Args:
        texts: An iterable of strings, or a single string.

    Returns:
        A set of lower-cased words.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts]
    found = set()
    for text in texts:
        for token in text.split():
            stripped = _NON_WORD_RE.sub('', token)
            if stripped.isalpha():
                found.add(stripped.lower())
    return found


def _safe_ratio(count, total):
    """Return ``count / total``, or 0 when *total* is zero."""
    return count / total if total else 0


# Loop through each document type and its templates and identifiers
for document_type, templates_and_identifiers in config.items():
    for template_name, identifiers in templates_and_identifiers.items():
        for identifier in identifiers["identifiers"]:
            document_dir = f"text_extraction/{document_type}/{template_name}/{template_name}_{identifier}"

            # Load the response.json file (explicit encoding so the result
            # does not depend on the platform's locale default)
            file_path = f"{document_dir}/response.json"
            with open(file_path, "r", encoding="utf-8") as file:
                data = json.load(file)

            # NOTE(review): assumes data["pytesseract"] and data["pdf_plumber"]
            # each hold a list of text strings (a single string is tolerated)
            # - confirm against whatever writes response.json.
            pytesseract_words = _extract_words(data["pytesseract"])
            pdf_plumber_words = _extract_words(data["pdf_plumber"])

            # Number of distinct extracted words that are recognized English words
            pdf_plumber_recognized_count = len(pdf_plumber_words.intersection(english_words))
            pytesseract_recognized_count = len(pytesseract_words.intersection(english_words))

            # Number of distinct extracted words that are known brand names
            pdf_plumber_brand_count = len(pdf_plumber_words.intersection(brand_names))
            pytesseract_brand_count = len(pytesseract_words.intersection(brand_names))

            # Number of distinct extracted words that are known UK city names
            pdf_plumber_city_count = len(pdf_plumber_words.intersection(uk_city_names))
            pytesseract_city_count = len(pytesseract_words.intersection(uk_city_names))

            words_data = {
                # Sorted so that words.json is identical from run to run
                # (set iteration order is not stable across interpreter runs).
                "pdf_plumber_words": sorted(pdf_plumber_words),
                "pytesseract_words": sorted(pytesseract_words),
                "pdf_plumber_recognized_count": pdf_plumber_recognized_count,
                "pytesseract_recognized_count": pytesseract_recognized_count,
                "pdf_plumber_brand_count": pdf_plumber_brand_count,
                "pytesseract_brand_count": pytesseract_brand_count,
                "pdf_plumber_city_count": pdf_plumber_city_count,
                "pytesseract_city_count": pytesseract_city_count,
            }

            # Persist the per-document word lists and counts next to the input
            words_file_path = f"{document_dir}/words.json"
            with open(words_file_path, "w", encoding="utf-8") as words_file:
                json.dump(words_data, words_file, indent=4)

            # All ratios are relative to the number of distinct words the
            # extractor produced; an extractor that produced none scores 0.
            pdf_plumber_total = len(pdf_plumber_words)
            pytesseract_total = len(pytesseract_words)

            pdf_plumber_ratio = _safe_ratio(pdf_plumber_recognized_count, pdf_plumber_total)
            pytesseract_ratio = _safe_ratio(pytesseract_recognized_count, pytesseract_total)

            pdf_plumber_brand_ratio = _safe_ratio(pdf_plumber_brand_count, pdf_plumber_total)
            pytesseract_brand_ratio = _safe_ratio(pytesseract_brand_count, pytesseract_total)

            pdf_plumber_city_ratio = _safe_ratio(pdf_plumber_city_count, pdf_plumber_total)
            pytesseract_city_ratio = _safe_ratio(pytesseract_city_count, pytesseract_total)

            results.append([
                pdf_plumber_ratio,
                pytesseract_ratio,
                pdf_plumber_brand_ratio,
                pytesseract_brand_ratio,
                pdf_plumber_city_ratio,
                pytesseract_city_ratio,
                f"{document_type} {template_name} {identifier}",
            ])

# The six ratio series, in the same order as the leading values of each row
# in `results`; the last value of a row is the document label.
ratio_columns = [
    'PDF Plumber',
    'Pytesseract',
    'PDF Plumber Brand Ratio',
    'Pytesseract Brand Ratio',
    'PDF Plumber City Ratio',
    'Pytesseract City Ratio',
]

# Tabulate the collected rows for Plotly
results_df = pd.DataFrame(results, columns=ratio_columns + ['Document Info'])

# Grouped bar chart: one group of bars per document, one bar per ratio series
fig = px.bar(
    results_df,
    x='Document Info',
    y=ratio_columns,
    title='Ratio of Recognized English Words, Brand Names, and UK City Names in PDF Plumber and Pytesseract',
    labels={'value': 'Ratio', 'variable': 'Extraction Method'},
    barmode='group',
)

fig.show()


[nltk_data] Downloading package words to
[nltk_data]     /Users/chrislittle/nltk_data...
[nltk_data]   Package words is already up-to-date!
