# OCR Matching

In this notebook we are going to build a tool that taken as input a medical prescription in pdf,jpg format or a string gives me as output the list of facilities that perform medical analysis.

To extract information from prescriptions we will use OCR technology while to match the words in the prescription with those in the dataset we will use the fuzzywuzzy library.

OCR pdf: https://pypi.org/project/PyPDF2/

OCR jpg: https://www.jaided.ai/easyocr/tutorial/

FuzzyWuzzy: https://pypi.org/project/fuzzywuzzy/

In [None]:
pip install  pypdf langchain tika typing fuzzywuzzy pymupdf easyocr

In [None]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from tika import parser
from typing import List
import re
from typing import List, Tuple
from fuzzywuzzy import fuzz, process
import fitz
import easyocr
import numpy as np
import gradio as gr


In [None]:
dataset = pd.read_csv('https://github.com/SimoneFarallo/public_and_social_services/raw/main/data/final_data_cleaned.csv')
dataset = dataset.drop(dataset.columns[0], axis=1)
dataset.head()

## OCR Matching for pdf

I create a function that taken as input a pdf, gives me as output a list

In [None]:
# Function to match  prescriptions with a dataset 
def match_recipe_to_dataset_pdf(pdf_file: str, dataset: pd.DataFrame, threshold: int = 90) -> List[Tuple]:

    # Check if pdf_file is a string, if not raise TypeError
    if not isinstance(pdf_file, str):
        raise TypeError("The PDF file path must be a string.")
    
    # Check if dataset is a pandas dataframe, if not raise TypeError
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("The dataset must be a pandas.DataFrame object.")

    # Extract text from pdf file using fitz library
    pdf_file = fitz.open(pdf_file)
    text = ''
    for page in pdf_file:
        text += page.get_text()

    # Find words of interest using regex 
    words = re.findall(r' - ([A-Za-z\s]+)', text) # Initial regex function to extract words from recipe (starting with ' - ([A-Z]+))'

    # Match words with words in dataset 'Codice prestazione ambulatoriale' column using fuzzywuzzy
    matches = []
    for word in words:
        if not isinstance(word, str):
            raise TypeError("The word of interest must be a string.") # Function to raise error for non-string input
        scores = process.extract(word, dataset['Codice prestazione ambulatoriale'], scorer=fuzz.token_set_ratio, limit=1) # Use fuzzywuzzy library to score the similarity between two strings 
        filtered_scores = [score for score in scores if score[1] >= threshold] # Filter scores based on the threshold value
        for score in filtered_scores:
            rows = dataset[dataset['Codice prestazione ambulatoriale'] == score[0]].iterrows() # Select the output rows based on the matching score
            matches.extend([(word, row[1], score[1]) for row in rows]) # Add the matched words, output rows, and score to the list 'matches'

    # Sort the results based on score
    matches_sorted = sorted(matches, key=lambda x: x[2], reverse=True)
    
    # Return a list of tuples containing matched word, output row, and score
    return matches_sorted

In [None]:
cd C:\Users\Simone\Documents\Desktop\public_and_social_services\ricette


In [None]:
match_recipe_to_dataset_pdf('ricetta_1.pdf',dataset)

## OCR Matching for png

I create a function that taken as input a png, gives me as output a list

In [None]:
reader = easyocr.Reader(['it'])

In [None]:
# function to match  prescriptions with a dataset from png
def match_recipe_to_dataset_jpg(file: str, dataset: pd.DataFrame, threshold: int = 90) -> List[Tuple]:
    # Check the input arguments types
    if not isinstance(file, str):
        raise TypeError("The PDF file path must be a string.")
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("The dataset must be a pandas.DataFrame object.")
    
    # Set-up the reader object and extract the text from the PDF file
    recipe = reader.readtext(file)
    result_string = ""
    for i in range(len(recipe)):
        element = recipe[i][1] # strings are in this position
        result_string += element
        result_string += " "
   
    # Clean the text by removing the dots
    result_string = result_string.replace(".","") 
    # Set-up the regex for matching codes and descriptions
    regex = r"\((\d\w*)\)([A-Z ]+)"
    matches = re.findall(regex,result_string)
    output = []
    # Extract the descriptions from the matches
    for match in matches:
      output.append(match[1])

    # Save the relevant words to match them with the 'Codice prestazione ambulatoriale' column
    words = output 

    # Match the words with the dataset using fuzzywuzzy
    matches = []
    for word in words:
        # Check the input argument type
        if not isinstance(word, str):
            raise TypeError("The word of interest must be a string.")
        # Match the word with the 'Codice prestazione ambulatoriale' column using fuzzywuzzy
        scores = process.extract(word, dataset['Codice prestazione ambulatoriale'], scorer=fuzz.token_set_ratio, limit=1) # Here we perform fuzzy matching; the 'limit' parameter limits the number of matching results
        # Filter out the results below the threshold
        filtered_scores = [score for score in scores if score[1] >= threshold]
        for score in filtered_scores:
            # Identify the rows based on the matching code
            rows = dataset[dataset['Codice prestazione ambulatoriale'] == score[0]].iterrows()
            # Append the matches to the output list
            matches.extend([(word, row[1], score[1]) for row in rows])

    # Sort the matches in descending order based on the score
    matches_sorted = sorted(matches, key=lambda x: x[2], reverse=True)

    return matches_sorted

In [None]:
match_recipe_to_dataset_jpg('ricetta_2.png', dataset)


## Matching tool

Taking the functions created earlier as a reference, we create a function that takes both pdf and png as input, also modify the output to be a dataframe so that it can be filtered later

In [None]:
# function to match  prescriptions with a dataset and create a new dataframe
def match_recipe_to_dataset_and_create_df(file: str, dataset: pd.DataFrame) -> pd.DataFrame:
    # raise error if file path is not a string
    if not isinstance(file, str):
        raise TypeError("File path must be a string.")
    # raise error if dataset is not a pandas DataFrame
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("Dataset must be a pandas.DataFrame.")

    # if file is a pdf, extract text from file
    if file.endswith(".pdf"):
        pdf_file = fitz.open(file) # open pdf file
        text = ''
        for page in pdf_file:
            text += page.get_text() # extract text from each page

        # identify words of interest using regular expression
        words = re.findall(r' - ([A-Za-z\s]+)', text)

    # if file is a png, read text from image using OCR
    elif file.endswith(".png"):
        ricetta = reader.readtext(file) # use tesseract to read text
        result_string = ""
        for i in range(len(ricetta)):
            elemento = ricetta[i][1] #string values are stored in this position in the tuple
            result_string += elemento # concatenate all strings
            result_string += " " # add space between each string
        
        result_string = result_string.replace(".","") # remove dots
        regex = r"\((\d\w*)\)([A-Z ]+)"
        matches = re.findall(regex, result_string) # identify string patterns using regex
        output = []
        for match in matches:
            output.append(match[1]) # append string to output list

        words = output

    # raise error if file format is not supported
    else:
        raise ValueError("File format not supported. Please use a pdf or png file.")

    # match words with values in Codice prestazione ambulatoriale column of dataset using fuzzywuzzy library
    matches = []
    for word in words:
        if not isinstance(word, str):
            raise TypeError("Word of interest must be a string.")
        scores = process.extract(word, dataset['Codice prestazione ambulatoriale'], scorer=fuzz.token_set_ratio, limit=1)
        filtered_scores = [score for score in scores if score[1] >= 90]
        for score in filtered_scores:
            rows = dataset[dataset['Codice prestazione ambulatoriale'] == score[0]].iterrows()
            matches.extend([(word,row[1], score[1]) for row in rows])

    # sort matches by score in descending order
    matches_sorted = sorted(matches, key=lambda x: x[2], reverse=True)
    
    # create empty dictionary to populate with data
    data = {}

    # iterate through list of tuples
    for row in matches_sorted:
        # extract category name and associated row data
        category, row_data, _ = row
        # iterate through columns in row data
        for col, val in row_data.items():
            # create key in dictionary if it doesn't exist already
            if col not in data:
                data[col] = []
            # append value to list of values associated with key
            data[col].append(val)
        # if a column is missing from current row, add a None value
        for col in data.keys():
            if col not in row_data:
                data[col].append(None)

    # create new DataFrame from dictionary
    df = pd.DataFrame(data)

    return df

In [None]:
match_recipe_to_dataset_and_create_df('ricetta_3.png', dataset)

In [None]:
output_ricetta = match_recipe_to_dataset_and_create_df('ricetta_3.pdf', dataset)
output_ricetta.head()

In [None]:
#Function to filter the dataset
#Is possible add more columns to filter, for now we use only 2
def filter_data(data_frame, comune=None, risposta_strutt= None):
    if comune is not None and risposta_strutt is not None:
        filtered_df = data_frame[(data_frame['Comune struttura'] == comune) & (data_frame['Struttura privata'] == risposta_strutt)]
    elif comune is not None:
        filtered_df = data_frame[data_frame['Comune struttura'] == comune]
    elif risposta_strutt is not None:
        filtered_df = data_frame[data_frame['Struttura privata'] == risposta_strutt]
    else:
        filtered_df = data_frame.copy()

    return filtered_df

In [None]:
filter_data(output_ricetta,'BERGAMO')

In [None]:
#Try to filter for Comune = Milano and Struttre private = No
filter_data(output_ricetta,'BERGAMO','Sì')

# Filter dataset with strings

This function does the same thing as the others, but as input it takes a string 

In [None]:
#Function to search structures directly with performance names
def search_words(words):
    matches = []
    data = {}

    for word in words:
        if not isinstance(word, str): #Check if the word is a string, otherwise raise a TypeError
            raise TypeError("The word of interest must be a string.")
        scores = process.extract(word, dataset['Codice prestazione ambulatoriale'], scorer=fuzz.token_set_ratio, limit=1) #Extract matching scores between the word and the dataset
        filtered_scores = [score for score in scores if score[1] >= 70] #Filter scores based on a threshold of 70%
        for score in filtered_scores:
            rows = dataset[dataset['Codice prestazione ambulatoriale'] == score[0]].iterrows() #Get the rows of the dataset that match the word
            matches.extend([(word,row[1], score[1]) for row in rows]) #Extend the matches list with a tuple of the word, the row data, and the matching score
            for row in matches:
                category, row_data, _ = row
                for col, val in row_data.items():
                    if col not in data:
                        data[col] =[]
                    data[col].append(val) #Add the values of therow to the respective column in the data dictionary
                for col in data.keys():
                    if col not in row_data:
                        data[col].append(None) #If a column is missing in the row, add None as a placeholder

    df = pd.DataFrame(data) #Create a pandas DataFrame from the data dictionary
    return df

In [None]:
output_query = search_words(['calcitonina','emocromo'])
output_query.head()

In [None]:
#Try to filter for Bergamo and private strcture
filter_data(output_query,'BERGAMO','Sì')

# Interface gradio

In [None]:
import gradio as gr

In [None]:
# Interfaccia per ocr tool
def match_recipe_to_dataset_interface(
    file: gr.inputs.File,
    comune: str = None,
    risposta_strutt: str = None
):
    # Ottieni il percorso del file
    file_path = file.name

    # Esegui la tua funzione originale con il percorso del file (stringa) e il DataFrame
    df = match_recipe_to_dataset_and_create_df(file_path, dataset)

    # Filtra il DataFrame se i parametri di filtraggio sono specificati
    if comune is not None or risposta_strutt is not None:
        df = filter_data(df, comune, risposta_strutt)

    # Restituisci il risultato come output dell'interfaccia
    return df

# Definisci gli input per l'interfaccia utente
file_input = gr.inputs.File(label="File")
comune_input = gr.inputs.Textbox(label="Comune")
risposta_strutt_input = gr.inputs.Textbox(label="Struttura privata")

# Definisci l'interfaccia utente utilizzando la tua funzione di Gradio e gli input definiti
interface = gr.Interface(
    fn=match_recipe_to_dataset_interface,
    inputs=[file_input, comune_input, risposta_strutt_input],
    outputs=["dataframe"],
    title="Matching Recipe to Dataset",
    description="Insert here your recipe and discover structure in Lombardy.",
    outputs_labels=["Matched Data", "Filtered Data"]
)

# Avvia l'interfaccia utente
interface.launch(share=True)


In [None]:
def search_words_interface(words):
    try:
        df = search_words(words)
        unique_values = df
        return unique_values
    except TypeError as e:
        return str(e)

# Creazione dell'interfaccia Gradio
iface = gr.Interface(fn=search_words_interface, inputs="text", outputs="dataframe")

# Avvia l'interfaccia Gradio
iface.launch()

In [22]:
demo = gr.Interface(fn=search_words, inputs="text", outputs="dataframe",debug=True)
demo.launch()

  demo = gr.Interface(fn=search_words, inputs="text", outputs="dataframe",debug=True)


Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.




']
