In [57]:
import pickle
import pytesseract
from layoutparser.ocr import TesseractAgent
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from IPython.display import display, HTML 
import os


##### Getting all the pickle files
* Getting all the pickle files as it contains text which we extracted from images in layout format
* We had to run the earlier commands in google collab as there is no support for dectron2 by facebook for windows .It mainly supports Linux and Mac

In [58]:
# Define the directory path
directory_path = 'pickle_files/'

# List all items in the directory
all_items = os.listdir(directory_path)

# Filter out directories (optional)
file_names = [f for f in all_items if os.path.isfile(os.path.join(directory_path, f)) and f.endswith('.pkl')]
file_names

['PMC3576793_00004.pkl',
 'PMC3654277_00006.pkl',
 'PMC3777717_00006.pkl',
 'PMC3863500_00003.pkl',
 'PMC3976938_00002.pkl',
 'PMC4027932_00001.pkl',
 'PMC4527132_00004.pkl',
 'PMC4760359_00006.pkl',
 'PMC4954804_00001.pkl']

##### Tesseract
* Here we perform Optical Character Recognition (OCR) on segmented images using Tesseract
* We have to do it for every pickle file
* Mainly we have got different co-ordinates for text according to Layouts which it follows to form paragraphs, sections, tables etc
* We are mainly concerned with text 

In [59]:
ocr_agent = TesseractAgent(languages='eng')

In [61]:
fileText_dict = {}
for file_name in file_names:
    listText = []
    layout =  pickle.load(open(f'{directory_path}{file_name}', 'rb'))
    # Extract text
    for block in layout:
        segment_image = (block
                        .pad(left=5, right=5, top=5, bottom=5)
                        .crop_image(image))
        listText.append(ocr_agent.detect(segment_image))
    fileText_dict[file_name]=listText

In [62]:
fileText_dict

{'PMC3576793_00004.pkl': ['3.8. Allergic Sensitization in Asthma. Fifty-nine patients (46\ndul, 18 cildsen) bad been previously daghosed wih\n‘uthma. The retaining 59 patients had not been diagnosed\nwith sethina Sensation to any allergen was deteed in\n58% of patents with asthma (34/59). Twenty-six (4%)\nof 59 paints wore sensitized to spring pollens (Table 3)\n‘Approtimatly all of the asthma patients (30%; 30/9)\nwere senlzed to perennial allergens. Seven percent of\npatients with asthmns (4/59) were sensitized only to spring.\n',
  'Total igi Eosinophe cer\nik) _propertion (3)\n\nSpring andl olen meio 52409\n\nSpringpllensandperenialalrgens 391467 S405,\nFallen nd prea allergens\n',
  '4. Discussion\n\nAllesglesesitcton, as diagnosed by the serum allergen\nfpecte Igh level does not always correspond with the\npallets symptoms, We found that approximately twice a\nany patients were senalzed ta both spring pollens and\npetenal allergens compared to patente senatied ony to\n{pring poll

In [64]:
import pickle
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from IPython.display import display, HTML 

# Initialize the model and tokenizer : Here we are using paraphrase-MiniLM-L6-v2 for embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
tokenizer = model.tokenizer

# This is to get the maximum number of tokens a given Transformer can take in
max_tokens = model.get_max_seq_length()

# Function to split text into chunks as if a given parahgraph / a piece of text exceeds the maximum number of tokens a Transformer can accept
def split_into_chunks(text, max_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokenizer.convert_tokens_to_string(tokens[i:i + max_length])
        chunks.append(chunk)
    return chunks

# Load data from the dictionary
encoded_texts = []
metadata = []

for imageText_filename, text_list in fileText_dict.items():  # Iterate over items in the dictionary
    document_id = imageText_filename[:-4]  # Extract document ID (remove '.pkl')

    for parahnum,text in enumerate(text_list):
        # Convert non-string text elements
        if not isinstance(text, str):
            if isinstance(text, list):
                text = ' '.join(text)
            else:
                text = str(text)

        # Split text into chunks with space for metadata
        chunks = split_into_chunks(text, max_tokens - 20)

        # Encode chunks with metadata
        for chunk_num, chunk in enumerate(chunks):
            chunk_with_metadata = f"{document_id}_parah{parahnum}_chunk{chunk_num}: {chunk}"
            chunk_embedding = model.encode(chunk_with_metadata)
            encoded_texts.append(chunk_embedding)
            metadata.append({
                "document_id": document_id,
                "chunk_number": chunk_num,
                "text_chunk": chunk,
                "parahnum":parahnum
            })

# Create FAISS index outside the loop for efficiency
embeddings_array = np.vstack(encoded_texts) 
embedding_dim = embeddings_array.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_array)

# Create DataFrame and display
df_metadata = pd.DataFrame(metadata)
display(HTML(df_metadata.to_html(index=False))) 


Token indices sequence length is longer than the specified maximum sequence length for this model (142 > 128). Running this sequence through the model will result in indexing errors


document_id,chunk_number,text_chunk,parahnum
PMC3576793_00004,0,"3. 8. allergic sensitization in asthma. fifty - nine patients ( 46 dul, 18 cildsen ) bad been previously daghosed wih ‘ uthma. the retaining 59 patients had not been diagnosed with sethina sensation to any allergen was deteed in 58 % of patents with asthma ( 34 / 59 ). twenty - six ( 4 % ) of 59 paints wore sensitized to spring pollens ( table 3 ) ‘ approtimatly all of the asthma patients ( 30 % ; 30 /",0
PMC3576793_00004,1,9 ) were senlzed to perennial allergens. seven percent of patients with asthmns ( 4 / 59 ) were sensitized only to spring.,0
PMC3576793_00004,0,"total igi eosinophe cer ik ) _ propertion ( 3 ) spring andl olen meio 52409 springpllensandperenialalrgens 391467 s405, fallen nd prea allergens",1
PMC3576793_00004,0,"4. discussion allesglesesitcton, as diagnosed by the serum allergen fpecte igh level does not always correspond with the pallets symptoms, we found that approximately twice a any patients were senalzed ta both spring pollens and petenal allergens compared to patente senatied ony to { pring pollens. however many patients were asymplomatie 1g perehnial allergens expose to perennial allergens such ‘ house dist mite and ct and dog",2
PMC3576793_00004,1,"danduf tan important predisposing rise factor for asthita ( 4, previous dlngosis [ tas wae largely elated to serum ig levels and blood cecinophil conte [ 5 - 7 ], even ln noeasthenatic. patients,",2
PMC3576793_00004,0,"‘ the average of total serum ig levee was highest in 87 - year olds and decreased with age figure 3a ) 234, blood call eosinophil count. the blood cll eosinophil unt was alto compared between groups. the ensngphil ‘ all proportion was 43 £ 04 % in pallens senstized only to spring poles, whe it was sigeanly higher ( 57 = 04 % ) it pleats senaltized to bath perenoal allergens and",3
PMC3576793_00004,1,"sping pollens ( p = 00145, mann - whitney u text ) ( igure 20 ), { able 2 ). the blood cell eosinophil count showed the same reductive tendency ( figure 30 ).",3
PMC3576793_00004,0,"basinopils counts ( oo7 ). even in nonast mane ppanents : frway responsiveness ( assesed using methacholine [ 8 is increased in some eases of allergic shunts ndieatng an ‘ ncreased risk for asthma [ 9 - h ] sensitization o cat dandr uf shust mite cockroach, and ragweed ian important predictor ‘ fara bypertesponsivenes [ 2 airway hypetesponiv ‘ ess atong",4
PMC3576793_00004,1,"##ly related to elevated ttl ser ig level ‘ en in anymplomalic pallens [ 5, 1 ]. in other words, total serum ig level te considered an indcator of probable airway iyperesponsiveness oe asthma in our study tot serum [ ge level and blood cell eosinophil counts wer sigacaty slevated in patients senstzed to both spring pollens and perennial allergens, as compared to pallet senalized only ‘ os",4
PMC3576793_00004,2,"##pring plles. therefore paint sensitized bth sping pollens and perennial allergens might be at gretter risk of eeveoping arwsybyperresponsivenersor asthma ‘ comnpzed toads, fewer children were sensitized only to spring pollens. most children ( approximately 80 % ) had",4
