# Document Classification Pipeline

In [1]:
import psutil
import os
from pytictoc import TicToc
import pandas as pd
from documentProcessor import processAllDocuments, getLogger, fetchFiles

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Shared logger and wall-clock timer used by the cells below.
logger = getLogger()
t = TicToc()

# Preferred CPU cores for this kernel process.
core_constraint = [2, 3, 4, 5, 6, 7]

# Pinning is best-effort: psutil does not provide Process.cpu_affinity on
# every platform (AttributeError) and rejects core numbers the machine does
# not have (ValueError). Neither case should stop the pipeline from running,
# so the failure is remembered here and reported after the start-up banner.
try:
    psutil.Process(os.getpid()).cpu_affinity(core_constraint)
    affinity_error = None
except (AttributeError, ValueError) as exc:
    affinity_error = exc

logger.info("INITIALIZING DOCUMENT PROCESSING PIPELINE >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
if affinity_error is None:
    logger.debug(f"CPU affinity set to cores {core_constraint}")
else:
    logger.warning(f"CPU affinity not set to cores {core_constraint}: {affinity_error}")

2024-08-13 16:00:01,607 - INFO - INITIALIZING DOCUMENT PROCESSING PIPELINE >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


In [3]:
# Few-shot examples file; passed to processAllDocuments in the processing cell.
fsExamples = 'fewShot_old.csv'

# Name of the sub-folder under data/ to process.
folder = 'examples'
directory = f'data/{folder}/'

# fetchFiles is a project helper; the saved output suggests it lists the TIF
# files found in `directory`.
tif_files = fetchFiles(directory)

2024-08-13 16:00:01,634 - INFO - Found 27 TIF files in directory: data/examples/


In [4]:
# Time the full run with the shared TicToc timer.
t.tic()
logger.info(
    f'Starting document processing for {len(tif_files)} files in {directory}'
)

# NOTE(review): the saved output shows this call prompting for a manual
# category choice on some documents, so the cell may wait on user input.
# Later cells read `status`, `error_message`, `file_path`, `text_length` and
# `category` from the returned frame.
results_df = processAllDocuments(
    tif_files,
    fsExamples,
    batch_size=20,
    force_reload=False,
)

elapsed_time = t.tocvalue()
logger.info(
    f'Finished document processing for {len(tif_files)} files in {directory}. '
    f'Time taken: {elapsed_time:.2f} seconds.'
)

2024-08-13 16:00:01,657 - INFO - Starting document processing for 27 files in data/examples/
2024-08-13 16:00:03,374 - INFO - Few-shot examples loaded from cache
2024-08-13 16:00:49,418 - INFO - Starting to process 27 documents in 2 batches
2024-08-13 16:00:49,421 - INFO - Processing batch 1/2
2024-08-13 16:00:49,422 - INFO - Processing document data/examples/00C952C1.TIF
2024-08-13 16:00:59,684 - INFO - File 'data/examples/00C952C1.TIF': Processing complete (Status: success)
2024-08-13 16:00:59,686 - INFO - Processing document data/examples/00C97BD4.TIF
2024-08-13 16:01:05,236 - INFO - File 'data/examples/00C97BD4.TIF': Processing complete (Status: success)
2024-08-13 16:01:05,238 - INFO - Processing document data/examples/00C97C1A.TIF
2024-08-13 16:01:12,740 - INFO - File 'data/examples/00C97C1A.TIF': Processing complete (Status: success)
2024-08-13 16:01:12,742 - INFO - Processing document data/examples/00C99154.TIF
2024-08-13 16:01:43,490 - INFO - File 'data/examples/00C99154.TIF':


Available categories:
1. patient chart note
2. prescription request
3. provider certification
4. plan of care
5. return to work
6. lab results
7. discharge summary
8. progress note
9. answering service
10. prior authorization
11. spam
12. formal records request
13. Other (Write-in)

Reviewing document: data/examples/00C99154.TIF

Initial prediction: provider certification (Confidence: 0.55)
Invalid input. Please enter a valid category number or 'y' to accept the prediction
Invalid input. Please enter a valid category number or 'y' to accept the prediction
Invalid input. Please enter a valid category number or 'y' to accept the prediction

Reviewing document: data/examples/00C9ADF0.TIF

Initial prediction: patient chart note (Confidence: 0.66)

Reviewing document: data/examples/00C9B537.TIF

Initial prediction: lab results (Confidence: 0.68)

Reviewing document: data/examples/00C9C734.TIF

Initial prediction: return to work (Confidence: 0.67)

Reviewing document: data/examples/00CAB718

: 

: 

In [None]:
# Report on documents whose processing ended with status 'error':
#   1. how often each error message occurred,
#   2. which files failed because their extracted text was empty,
#   3. per-file details for every other error.
# Nothing is printed when no row has status 'error'.
EMPTY_TEXT_ERROR = 'ValueError: Extracted text is empty'

if 'error' in results_df['status'].values:
    print('\nError Analysis:')
    error_df = results_df[results_df['status'] == 'error']
    print(error_df['error_message'].value_counts())

    # Compute the comparison once and split the error rows with it.
    is_empty_text = error_df['error_message'] == EMPTY_TEXT_ERROR

    print('\nFiles with empty extracted text:')
    empty_text_files = error_df[is_empty_text]
    print(empty_text_files['file_path'].tolist())

    print('\nFiles with other errors:')
    other_error_files = error_df[~is_empty_text]
    for _, row in other_error_files.iterrows():
        print(f'File: {row["file_path"]}')
        print(f'Error: {row["error_message"]}')
        # Double quotes inside the single-quoted f-string: reusing the outer
        # quote character is a SyntaxError on Python versions before 3.12.
        print(f'Extracted text length: {row["text_length"]}')
        print()

In [None]:
# Preview the first 15 rows of the processing results.
results_df.head(n=15)

In [None]:
# Count how many rows fall under each value of the 'category' column.
results_df.value_counts(subset='category')