# Get multiple transcriptions
Assumes the image directory contains only image files.

In [None]:
import pandas as pd
from pathlib import Path 
from PIL import Image
import pytesseract


# Transcribe every file in `image_dir` with the Tesseract model `model_name`
# and save the results as a CSV file in the `output/` directory.

### CHANGE THESE ###
image_dir = "insert_image_dir"
model_name = "insert_model_name"
### ------------ ###

image_dir = Path(image_dir)
transcriptions = {"model_name": [], "image": [], "transcription": []}

# iterdir() yields entries in arbitrary order, so sort them to get a
# reproducible row order; skip sub-directories, which cannot be opened as images.
img_files = sorted(p for p in image_dir.iterdir() if p.is_file())

for img in img_files:
    # The context manager closes the image file as soon as OCR is done,
    # instead of keeping one open handle per image until garbage collection.
    with Image.open(img) as image:
        transcription = pytesseract.image_to_string(image, lang=model_name)
    transcriptions["image"].append(img.name)
    transcriptions["model_name"].append(model_name)
    transcriptions["transcription"].append(transcription)

df = pd.DataFrame(transcriptions)

# Create the output directory if needed so that to_csv does not fail on a
# fresh checkout.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{image_dir.name}_{model_name}_transcriptions.csv", index=False)
df

# Get multiple transcriptions and ground truth
Assumes the ground truth directory contains matching pairs of text and image files that share the same filename (apart from the file extension).

In [None]:
import pandas as pd
from pathlib import Path 
from PIL import Image
import pytesseract
from tqdm import tqdm

# Transcribe every image in `ground_truth_dir` with the Tesseract model
# `model_name`, pair each transcription with the ground truth text file of the
# same name, and save the results as a CSV file in the `output/` directory.

### CHANGE THESE ###
ground_truth_dir = "sample_data/"
image_file_ext = ".tif"
text_file_ext = ".txt"
model_name = "smx_50000"
### ------------ ###

ground_truth_dir = Path(ground_truth_dir)
transcriptions = {"model_name": [], "image": [], "transcription": [], "ground_truth": []}

img_files = sorted(e for e in ground_truth_dir.iterdir() if e.suffix == image_file_ext)

# Derive each ground truth path from its image path rather than zipping two
# independently sorted listings: with positional pairing, a single missing or
# extra file would silently pair every following image with the wrong text.
txt_files = [img_file.with_suffix(text_file_ext) for img_file in img_files]

# Fail before the (slow) OCR loop starts if any ground truth file is missing.
missing = [txt_file.name for txt_file in txt_files if not txt_file.is_file()]
if missing:
    raise FileNotFoundError(f"Missing ground truth files: {', '.join(missing)}")

for img_file, txt_file in tqdm(zip(img_files, txt_files), total=len(img_files)):
    # The context manager closes the image file as soon as OCR is done,
    # instead of keeping one open handle per image until garbage collection.
    with Image.open(img_file) as image:
        transcription = pytesseract.image_to_string(image, lang=model_name)
    transcriptions["image"].append(img_file.name)
    transcriptions["model_name"].append(model_name)
    transcriptions["transcription"].append(transcription)
    # Read with an explicit encoding so the result does not depend on the
    # platform default. NOTE(review): assumes the ground truth is UTF-8 - confirm.
    transcriptions["ground_truth"].append(txt_file.read_text(encoding="utf-8"))


df = pd.DataFrame(transcriptions)

# Create the output directory if needed so that to_csv does not fail on a
# fresh checkout.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{ground_truth_dir.name}_{model_name}_transcriptions_gt.csv", index=False)
df