# Imports

In [None]:
# IPython magic: clear the interactive namespace so the notebook starts from a
# clean state; -f skips the confirmation prompt.
%reset -f

In [None]:
import io

from pdfminer.high_level import extract_text
import fitz # from pymupdf
import PIL.Image
import tabula

from transformers import pipeline

from tqdm import tqdm
import json

In [None]:
# Input PDFs to process.
FILENAME1, FILENAME2 = 'Data/file1.pdf', 'Data/file2.pdf'

# Output folders, one per input PDF.
RESULTS1, RESULTS2 = 'Results/file1', 'Results/file2'

# Reading contents

## Text extraction (extract all embedded text from pdf; pdfminer reads the text layer, it does not perform OCR)

In [None]:
# Extract the text of the first PDF with pdfminer's high-level helper.
text1 = extract_text(FILENAME1)
# Bare expression: shown as the cell output in the notebook.
text1

In [None]:
# Extract the text of the second PDF with pdfminer's high-level helper.
text2 = extract_text(FILENAME2)
# Bare expression: shown as the cell output in the notebook.
text2

## Extract tables

In [None]:
# Ask tabula to look for tables on every page of the first PDF (pages='all').
# The result is used below as a sequence of DataFrame-like tables.
dfs1 = tabula.read_pdf(FILENAME1, pages='all')
print(f'Extracted a total of {len(dfs1)} tables.')

In [None]:
# Ask tabula to look for tables on every page of the second PDF (pages='all').
# The result is used below as a sequence of DataFrame-like tables.
dfs2 = tabula.read_pdf(FILENAME2, pages='all')
print(f'Extracted a total of {len(dfs2)} tables.')

In [None]:
# Preview the first table extracted from the second PDF.
# NOTE(review): raises IndexError when no table was extracted from file2.
dfs2[0]

# Run dlite

In [None]:
# Build a Hugging Face text-generation pipeline for the dlite-v2-124m model.
# NOTE(review): trust_remote_code=True allows transformers to execute Python
# code shipped in the model repository; keep it only if that repository is
# trusted.
pipe = pipeline("text-generation", model="aisquared/dlite-v2-124m", trust_remote_code=True)
# Bare expression: shown as the cell output in the notebook.
pipe

In [None]:
# Smoke test: run the pipeline once on a short prompt and display the output.
pipe('Hello :)')

## On text from `file1`.

In [None]:
def split_string_into_batches(input_string, batch_length):
    """Split a string into consecutive chunks of at most ``batch_length`` characters.

    The chunks are plain character slices taken from left to right, so
    concatenating them yields the original string. Only the last chunk can be
    shorter than ``batch_length``.

    Args:
        input_string: The text to split. An empty string yields an empty list.
        batch_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list with the chunks in their original order.

    Raises:
        ValueError: If ``batch_length`` is zero or negative. A negative step
            would otherwise silently produce no chunks and drop all the text.
    """
    if batch_length <= 0:
        raise ValueError(
            f'batch_length must be a positive integer, got {batch_length!r}'
        )
    return [
        input_string[start:start + batch_length]
        for start in range(0, len(input_string), batch_length)
    ]


# Prompt size in characters: the text is sliced by character count, not by
# model tokens.
batch_length = 128
result = split_string_into_batches(text1, batch_length)

# Run the model on every chunk of file1, storing the prompt and the pipeline
# output under matching zero-padded keys (prompt_000 / response_000, ...).
results1 = {}
# tqdm wraps the list itself (not the enumerate object) so it can read the
# length and display a total and an ETA.
for i, batch in enumerate(tqdm(result)):
    results1[f'prompt_{i:03}'] = batch
    results1[f'response_{i:03}'] = pipe(batch)
# Bare expression: shown as the cell output in the notebook.
results1

## On text from `file2`.

In [None]:
# Split the text of file2 into prompts; batch_length is the value set in the
# file1 cell.
result = split_string_into_batches(text2, batch_length)

# Run the model on every chunk of file2, storing the prompt and the pipeline
# output under matching zero-padded keys (prompt_000 / response_000, ...).
results2_text = {}
# tqdm wraps the list itself (not the enumerate object) so it can read the
# length and display a total and an ETA.
for i, batch in enumerate(tqdm(result)):
    results2_text[f'prompt_{i:03}'] = batch
    results2_text[f'response_{i:03}'] = pipe(batch)
# Bare expression: shown as the cell output in the notebook.
results2_text

In [None]:
# Display the prompts and responses collected for file2.
results2_text

# Save to files

In [None]:
import os

# Create the output folders up front (the tables folder implies its parent),
# so a fresh checkout does not fail with a missing-directory error after the
# long generation step.
os.makedirs(f'{RESULTS1}/tables', exist_ok=True)

# Prompts and model responses for file1.
with open(f'{RESULTS1}/results.json', 'w') as f:
    json.dump(results1, f)

# Text extracted from file1. UTF-8 is set explicitly because PDF text often
# contains characters that the platform default encoding cannot represent.
with open(f'{RESULTS1}/ocr.txt', 'w', encoding='utf-8') as f:
    f.write(text1)

# One CSV per table extracted from file1, numbered in extraction order.
for idx, df in enumerate(dfs1):
    df.to_csv(f'{RESULTS1}/tables/table_{idx:02}.csv', index_label='Index')

In [None]:
import os

# Create the output folders up front (the tables folder implies its parent),
# so a fresh checkout does not fail with a missing-directory error after the
# long generation step.
os.makedirs(f'{RESULTS2}/tables', exist_ok=True)

# Prompts and model responses for file2.
with open(f'{RESULTS2}/results2_text.json', 'w') as f:
    json.dump(results2_text, f)

# Text extracted from file2 (text2, not text1, which belongs to file1).
# UTF-8 is set explicitly because PDF text often contains characters that the
# platform default encoding cannot represent.
with open(f'{RESULTS2}/ocr.txt', 'w', encoding='utf-8') as f:
    f.write(text2)

# One CSV per table extracted from file2, numbered in extraction order.
for idx, df in enumerate(dfs2):
    df.to_csv(f'{RESULTS2}/tables/table_{idx:02}.csv', index_label='Index')