In [2]:
import glob
import json
import os

import numpy as np
import pandas as pd
import pytesseract
import tqdm
from pdf2image import convert_from_path
from PIL import Image

The metadata is saved in two JSON files (one per corpus), each of which has the following structure:
```
{
    "893872": [1, 168, 13, 1, 12, 2, 3, 26, 1, 6], 
    "890952": [151],...
}
```
The keys are the document names and the values are the numbers of pages of each sub-document. We call these the **gold** labels.
Given the **gold** labels, we need to convert them to **one hot** vectors for further training, and to save each page of every sub-document to the appropriate folder (1.0 for first pages, 0.0 for non-first pages).

The function ```gold_to_onehot``` below converts the **gold** labels to **one hot** labels. The rule is as follows:

Say we have gold labels like ```[4,5,1]```, meaning that the document has 10 pages and contains 3 sub-documents that have 4, 5, and 1 page(s), respectively.
We convert them to one hot as follows:
```[4, 5, 1] -> [1,0,0,0,1,0,0,0,0,1]```

Specifically, the first sub-document has 4 pages, so its **one hot** encoding is ```1,0,0,0``` (1 for the first page, 0 for the others).
The same applies to the two other sub-documents.



In [5]:
def extract_doc_meta(json_file):
    """Read a JSON metadata file and return its decoded content.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` decodes from the file.
    """
    # The file is opened in binary mode; json.load decodes the bytes itself.
    with open(json_file, 'rb') as handle:
        return json.load(handle)

def gold_to_onehot(original_indexes):
    """Convert per-sub-document page counts into a per-page first-page indicator.

    Each page count ``n`` contributes a 1.0 followed by ``n - 1`` zeros, so
    ``[4, 5, 1]`` becomes ``[1, 0, 0, 0, 1, 0, 0, 0, 0, 1]``.

    Args:
        original_indexes: Iterable of page counts, one per sub-document.
            Every count must be at least 1.

    Returns:
        A list of NumPy floats (1.0 for the first page of a sub-document,
        0.0 for every other page). An empty input yields an empty list.

    Raises:
        ValueError: If a page count is smaller than 1.
    """
    segments = []
    for n_pages in original_indexes:
        if n_pages < 1:
            # A sub-document without pages has no first page to mark.
            raise ValueError(
                "every sub-document needs at least one page, got {!r}".format(n_pages)
            )
        segment = np.zeros(n_pages)
        segment[0] = 1.0
        segments.append(segment)

    # np.concatenate rejects an empty sequence, so handle that case explicitly.
    if not segments:
        return []
    # Join once at the end instead of re-copying a growing array per sub-document.
    return list(np.concatenate(segments))

With the **one hot** labels, we can extract every single page and save it to the appropriate folder (1.0 for first pages, 0.0 for the others).

In [4]:
def extract_images(df, max_pages=500, output_dir="images"):
    """Render every PDF listed in ``df`` to one JPEG per page, sorted by label.

    Each page is written to ``<output_dir>/<label>/<pdf name>_p_<page index>.jpg``,
    where ``<label>`` is ``str()`` of the page's one-hot value and the page
    index starts at 0.

    Args:
        df: DataFrame with a ``names`` column (PDF paths) and a ``one_hot``
            column (one label per page of that PDF).
        max_pages: Documents whose label list has this many entries or more
            are skipped. Defaults to 500, the limit originally hard-coded.
        output_dir: Root directory for the per-label folders.
    """
    files = list(df["names"])
    labels = list(df["one_hot"])
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for pdf_path, page_labels in tqdm.tqdm(zip(files, labels), total=len(files)):
        if len(page_labels) >= max_pages:
            continue
        # File name without directory and extension, e.g. "893872__concatenated".
        name = os.path.splitext(os.path.basename(pdf_path))[0]
        pages = convert_from_path(pdf_path)
        # NOTE: zip() stops at the shorter input, so a mismatch between rendered
        # pages and labels silently drops the surplus.
        for page_number, (page, label) in enumerate(zip(pages, page_labels)):
            label_dir = "{}/{}".format(output_dir, str(label))
            # Create the label folder on demand; saving into a missing folder fails.
            os.makedirs(label_dir, exist_ok=True)
            filename = "{}/{}_p_{}.jpg".format(label_dir, name, str(page_number))
            page.save(filename, 'JPEG')

We can extract the text from each image using pytesseract.

In [3]:
def extract_text(path):
    """Run pytesseract OCR on one image file and return the recognised text.

    Args:
        path: Path of the image file to read.

    Returns:
        The OCR output as a string with every hyphen-plus-newline sequence
        removed, which rejoins words that were hyphenated across a line break.
    """
    # Close the image file as soon as OCR is done; this function runs once per
    # image, so unclosed handles would pile up.
    with Image.open(path) as image:
        text = str(pytesseract.image_to_string(image))
    return text.replace('-\n', '')

The main program starts here:
1. Read the metadata JSON file and derive the one hot labels
2. Save the *name*, *pages* (original gold), and *one hot* to dataframes
3. Extract the page images
4. Extract the text and save it to a dataframe

In [6]:
# Corpus 1: one row per concatenated PDF with its gold page counts and one-hot labels.
corpus1_root = 'corpus1/TrainTestSet/Trainset/'
# Reuse the loader defined earlier instead of repeating the open/json.load pair.
meta = extract_doc_meta(corpus1_root + 'Doclengths_of_the_individual_docs_TRAIN.json')
names = [corpus1_root + 'data/' + name + "__concatenated.pdf" for name in meta.keys()]
# Materialise the dict view so the column is built from a plain list.
pages = list(meta.values())
df_cp1 = pd.DataFrame({"names": names, "pages": pages})
df_cp1["one_hot"] = df_cp1["pages"].apply(gold_to_onehot)

In [7]:
# Corpus 2: one row per concatenated PDF with its gold page counts and one-hot labels.
corpus2_root = 'corpus2/TrainTestSet/Trainset/'
# Reuse the loader defined earlier instead of repeating the open/json.load pair.
meta = extract_doc_meta(corpus2_root + 'Doclengths_of_the_individual_docs_TRAIN.json')
names = [corpus2_root + 'data/' + name + "__concatenated.pdf" for name in meta.keys()]
# Materialise the dict view so the column is built from a plain list.
pages = list(meta.values())
df_cp2 = pd.DataFrame({"names": names, "pages": pages})
df_cp2["one_hot"] = df_cp2["pages"].apply(gold_to_onehot)

In [None]:
# Extract images
# DataFrame.append was removed in pandas 2.0; pd.concat stacks both corpora instead.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([df_cp1, df_cp2], ignore_index=True)
extract_images(df)

In [None]:
# find list of images and its labels
# extract_images writes to "images/<label>/", so read from the same folders
# (the previous "images/images/<label>/" pattern did not match that layout).
# sorted() makes the row order reproducible; glob returns files in arbitrary order.
data_list_0 = sorted(glob.glob('images/0.0/*.jpg'))
data_list_1 = sorted(glob.glob('images/1.0/*.jpg'))
data_list = data_list_0 + data_list_1
# 0.0 for every non-first page, 1.0 for every first page, in the same order as data_list.
labels = list(np.concatenate((np.zeros(len(data_list_0)), np.ones(len(data_list_1)))))
df = pd.DataFrame({"paths": data_list, "labels": labels})
df.head()

In [None]:
# too many images, we need parallel processing
from pandarallel import pandarallel
from pqdm.processes import pqdm

# Hook tqdm into pandas.
tqdm.tqdm.pandas()

# Set up pandarallel; the OCR cells below call Series.parallel_apply.
pandarallel.initialize()

Please note that we save the extracted text for the "1.0" images and the "0.0" images in separate CSV files so that we can use them for training more easily.

In [None]:
# First-page images: one 1.0 label per path in data_list_1.
labels_1 = np.full(len(data_list_1), 1.0)
df_1 = pd.DataFrame({"paths": data_list_1, "labels": labels_1})

In [None]:
# extract text
# OCR each first-page image via pandarallel and store the result in a "text" column.
df_1["text"] = df_1["paths"].parallel_apply(extract_text)

In [None]:
# Non-first-page images carry label 0.0, matching the np.zeros used for
# data_list_0 when the combined label list is built earlier in this notebook.
labels_0 = np.zeros(len(data_list_0))
df_0 = pd.DataFrame({"paths": data_list_0, "labels": labels_0})

In [None]:
# OCR each non-first-page image via pandarallel and store the result in a "text" column.
df_0["text"] = df_0["paths"].parallel_apply(extract_text)

In [None]:
# save to csv files
# Create the output folder first so the OCR results are not lost to a
# missing-directory error at the very last step.
os.makedirs('data', exist_ok=True)
df_0.to_csv('data/0.0.csv', index=False)
df_1.to_csv('data/1.0.csv', index=False)