Open this notebook in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Riminder/hrflow-cookbook/blob/main/examples/%5BTagging%5D%20ocr_and_categorize_a_document.ipynb)

In [1]:
# Copyright 2023 HrFlow's AI Research Department. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

**Welcome to this Google Colaboratory tutorial!** This notebook is designed to help you tackle the challenge of categorizing HR documents by using our OCR and Tagging APIs.

Here’s a quick overview of the notebook's workflow:

* 🛠 Upload files: upload the files you want to categorize.
* 👷 OCR the document: use the HrFlow OCR API to extract the text of the document.
* 📝 Tag the text of the document: apply HrFlow dynamic tagging to categorize the document.

In [None]:
!pip install --quiet requests tqdm hrflow

In [2]:
import requests
import os
import requests
from getpass import getpass
from tqdm import tqdm
from hrflow import Hrflow
from google.colab import files 

# Endpoint that receives each uploaded document for text extraction.
OCR_URL = "https://api.hrflow.ai/v1/text/ocr"

# Candidate categories handed to the tagger as its dynamic labels.
LABELS = [
    "resume",
    "coverletter",
    "references",
    "diploma",
    "certificate",
    "permit",
    "license",
    "passport",
]

# Instruction sent to the tagger together with the labels.
CONTEXT = "Given the following text extracted via OCR, categorize it into one of the provided categories. If the text doesn't fit any category, return 'Other'."

# Paths of the files whose processing raised an error.
failed = list()

# Credentials are typed in at run time so they are never stored in the notebook.
# Leading/trailing whitespace is stripped: a pasted value that carries a stray
# space would otherwise be sent verbatim in the authentication headers.
API_SECRET = getpass("YOUR_API_SECRET").strip()
API_USER = getpass("USER@EMAIL.DOMAIN").strip()

In [None]:
def upload_files_colab():
    """Ask the user to upload files and write each one under "/content".

    The mapping returned by ``files.upload()`` is walked entry by entry; every
    value is written in binary mode to ``/content/<name>``.

    Returns:
        list: The path of each file written, in the order of the mapping.
    """
    print("Please select the files to upload:")
    uploads = files.upload()

    saved_paths = []
    for name, content in uploads.items():
        destination = os.path.join("/content", name)
        with open(destination, "wb") as out:
            out.write(content)
        saved_paths.append(destination)

    return saved_paths

In [None]:
# Prompt for the documents to process and keep the paths of the saved copies
# in `files_list`, then echo those paths so the user can check the selection.
files_list = upload_files_colab()
print(f"Uploaded files: {files_list}")

In [None]:
def tagger(api_secret, api_user, text, labels, context="", timeout=60):
    """Categorize ``text`` with the HrFlow dynamic tagger and return the top tag.

    Args:
        api_secret: Value sent in the ``X-API-KEY`` header.
        api_user: Value sent in the ``X-USER-EMAIL`` header.
        text: Text to categorize (sent as the single element of ``texts``).
        labels: Candidate labels, sent as ``dynamic_labels``.
        context: Optional instruction, sent as ``dynamic_context``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error. Defaults to 60.

    Returns:
        The first entry of ``data[0]["tags"]`` from the JSON response, returned
        as-is, or ``None`` when there is nothing to categorize, the HTTP status
        is not 200, the body is not valid JSON, or the response does not contain
        a non-empty ``tags`` list.

    Raises:
        Whatever ``requests.post`` raises for connection problems or timeouts;
        these are deliberately left to the caller.
    """
    # Blank text cannot be categorized, so skip the API round-trip entirely.
    if not isinstance(text, str) or not text.strip():
        return None

    url = "https://api.hrflow.ai/v1/text/tagging"

    payload = {
        "algorithm_key": "tagger-hrflow-dynamic",
        "texts": [text],
        "dynamic_labels": labels,
        "top_n": 1,
        "dynamic_context": context
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "X-API-KEY": api_secret,
        "X-USER-EMAIL": api_user,
    }

    # A timeout keeps a stalled connection from hanging the notebook cell forever.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"HTTP error: {response.status_code}")
        return None

    try:
        response_data = response.json()
    except ValueError:
        # A 200 answer whose body is not JSON is treated like any other
        # unusable response.
        print("Invalid JSON in tagging response")
        return None

    # Walk the expected shape defensively: a missing or null "data"/"tags"
    # yields None instead of a KeyError/TypeError.
    results = response_data.get("data") if isinstance(response_data, dict) else None
    if not isinstance(results, list) or not results:
        return None

    first_result = results[0]
    if not isinstance(first_result, dict):
        return None

    tags = first_result.get("tags")
    if not isinstance(tags, list) or not tags:
        return None

    return tags[0]

In [None]:
# OCR every uploaded file, categorize the extracted text, and remember the
# files whose processing raised an error.

# Empty the list in place so that re-running this cell does not report
# failures left over from a previous run.
failed.clear()

for file_path in tqdm(files_list):
    filename = os.path.basename(file_path)
    try:
        with open(file_path, "rb") as file_handle:
            # Deliberately NOT named `files`: that name is the `google.colab.files`
            # module used by the upload cell, and rebinding it here would make
            # the upload cell fail when it is run again.
            # NOTE(review): the content type is hard-coded to PDF for every
            # upload — confirm the OCR endpoint tolerates this for other formats.
            upload_payload = [('file', (filename, file_handle, 'application/pdf'))]

            response = requests.post(
                OCR_URL,
                headers={
                    'X-USER-EMAIL': API_USER,
                    'X-API-KEY': API_SECRET
                },
                files=upload_payload,
            )

        # Turn HTTP errors into exceptions so the file is listed in `failed`
        # instead of being tagged from an empty text.
        response.raise_for_status()

        # `or` also covers a JSON null for "data" / "text".
        ocr_data = response.json().get("data") or {}
        text = ocr_data.get("text") or ""
        tag = tagger(API_SECRET, API_USER, text, LABELS, context=CONTEXT)
        print(f"File : {file_path} -> Tag : {tag if tag else 'No categorization'}")

    except Exception as e:
        # Best effort: report the problem and continue with the next file.
        print(f"Error processing file {file_path}: {e}")
        failed.append(file_path)


In [None]:
# Final report: either confirm that nothing failed or list each failed path.
if not failed:
    print("\nAll files were processed successfully!")
else:
    print("\nFiles that failed to process:")
    for failed_path in failed:
        print(failed_path)