# Step 1 OCR 

## flatten

In [33]:
def flatten(nested_list):
    """Flatten arbitrarily nested lists into one flat list of strings.

    Only ``list`` instances are descended into; every other element
    (tuples included) is converted with ``str`` and kept as a single entry.
    """
    flattened = []
    for element in nested_list:
        if not isinstance(element, list):
            flattened.append(str(element))
            continue
        flattened += flatten(element)
    return flattened

## PDF or Image

In [34]:
import mimetypes

def check_file_type_by_mime(file_path):
    """Classify a path as 'PDF', 'JPG' or 'Unknown' from its guessed MIME type.

    The guess comes from ``mimetypes.guess_type``, i.e. from the file name
    only; the file itself is never opened.
    """
    labels_by_mime = {
        'application/pdf': 'PDF',
        'image/jpeg': 'JPG',
    }
    guessed_type = mimetypes.guess_type(file_path)[0]
    # An unrecognized extension yields None, which falls through to 'Unknown'.
    return labels_by_mime.get(guessed_type, 'Unknown')


In [35]:
from pdf2image import convert_from_bytes
from paddleocr import PaddleOCR
import numpy as np
from PIL import Image
import json
import base64
from PyPDF2 import PdfReader
import io
#import PyMuPDF
import fitz

def imageprocess(pdf_bytes):
    """OCR every page of a PDF and return the text as one normalized string.

    Args:
        pdf_bytes: Raw bytes of a PDF document.

    Returns:
        The recognized text of all pages, lower-cased, with every run of
        whitespace (line breaks included) collapsed to a single space.
        An empty string when nothing was recognized.
    """
    # pdf2image renders pages through poppler; no explicit poppler path is
    # passed, so poppler has to be discoverable by pdf2image on its own.
    images = convert_from_bytes(pdf_bytes)
    # NOTE(review): the OCR engine is constructed on every call; consider
    # sharing one instance if request latency matters.
    ocr = PaddleOCR(lang='ch', use_gpu=False)

    recognized_lines = []
    for image in images:
        result = ocr.ocr(np.array(image), cls=True)
        # NOTE(review): PaddleOCR appears to report a page without any
        # detected text as None instead of an empty list; skip such pages
        # rather than crash while iterating None.
        if not result or not result[0]:
            continue
        for line in result[0]:
            line_text = line[1][0]
            if isinstance(line_text, list):
                line_text = ''.join(flatten(line_text))
            recognized_lines.append(line_text)

    # str.split() with no argument drops leading/trailing whitespace and
    # treats '\r' / '\n' like any other whitespace, so a single split/join
    # covers stripping, line-break removal and space collapsing.
    return ' '.join('\n'.join(recognized_lines).lower().split())

## Step 2 Keyword extraction
Keywords to be extracted:
degree
major
skills
project experience
school
preference
hobby
salary

In [36]:
# text cleaning

## Name

In [37]:
import spacy, re

def extract_user_name(text: str) -> str:
    """Return the first usable PERSON entity found in *text*.

    The entity text is stripped of e-mail addresses, phone-number-like digit
    runs, the literal word ``email`` and any character that is not a letter,
    whitespace, hyphen, apostrophe or period.

    Args:
        text: Resume text to scan.

    Returns:
        The cleaned name, or "" when no PERSON entity survives cleaning.
    """
    # NOTE(review): the model is loaded on every call, which is slow; a
    # module-level pipeline could be reused instead.
    nlp = spacy.load('en_core_web_sm')
    for ent in nlp(text).ents:
        if ent.label_ != 'PERSON':
            continue
        name = ent.text.strip()
        name = re.sub(r'\S+@\S+', '', name)                # e-mail addresses
        name = re.sub(r'\+?\d[\d\s\-()]{7,}\d', '', name)  # phone-like digit runs
        name = re.sub(r'[^A-Za-z\s\-\'.]', '', name)       # stray symbols/digits
        name = re.sub(r'email', '', name)
        name = ' '.join(name.split())
        # An entity that cleans down to nothing is not a name; try the next one.
        if name:
            return name
    return ""


## Email & phone number

In [38]:
import re

# Extract the e-mail address
def extract_email_address(text: str) -> "str | None":
    """Return the first e-mail address found in *text*, or None if there is none.

    The domain part is matched label by label (``name.tld`` with optional
    further ``.label`` parts), so punctuation that merely follows the address,
    such as the period ending a sentence, is not included in the result.
    """
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'
    found = re.search(email_pattern, text)
    return found.group(0) if found else None

# Extract the mobile phone number
def extract_phone_number(text: str) -> str:
    """Return the first phone number found in *text*, or None when absent.

    A number is eight digits starting with 8 or 9, optionally split 4-4 by a
    space or hyphen, and optionally preceded by a ``+65`` prefix (with or
    without parentheses).
    """
    found = re.search(r"(?:\(?\+65\)?[\s-]?)?(?:8|9)\d{3}[\s-]?\d{4}", text)
    if found is None:
        return None
    return found.group(0)


## Degree

In [39]:
import re
from typing import List, Dict

# Degree label -> lower-case keywords that indicate it.
# The labels are listed from lowest to highest education level.
degree_mapping = {
    'Primary/elementary school': [
        'primary school', 'elementary school',
    ],
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': [
        'secondary school', 'high school', 'gymnasium', 'realschule',
    ],
    'Some college/university study without earning a degree': [
        'university study', 'no degree',
    ],
    'Associate degree': [
        'associate degree', 'associate',
    ],
    'Bachelor’s degree (BA, BS, B.Eng., etc.)': [
        'bachelor', 'ba', 'bs', 'b.sc', 'beng', 'b.eng', 'bachelor’s degree',
    ],
    'Master’s degree (MA, MS, M.Eng., MBA, etc.)': [
        'master', 'ma', 'ms', 'm.sc', 'msc', 'meng', 'm.eng', 'mba',
        'master’s degree', 'mtech',
    ],
    'Professional degree (JD, MD, etc.)': [
        'professional degree', 'jd', 'md', 'juris doctor', 'doctor of medicine',
    ],
    'Other doctoral degree (Ph.D, Ed.D., etc.)': [
        'phd', 'ph.d', 'edd', 'ed.d', 'doctorate', 'doctoral degree',
    ],
}

# Degree label -> rank (0 = lowest), following the order of degree_mapping.
degree_levels = {label: rank for rank, label in enumerate(degree_mapping)}
def extract_degrees(text: str) -> List[str]:
    """Return every degree label whose keywords occur in *text*.

    Keywords are matched case-insensitively as whole words. Each label is
    reported at most once, in the order the labels appear in
    ``degree_mapping``, so the result is deterministic.

    Args:
        text: Resume text to scan.

    Returns:
        The matching degree labels (possibly empty).
    """
    text_lower = text.lower()
    degrees_found = []
    for degree_full, keywords in degree_mapping.items():
        if any(
            re.search(r'\b' + re.escape(keyword.lower()) + r'\b', text_lower)
            for keyword in keywords
        ):
            degrees_found.append(degree_full)
    return degrees_found


def get_highest_degree(degrees: List[str]) -> str:
    """Pick the highest-ranked degree label from *degrees*.

    Ranks come from ``degree_levels``; labels missing there rank below all
    known ones. Returns "" for an empty input.
    """
    if degrees:
        # max() keeps the first of several equally ranked labels.
        return max(degrees, key=lambda label: degree_levels.get(label, -1))
    return ""


## Major

In [40]:
import re
from typing import List, Dict
from collections import Counter

# Major category -> keywords that indicate it. Category order matters:
# extract_major resolves ties in favour of the category listed first.
major_mapping = {
    'Computer science, computer engineering, or software engineering': [
        'computer science',
        'computer engineering',
        'software engineering',
        'cs',
        'se',
        'artificial intelligence',
        'AI',
        'artificial intelligence system',
    ],
    'Another engineering discipline (ex. civil, electrical, mechanical)': [
        'civil engineering',
        'electrical engineering',
        'mechanical engineering',
        'chemical engineering',
        'biomedical engineering',
        'aerospace engineering',
        'environmental engineering',
        'industrial engineering',
    ],
    'Information systems, information technology, or system administration': [
        'information systems',
        'information technology',
        'system administration',
        'systems engineering',
        'network administration',
        'system admin',
    ],
    'Mathematics or statistics': [
        'mathematics',
        'statistics',
        'applied mathematics',
        'statistical science',
    ],
    'Web development or web design': [
        'web development',
        'web design',
        'frontend development',
        'ui/ux design',
        'web programming',
        'web engineering',
    ],
    'A natural science (ex. biology, chemistry, physics)': [
        'biology',
        'chemistry',
        'physics',
        'geology',
        'environmental science',
        'astronomy',
        'earth sciences',
        'natural sciences',
        'biochemistry',
    ],
    'A social science (ex. anthropology, psychology, political science)': [
        'anthropology',
        'psychology',
        'political science',
        'sociology',
        'economics',
        'human geography',
        'criminology',
        'social sciences',
        'international relations',
    ],
    'A humanities discipline (ex. literature, history, philosophy)': [
        'literature',
        'history',
        'philosophy',
        'languages',
        'religious studies',
        'cultural studies',
        'art history',
        'humanities',
    ],
    'A business discipline (ex. accounting, finance, marketing)': [
        'business',
        'accounting',
        'finance',
        'marketing',
        'management',
        'business administration',
        'entrepreneurship',
        'business management',
        'economics',
    ],
    'Fine arts or performing arts (ex. graphic design, music, studio art)': [
        'graphic design',
        'music',
        'studio art',
        'fine arts',
        'performing arts',
        'theater',
        'dance',
        'film',
        'photography',
        'design',
        'visual arts',
    ],
    'A health science (ex. nursing, pharmacy, radiology)': [
        'nursing',
        'pharmacy',
        'radiology',
        'medicine',
        'public health',
        'health sciences',
        'occupational therapy',
        'dentistry',
        'veterinary medicine',
    ],
    'I never declared a major': [
        'undeclared',
        'undecided',
        'general studies',
        'liberal arts',
        'no major',
    ],
}


def extract_major(text: str, highest_degree: str) -> str:
    """Guess the major that goes with *highest_degree* and map it to a category.

    The education section of *text* is used when one can be found, otherwise
    the whole text. It is split into fragments on newlines, ``--``, ``·`` and
    ``|``. In every fragment that mentions one of the degree's keywords, the
    text after "in"/"of" (or, failing that, after the keyword itself) becomes
    a major candidate. Candidates are mapped to ``major_mapping`` categories
    by whole-word keyword match.

    Args:
        text: Resume text.
        highest_degree: A key of ``degree_mapping``.

    Returns:
        The most frequently matched category; if no candidate maps to a
        category, the first raw candidate; "" when there is no candidate or
        *highest_degree* is unknown.
    """
    degree_keywords = degree_mapping.get(highest_degree, [])
    if not degree_keywords:
        return ""

    education_pattern = re.compile(r'(Education|Educational Background|Education Background|Qualifications|educationbackground)[:\s]*(.*?)(?=\n[A-Z][a-z]+:|\n[A-Z]{2,}|$)', re.DOTALL | re.IGNORECASE)
    education_match = education_pattern.search(text)
    education_text = education_match.group(2) if education_match else text

    # Compiled once instead of per fragment. The leading \b keeps "in"/"of"
    # from matching at the end of a longer word (e.g. "admin ").
    major_pattern = re.compile(r'\b(in|of)\s+(.+?)(?=\s\d{2}\.\d{4}|$)', re.IGNORECASE)
    degree_keywords_lower = [k.lower() for k in degree_keywords]

    major_candidates = []
    for line in re.split(r'\n|--|·|\|', education_text):
        line_lower = line.lower()
        # NOTE(review): degree keywords are still matched as substrings, so
        # short ones ('ma', 'ba', 'ms') also hit words like "management".
        for degree_keyword in degree_keywords_lower:
            if degree_keyword not in line_lower:
                continue
            major_match = major_pattern.search(line)
            if major_match:
                major_candidates.append(major_match.group(2).strip())
            else:
                # No "in"/"of": take what follows the degree keyword.
                parts = line_lower.split(degree_keyword)
                if len(parts) > 1:
                    major_candidates.append(parts[1].strip(' .,-'))
            break  # one candidate per fragment

    # Whole-word matching: a plain substring test lets 'cs' match "physics",
    # "statistics" and "economics", misfiling them under computer science.
    category_patterns = [
        (
            major_field,
            [re.compile(r'\b' + re.escape(keyword.lower()) + r'\b') for keyword in keywords],
        )
        for major_field, keywords in major_mapping.items()
    ]
    mapped_majors = []
    for major_candidate in major_candidates:
        candidate_lower = major_candidate.lower()
        for major_field, patterns in category_patterns:
            if any(pattern.search(candidate_lower) for pattern in patterns):
                mapped_majors.append(major_field)

    if mapped_majors:
        return Counter(mapped_majors).most_common(1)[0][0]
    if major_candidates:
        return major_candidates[0]
    return ""

## Skills

In [41]:
import spacy
from spacy.matcher import PhraseMatcher
from collections import OrderedDict
# Module-level spaCy pipeline; extract_skills uses it to tokenize text and
# to supply the vocabulary for its PhraseMatcher.
nlp = spacy.load('en_core_web_sm')

predefined_skills = ['AWS',
 'Amazon DynamoDB',
 'Amazon Echo',
 'Amazon RDS/Aurora',
 'Amazon Redshift',
 'Android',
 'Android Studio',
 'Apache HBase',
 'Apache Hive',
 'Apple Watch or Apple TV',
 'Arduino',
 'Assembly',
 'Atom',
 'Azure',
 'BSD/Unix',
 'Bash/Shell',
 'C',
 'C#',
 'C++',
 'CSS',
 'Cassandra',
 'Clojure',
 'Cobol',
 'CoffeeScript',
 'Delphi/Object Pascal',
 'Drupal',
 'ESP8266',
 'Eclipse',
 'Elasticsearch',
 'Emacs',
 'Erlang',
 'Firebase',
 'Go',
 'Google BigQuery',
 'Google Cloud Platform/App Engine',
 'Google Cloud Storage',
 'Google Home',
 'Groovy',
 'HTML',
 'Haskell',
 'Heroku',
 'IBM Cloud or Watson',
 'IBM Db2',
 'IPython / Jupyter',
 'IntelliJ',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'Linux',
 'Linux-based',
 'Lua',
 'Mac OS',
 'MacOS',
 'Mainframe',
 'MariaDB',
 'Matlab',
 'Memcached',
 'Microsoft Azure (Tables, CosmosDB, SQL, etc)',
 'MongoDB',
 'MySQL',
 'Neo4j',
 'NetBeans',
 'Notepad++',
 'Objective-C',
 'Ocaml',
 'Oracle',
 'PHP',
 'PHPStorm',
 'Perl',
 'PostgreSQL',
 'PyCharm',
 'Python',
 'R',
 'RStudio',
 'Raspberry Pi',
 'Redis',
 'Ruby',
 'Rust',
 'SQL',
 'SQL Server',
 'SQLite',
 'Salesforce',
 'Scala',
 'Serverless',
 'SharePoint',
 'Sublime Text',
 'Swift',
 'TextMate',
 'TypeScript',
 'VB.NET',
 'VBA',
 'Vim',
 'Visual Basic 6',
 'Visual Studio',
 'Visual Studio Code',
 'Windows',
 'Windows Desktop or Server',
 'Windows Phone',
 'WordPress',
 'Xcode',
 'iOS']

import spacy
from spacy.matcher import PhraseMatcher
import re
from typing import List

# Loads the same 'en_core_web_sm' model a second time and rebinds the
# module-level `nlp` that the functions below use.
nlp = spacy.load('en_core_web_sm')

def extract_skills_section(text: str) -> str:
    """Return the body of the first "Skills" section in *text*, or "".

    The section starts after a "Skills", "Technical Skills" or "Skill Set"
    heading (case-insensitive) and runs until the next line that looks like a
    heading, or to the end of the text.
    """
    section = re.search(
        r'(Skills|Technical Skills|Skill Set)[:\s]*(.*?)(?=\n[A-Z][a-z]+:|\n[A-Z]{2,}|\Z)',
        text,
        re.DOTALL | re.IGNORECASE,
    )
    return section.group(2).strip() if section else ""

def extract_skills(text: str, skills_list: list[str]) -> str:
    """List the entries of *skills_list* mentioned in the Skills section of *text*.

    Matching is done on lower-cased tokens. The result is a comma-separated
    string of the matched spans as they are written in the text, without
    exact duplicates and in matcher order; "" when the text has no Skills
    section or nothing matches.
    """
    section = extract_skills_section(text)
    if not section:
        return ""

    # One lower-cased phrase pattern per known skill.
    skill_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    skill_matcher.add("SKILLS", [nlp.make_doc(name.lower()) for name in skills_list])

    section_doc = nlp(section)
    matched_spans = (
        section_doc[begin:stop].text
        for _match_id, begin, stop in skill_matcher(section_doc)
    )
    # fromkeys() drops repeats while keeping first-seen order.
    return ', '.join(OrderedDict.fromkeys(matched_spans))


## Nationality

In [42]:
def extract_hometown(text: str) -> str:
    """Return the nationality stated in *text*, else the first known country named.

    A "Nationality" label followed by a value on a newline-terminated line
    wins. Otherwise the text is searched, case-insensitively and on word
    boundaries, for each country of a fixed list, in list order. Returns ""
    when neither is found.
    """
    stated = re.search(r'(Nationality)[:\s]*(.*?)\n', text, re.IGNORECASE)
    if stated is not None:
        return stated.group(2).strip()

    known_countries = (
        'China', 'United States', 'Singapore', 'India', 'Germany', 'France',
        'Japan', 'United Kingdom', 'Canada', 'Australia', 'Italy', 'Spain',
        'Russia', 'Brazil', 'South Korea', 'Netherlands', 'Switzerland',
        'Sweden', 'Norway', 'Denmark', 'Finland', 'Belgium', 'Austria',
        'Ireland', 'New Zealand', 'Mexico', 'Argentina',
    )
    return next(
        (
            name
            for name in known_countries
            if re.search(r'\b' + re.escape(name) + r'\b', text, re.IGNORECASE)
        ),
        "",
    )


## Project Experience

In [43]:
from datetime import datetime

# Experience buckets as (label, (low, high)) with inclusive bounds in years:
# ten three-year buckets from 0-2 up to 27-29 ...
experience_ranges = [
    (f'{low}-{low + 2} years', (low, low + 2)) for low in range(0, 30, 3)
]
# ... plus an open-ended bucket for everything from 30 years on.
experience_ranges.append(('30 or more years', (30, float('inf'))))

from datetime import datetime

def extract_years_of_experience(text: str) -> str:
    """Estimate total years of experience and return its ``experience_ranges`` label.

    If a work-experience section is found, the lengths of all
    "YYYY - YYYY" / "YYYY - Present" ranges in it are summed. Otherwise the
    estimate is the distance from the earliest past year mentioned anywhere
    in *text* to the current year.

    Only standalone four-digit numbers starting with 19 or 20 count as years,
    so fragments of phone numbers or postal codes are ignored. Ranges that
    end before they start are skipped.

    Args:
        text: Resume text.

    Returns:
        The label of the matching bucket, or 'others' when no estimate is
        possible or it falls outside every bucket.
    """
    year = r'(?:19|20)\d{2}'
    current_year = datetime.now().year

    # Locate the work-experience section.
    experience_pattern = re.compile(
        r'(Experience|Work Experience|Employment History|Professional Experience)[:\s]*(.*?)(?=\n[A-Z][a-z]+:|\n[A-Z]{2,}|$)',
        re.DOTALL | re.IGNORECASE)
    experience_match = experience_pattern.search(text)

    total_years = 0
    if experience_match:
        experience_text = experience_match.group(2)
        # Year ranges; the end may carry a numeric month prefix ("08.2021")
        # or be an open-ended marker such as "Present".
        date_ranges = re.findall(
            r'(?<!\d)(' + year + r')\s*[-–]\s*(?:\d{1,2}[./])?('
            + year + r'(?!\d)|Present|Now|Current)',
            experience_text, re.IGNORECASE)
        for start, end in date_ranges:
            start_year = int(start)
            if end.lower() in ('present', 'now', 'current'):
                end_year = current_year
            else:
                end_year = int(end)
            # A range running backwards is a mis-parse, not negative experience.
            if end_year >= start_year:
                total_years += end_year - start_year
    else:
        # No experience section: fall back to the earliest past year in the
        # text (e.g. a graduation year) as the starting point.
        mentioned_years = [
            int(found) for found in re.findall(r'(?<!\d)' + year + r'(?!\d)', text)
        ]
        past_years = [y for y in mentioned_years if y < current_year]
        if not past_years:
            # Nothing to base an estimate on.
            return 'others'
        total_years = current_year - min(past_years)

    # Map the total onto its bucket.
    for range_label, (start, end) in experience_ranges:
        if start <= total_years <= end:
            return range_label
    return 'others'



## Desired job

In [44]:
import re
from typing import List, Dict
from collections import Counter

# Job category -> lower-case keywords that indicate it.
desired_job_mapping = {
    'Back-end developer': [
        'back-end developer', 'backend developer', 'back end developer',
        'server-side developer',
    ],
    'Front-end developer': [
        'front-end developer', 'frontend developer', 'front end developer',
        'client-side developer',
    ],
    'Full-stack developer': [
        'full-stack developer', 'full stack developer', 'fullstack developer',
    ],
    'Mobile developer': [
        'mobile developer', 'ios developer', 'android developer',
        'mobile app developer',
    ],
    'Data scientist or machine learning specialist': [
        'data scientist', 'machine learning specialist', 'ml engineer',
        'machine learning engineer', 'ai engineer', 'deep learning specialist',
    ],
    'Data or business analyst': [
        'data analyst', 'business analyst', 'data analytics',
        'business intelligence analyst',
    ],
    'Database administrator': [
        'database administrator', 'db administrator', 'dba', 'database admin',
    ],
    'DevOps specialist': [
        'devops specialist', 'devops engineer', 'site reliability engineer',
        'sre', 'infrastructure engineer',
    ],
    'System administrator': [
        'system administrator', 'sysadmin', 'systems administrator',
        'it administrator',
    ],
    'Embedded applications or devices developer': [
        'embedded developer', 'embedded systems engineer', 'firmware engineer',
    ],
    'Desktop or enterprise applications developer': [
        'desktop developer', 'enterprise applications developer',
        'windows developer', 'mac developer',
    ],
    'QA or test developer': [
        'qa engineer', 'test developer', 'quality assurance engineer',
        'software tester',
    ],
    'Game or graphics developer': [
        'game developer', 'graphics developer', 'game programmer',
        'unity developer', 'unreal developer',
    ],
    'Designer': [
        'designer', 'ui designer', 'ux designer', 'graphic designer',
        'product designer',
    ],
    'Product manager': [
        'product manager', 'product owner', 'product lead',
    ],
    'Engineering manager': [
        'engineering manager', 'technical manager', 'development manager',
    ],
    'Marketing or sales professional': [
        'marketing professional', 'sales professional', 'marketing manager',
        'sales manager', 'business development manager',
    ],
    'Educator or academic researcher': [
        'educator', 'teacher', 'lecturer', 'professor', 'academic researcher',
        'research scientist',
    ],
    'C-suite executive (CEO, CTO, etc.)': [
        'ceo', 'cto', 'cfo', 'coo', 'chief executive officer',
        'chief technology officer', 'chief financial officer',
        'chief operating officer', 'c-suite executive',
    ],
}

def extract_desired_job(text: str) -> str:
    """Return the job category mentioned most often in the candidate's objective.

    The objective / desired-position section of *text* is used when one can
    be found, otherwise the whole text. For each category of
    ``desired_job_mapping`` the whole-word, case-insensitive mentions of its
    keywords are counted; overlapping keywords such as "designer" and
    "ui designer" count as one mention.

    Args:
        text: Resume text.

    Returns:
        The category with the most mentions (the one listed first in
        ``desired_job_mapping`` on a tie), or "" when none is mentioned.
    """
    pattern = re.compile(r'(Desired Position|Objective|Career Objective|Desired Job|desiredjob|求职意向|期望职位)[:\s]*(.*?)(?=\n[A-Z][a-z]+:|\n[A-Z]{2,}|$)', re.DOTALL | re.IGNORECASE)
    match = pattern.search(text)
    desired_job_text = (match.group(2).strip() if match else text).lower()

    mention_counts = Counter()
    for job_field, keywords in desired_job_mapping.items():
        # Longest keyword first so "ui designer" is consumed as one mention
        # rather than also being counted through the shorter "designer".
        alternatives = '|'.join(
            re.escape(keyword.lower())
            for keyword in sorted(keywords, key=len, reverse=True)
        )
        mentions = re.findall(r'\b(?:' + alternatives + r')\b', desired_job_text)
        if mentions:
            mention_counts[job_field] = len(mentions)

    if not mention_counts:
        return ""
    # most_common() orders equal counts by first insertion, i.e. mapping order.
    return mention_counts.most_common(1)[0][0]


## text extraction

In [45]:
def extract_resume_fields(text: str) -> dict[str, object]:
    """Run every extractor on *text* and collect the results in one dict.

    Args:
        text: Resume text.

    Returns:
        A dict with the keys ``userName``, ``phoneNumber``, ``emailAddress``,
        ``degree``, ``major``, ``skills``, ``yearsCoding``, ``hometown`` and
        ``desiredJob``. ``phoneNumber`` and ``emailAddress`` are None when
        nothing was found; the remaining values are strings.
    """
    # Computed once: it is both a result field and an input to extract_major.
    highest_degree = get_highest_degree(extract_degrees(text))
    return {
        'userName': extract_user_name(text),
        'phoneNumber': extract_phone_number(text),
        'emailAddress': extract_email_address(text),
        'degree': highest_degree,
        'major': extract_major(text, highest_degree),
        'skills': extract_skills(text, predefined_skills),
        'yearsCoding': extract_years_of_experience(text),
        'hometown': extract_hometown(text),
        'desiredJob': extract_desired_job(text),
    }


In [48]:
from flask import Flask, jsonify, request
import fitz
# Flask application object; the /return_info route below is registered on it.
app = Flask(__name__)

@app.route('/return_info', methods=['POST'])
def return_info():
    """Handle an uploaded resume PDF: OCR it and return the extracted fields.

    Expects a multipart form upload under the field name ``file``.

    Returns:
        A JSON object with the fields from ``extract_resume_fields``, or a
        JSON error with status 400 when the ``file`` part is missing or the
        uploaded file is empty.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    pdf_bytes = file.read()
    # An empty upload cannot be converted to images; reject it as a client
    # error instead of letting the PDF converter fail with a 500.
    if not pdf_bytes:
        return jsonify({"error": "Empty file"}), 400
    text = imageprocess(pdf_bytes)
    usrinfo = extract_resume_fields(text)
    return jsonify(usrinfo)

# When run as a script, start Flask's built-in server on port 5001.
# host='0.0.0.0' listens on all network interfaces, not just localhost.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5001)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://192.168.146.140:5001
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://192.168.146.140:5001
Press CTRL+C to quit
[2024-10-26 16:40:16,771] [    INFO] _internal.py:97 - [33mPress CTRL+C to quit[0m


[2024/10/26 16:43:31] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:43:41] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:43:41,616] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:43:41] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 16:43:55] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:44:05] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:44:05,287] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:44:05] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 16:50:04] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:50:12] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:50:12,700] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:50:12] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 16:52:54] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:53:02] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:53:02,731] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:53:02] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 16:55:00] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:55:08] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:55:08,976] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:55:08] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 16:59:09] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 16:59:27] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 16:59:27,125] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 16:59:27] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 17:01:35] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 17:01:53] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 17:01:53,216] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 17:01:53] "POST /return_info HTTP/1.1" 200 -


[2024/10/26 17:02:32] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\shidi/.paddleocr/whl\\det\\ch\\ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\shidi/.paddleocr/whl\\rec\\ch\\ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

192.168.146.170 - - [26/Oct/2024 17:02:50] "POST /return_info HTTP/1.1" 200 -
[2024-10-26 17:02:50,697] [    INFO] _internal.py:97 - 192.168.146.170 - - [26/Oct/2024 17:02:50] "POST /return_info HTTP/1.1" 200 -
